In [2]:
# imports
import re
import json

import transformers
import torch

from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import (HallucinationMetric, 
                              FaithfulnessMetric, 
                              BiasMetric,
                              ToolCorrectnessMetric
                              )

from pydantic import BaseModel

In [7]:
# Define a custom LLM wrapper class for DeepEval
class CustomLlama3_8B(DeepEvalBaseLLM):
    """DeepEval model wrapper around a 4-bit quantized Hugging Face causal LM.

    Decoding is constrained with lm-format-enforcer so that the generated
    text is JSON matching the pydantic schema passed to ``generate``.
    """

    def __init__(self, model_path: str = None):
        """Load the quantized model and its tokenizer.

        Args:
            model_path: Model identifier or path handed to ``from_pretrained``.
                Required in practice, even though the signature defaults to
                ``None`` for backward compatibility.

        Raises:
            ValueError: If ``model_path`` is ``None`` or empty.
        """
        # Fail early with a clear message instead of an opaque error from
        # from_pretrained(None) or None.split("/") further down.
        if not model_path:
            raise ValueError(
                "model_path is required (Hugging Face model id or local path)"
            )

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=quantization_config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Last path component, e.g. "org/name" -> "name".
        self.model_name = model_path.split("/")[-1]
        # Text-generation pipeline, built lazily by _get_pipeline().
        self._pipeline = None

    def load_model(self):
        """Return the already-loaded quantized model."""
        return self.model

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use.

        The pipeline configuration does not depend on the prompt or the
        schema, so it is created once and reused instead of being rebuilt
        on every ``generate`` call.
        """
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, "_pipeline", None)
        if cached is None:
            cached = transformers.pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                device_map="auto",
                # NOTE(review): in transformers, max_length counts the prompt
                # tokens as well as the generated ones, so long evaluation
                # prompts leave less room for the answer - confirm 2500 is
                # enough for the metrics being run.
                max_length=2500,
                # NOTE(review): sampling is enabled and no seed is set here,
                # so repeated runs can produce different outputs.
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            self._pipeline = cached
        return cached

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Generate a schema-conforming answer for ``prompt``.

        Args:
            prompt: Text sent to the model as-is.
            schema: A pydantic model *class* (not an instance). Its JSON
                schema constrains decoding and it is instantiated from the
                parsed output.

        Returns:
            An instance of ``schema`` built from the generated JSON.

        Raises:
            json.JSONDecodeError: If the generated continuation is not valid
                JSON.
        """
        pipeline = self._get_pipeline()

        # Restrict the tokens allowed at each step to those that keep the
        # output valid for the schema (lm-format-enforcer).
        parser = JsonSchemaParser(schema.model_json_schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        # Drop the leading len(prompt) characters to keep only the continuation.
        output = output_dict[0]["generated_text"][len(prompt):]

        # Flatten newlines to spaces and trim surrounding whitespace before
        # parsing the JSON text.
        output = output.replace("\n", " ").strip()
        json_result = json.loads(output)

        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``.

        The call is not awaited or offloaded, so the event loop is blocked
        while generation runs.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the model name with hyphens replaced by spaces."""
        return self.model_name.replace("-", " ")

# Output schema passed to CustomLlama3_8B.generate by the cells below.
# It declares a single string field, so the generated JSON is expected to
# look like {"answer": "..."}.
class Schema(BaseModel):
    answer: str  # text of the model's reply

In [8]:
# Pydantic class that later cells pass to `llama.generate` as the output schema.
schema = Schema

# Instantiate the wrapper; this loads the 4-bit quantized checkpoint, so the
# cell is slow and needs the model weights to be available.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
llama = CustomLlama3_8B(model_path=model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the evaluation cell further down reads it.
input = "You are a Scientist. Tell me about TAF13."

# Returns a Schema instance; the generated text is in `.answer`.
actual_output = llama.generate(prompt=input, schema=schema)
print(actual_output)

answer='TAF13 is a transcriptional coactivator protein that plays a crucial role in the regulation of gene expression. It is a member of the TFIID complex, which is responsible for the recruitment of RNA polymerase II to promoters and the initiation of transcription. TAF13 has been shown to interact with various transcription factors, including p53, NF-κB, and STAT3, and to regulate the expression of a wide range of genes involved in various cellular processes, including cell growth, differentiation, and apoptosis. Dysregulation of TAF13 has been implicated in various diseases, including cancer, neurodegenerative disorders, and metabolic disorders. As a transcriptional coactivator, TAF13 plays a critical role in maintaining cellular homeostasis and is essential for normal cellular function. In addition, TAF13 has been shown to have a role in the regulation of epigenetic marks, such as histone modifications, and to interact with chromatin-modifying enzymes, suggesting that it may play a

In [16]:
# Reference statements about TAF13 supplied to the test case as `context`.
context = [
    "TAF13, or TATA-Box Binding Protein Associated Factor 13, is a protein that is encoded by the TAF13 gene in humans.",
    "It is a subunit of the transcription initiation factor TFIID",
    "TAF13 is involved in RNA polymerase II transcription initiation and promoter clearance: TAF13 is part of the TFIID complex,which plays a major role in the initiation of transcription that is dependent on RNA polymerase II.",
    "TAF13 is involved in gene expression.",
    "TAF13 is involved in DNA-binding transcription factor activity.",
]

# Every metric uses the local Llama wrapper as its evaluation model.
hallucination_metric = HallucinationMetric(model=llama)
faithfulness_metric = FaithfulnessMetric(model=llama)
bias_metric = BiasMetric(model=llama)

# NOTE(review): `retrieval_context` is a three-word fragment while `context`
# holds the full reference statements - confirm this is the intended input for
# the metric that reads `retrieval_context`.
test_case = LLMTestCase(
    input=input,
    actual_output=actual_output.answer,  # pass the answer string, not the Schema object
    context=context,
    retrieval_context=["transcription initiation factor"],
)

# Run all three metrics on the single test case; the returned results are
# displayed as the cell output.
evaluate([test_case], [hallucination_metric, faithfulness_metric, bias_metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 01:22, 82.14s/test case]



Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: Meta Llama 3 8B Instruct, reason: The score is 0.00 because the actual output agrees with the provided context in all instances, indicating that the output is accurate and reliable, and there is no hallucination present in the output. This is reflected in the hallucination score of 0.00, which indicates that the output is highly accurate and reliable, with no hallucination present. The factual alignments and contradictions provided further support this conclusion, as they demonstrate that the actual output aligns with the provided context in all instances, and there are no contradictions or hallucinations present in the output. Therefore, the hallucination score of 0.00 is a reflection of the accuracy and reliability of the output, and the lack of hallucination present in the output., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: Meta Llam




[TestResult(success=True, metrics_data=[MetricData(name='Hallucination', threshold=0.5, success=True, score=0.0, reason='The score is 0.00 because the actual output agrees with the provided context in all instances, indicating that the output is accurate and reliable, and there is no hallucination present in the output. This is reflected in the hallucination score of 0.00, which indicates that the output is highly accurate and reliable, with no hallucination present. The factual alignments and contradictions provided further support this conclusion, as they demonstrate that the actual output aligns with the provided context in all instances, and there are no contradictions or hallucinations present in the output. Therefore, the hallucination score of 0.00 is a reflection of the accuracy and reliability of the output, and the lack of hallucination present in the output.', strict_mode=False, evaluation_model='Meta Llama 3 8B Instruct', error=None, evaluation_cost=None, verbose_logs='Verd