# Half Precision

In [1]:
import os
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
import re

# Import the RAG class from the RAG_With_Finetuning module
from RAG_With_Finetuning import RAG

# Look up an API key by name after loading the dotenv file
def get_api_key(api_name):
    """Return the value of the environment variable named ``api_name``.

    The dotenv file at ``../.dummy_env`` is (re)loaded on every call before
    the lookup. ``os.getenv`` yields ``None`` when the variable is not set.

    Args:
        api_name: Name of the environment variable holding the key.

    Returns:
        The variable's value, or ``None`` if it is not defined.
    """
    dotenv_file = "../.dummy_env"  # Adjust the path as needed
    load_dotenv(dotenv_file)
    api_key = os.getenv(api_name)
    return api_key

# -------------------------------
# Setup Evaluator Model (Mistral)
# -------------------------------
# Directory passed as cache_dir to both from_pretrained calls below
CUSTOM_CACHE_DIR = "/media/volume/vol-VisaWise/models/mistral_cache"

# Evaluator model: weights requested in FP16, authenticated with the
# token stored under the HF_GAURI environment variable
evaluator_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    token=get_api_key('HF_GAURI'),
    torch_dtype=torch.float16,
    cache_dir=CUSTOM_CACHE_DIR,
)

# Use the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model to the chosen device, then switch it to evaluation mode
evaluator_model.to(device).eval()

# Matching tokenizer, fetched with the same cache directory and token
evaluator_tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    token=get_api_key('HF_GAURI'),
    cache_dir=CUSTOM_CACHE_DIR,
)

# -------------------------------
# Create a QA Dataset for Evaluation
# -------------------------------
# Each entry is a dict with a "question" and the reference "expected_answer".
# The entries are assembled from (question, expected answer) pairs.
qa_dataset = [
    {"question": question_text, "expected_answer": reference_answer}
    for question_text, reference_answer in (
        (
            "How can an international student on an F-1 visa maintain their status during graduate studies?",
            "To maintain F-1 status during graduate studies, an international student must enroll as a full-time student, "
            "make normal progress toward a degree, and comply with all rules and regulations set by their educational "
            "institution and the U.S. government. This includes not engaging in unauthorized employment and keeping "
            "documentation up to date.",
        ),
        (
            "What is Curricular Practical Training (CPT) and how does it benefit F-1 students?",
            "CPT is a type of off-campus employment authorization for F-1 students that allows them to gain practical "
            "experience in their field of study. It is usually offered as part of an internship, co-op, or practicum "
            "course, and must be an integral part of the student's curriculum.",
        ),
        (
            "What are the eligibility criteria for Optional Practical Training (OPT) for international students on an F-1 visa?",
            "OPT allows F-1 students to work in the United States for up to 12 months after completing their degree. "
            "Eligibility typically requires that the student has maintained valid F-1 status, completed at least one "
            "academic year of study, and that the employment is directly related to the student's major field of study.",
        ),
    )
]

# -------------------------------
# Define Evaluation Function
# -------------------------------
def evaluate_answer(question, generated_answer, expected_answer):
    """Ask the evaluator model to grade a generated answer against a reference.

    A grading prompt containing the question, the expected answer and the
    generated answer is tokenized, and the evaluator model samples up to 100
    new tokens from it.

    Args:
        question: The question that was asked.
        generated_answer: The answer to be graded.
        expected_answer: The reference answer to compare against.

    Returns:
        The decoded output sequence (special tokens skipped, surrounding
        whitespace stripped). The whole returned sequence is decoded, so the
        text presumably starts with the prompt itself - confirm against the
        evaluator's `generate` output.
    """
    eval_prompt = (
        f"Question: {question}\n\n"
        f"Expected Answer: {expected_answer}\n\n"
        f"Generated Answer: {generated_answer}\n\n"
        "Please evaluate the generated answer compared to the expected answer. Provide a score between 1 (poor) "
        "and 5 (excellent) along with a brief justification for your score.\n\n"
        "Score and Justification:"
    )

    # Tokenize and move the tokens to the same device as the model
    tokens = evaluator_tokenizer.encode(eval_prompt, return_tensors="pt").to(device)

    # Disable gradient tracking and use autocast for efficiency.
    # torch.cuda.amp.autocast() is deprecated in recent PyTorch and is
    # CUDA-only; torch.autocast is the supported spelling. It is enabled only
    # when running on CUDA, so the CPU fallback runs without mixed precision
    # and without a "CUDA is not available" warning.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=(device == "cuda")):
        # do_sample=True means the output can differ between calls
        generated_ids = evaluator_model.generate(tokens, max_new_tokens=100, do_sample=True)

    evaluation_text = evaluator_tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True)
    return evaluation_text.strip()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# -------------------------------
# Define Evaluation Function
# -------------------------------
def evaluate_answer(question, generated_answer, expected_answer):
    """
    Uses the evaluator model (Mistral) to compare the generated answer with the expected answer.

    A grading prompt ending in "Score and Justification:" is built from the three
    arguments, the evaluator samples up to 100 new tokens, and everything that
    follows the first "Score and Justification:" marker in the decoded text is
    extracted.

    Args:
        question: The question that was asked.
        generated_answer: The answer to be graded.
        expected_answer: The reference answer to compare against.

    Returns:
        A tuple ``(evaluation_text, score_justification)``:
        ``evaluation_text`` is the full decoded output (stripped);
        ``score_justification`` is the text after the marker (stripped), or
        ``None`` when the marker is not found in the decoded output.
    """
    eval_prompt = (
        f"Question: {question}\n\n"
        f"Expected Answer: {expected_answer}\n\n"
        f"Generated Answer: {generated_answer}\n\n"
        "Please evaluate the generated answer compared to the expected answer. Provide a score between 1 (poor) "
        "and 5 (excellent) along with a brief justification for your score.\n\n"
        "Score and Justification:"
    )

    # Tokenize the evaluation prompt
    tokens = evaluator_tokenizer.encode(eval_prompt, return_tensors="pt").to(device)

    # Generate evaluation response (sampling is on, so output varies between calls)
    generated_ids = evaluator_model.generate(tokens, max_new_tokens=100, do_sample=True)

    evaluation_text = evaluator_tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True).strip()

    # Extract full "Score and Justification:" content.
    # re.DOTALL lets "." match newlines; without it the capture stops at the
    # end of the first line and a multi-line justification is silently truncated.
    match = re.search(
        r"Score and Justification:\s*(.*)",
        evaluation_text,
        re.IGNORECASE | re.DOTALL,
    )

    if match:
        score_justification = match.group(1).strip()
    else:
        print(f"?? Warning: Could not extract 'Score and Justification' for question: {question}")
        print(f"Evaluation Text: {evaluation_text}\n")
        score_justification = None  # Set to None if extraction fails

    return evaluation_text, score_justification

# -------------------------------
# Main Evaluation Routine
# -------------------------------
def main():
    """Answer every question in ``qa_dataset`` with the RAG model and grade it.

    For each QA pair the RAG model produces an answer, ``evaluate_answer``
    grades it against the expected answer, progress is printed, and one
    record per question is collected. All records are finally written to
    ``rag_evaluation_results.json``.
    """
    rag_model = RAG()
    evaluations = []  # One result record per question

    for qa in qa_dataset:
        question, expected_answer = qa["question"], qa["expected_answer"]
        print(f"\nEvaluating Question: {question}")

        # Answer produced by the RAG model for this question
        generated_answer = rag_model.generate_answer(question)
        print("Generated Answer:", generated_answer)

        # Grade the answer with the evaluator model
        eval_result, score_justification = evaluate_answer(question, generated_answer, expected_answer)
        print("Evaluation Result:", eval_result)
        # Falsy justification (None or empty) is reported as not found
        print("Score and Justification:", score_justification or "?? Not found")
        print("-" * 80)

        record = {
            "question": question,
            "expected_answer": expected_answer,
            "generated_answer": generated_answer,
            "evaluation": eval_result,
            "score_and_justification": score_justification,  # Full justification text
        }
        evaluations.append(record)

    # Persist all records as indented JSON
    serialized = json.dumps(evaluations, indent=4)
    with open("rag_evaluation_results.json", "w") as outfile:
        outfile.write(serialized)

    print("\nEvaluation completed. Results saved to rag_evaluation_results.json.")

# Run the evaluation only when executed as a script / notebook cell
if __name__ == "__main__":
    main()

ValueError: Unrecognized model in Jenitza182/Qwen2.5-7B-Instruct-law-lora_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, 
nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [3]:
from transformers import AutoConfig

# Hub repository whose configuration is being inspected
model_name = "Jenitza182/Qwen2.5-7B-Instruct-law-lora_model"

# NOTE(review): the recorded output of this cell is a ValueError
# ("Unrecognized model ... Should have a `model_type` key in its config.json").
# The repo name suggests a LoRA adapter rather than a full model - confirm
# what configuration files the repo actually contains.
config = AutoConfig.from_pretrained(model_name)
config_dict = config.to_dict()
print(config_dict)


ValueError: Unrecognized model in Jenitza182/Qwen2.5-7B-Instruct-law-lora_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, 
nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth