In [1]:
# Cell 1: Install Required Libraries (Uncomment if needed)
# !pip install transformers auto-gptq accelerate torch --upgrade

# Cell 2: Imports
from transformers import AutoTokenizer, TextStreamer
from auto_gptq import AutoGPTQForCausalLM
import torch
import os


CUDA extension not installed.
CUDA extension not installed.


In [2]:
# Cell 3: Model Loaders
def load_gptq_model(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        device_map="auto",
        use_safetensors=True,
        trust_remote_code=True,
        offload_folder="./llama_offload"  # ✅ Add this line
    )
    return model, tokenizer


def load_mistral_model(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        device_map="auto",
        use_safetensors=True,
        trust_remote_code=True,
        offload_folder="./llama_offload"  # Helps on low-memory devices
    )
    return model, tokenizer


In [3]:
# Cell 4: Text Generation with Streaming

def generate_text(model, tokenizer, prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer)

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.1
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [4]:
# from contextlib import nullcontext

# def ask_models(prompt, max_tokens=100, mistral_model=None, mistral_tokenizer=None, llama_model=None, llama_tokenizer=None):
#     # Tokenize inputs
#     mistral_inputs = mistral_tokenizer(prompt, return_tensors="pt").to(mistral_model.device)
#     llama_inputs = llama_tokenizer(prompt, return_tensors="pt").to(llama_model.device)

#     # Inference
#     with torch.no_grad():  # ✅ safer for GPTQ
#         mistral_output = mistral_model.generate(
#             **mistral_inputs,
#             max_new_tokens=max_tokens,
#             temperature=0.7,
#             top_p=0.95
#         )
        
#         llama_output = llama_model.generate(
#             **llama_inputs,
#             max_new_tokens=max_tokens,
#             temperature=0.7,
#             top_p=0.95
#         )


#     mistral_response = mistral_tokenizer.decode(mistral_output[0], skip_special_tokens=True)
#     llama_response = llama_tokenizer.decode(llama_output[0], skip_special_tokens=True)

#     return mistral_response, llama_response



def ask_models(prompt, max_tokens=100, mistral_model=None, mistral_tokenizer=None, llama_model=None, llama_tokenizer=None):
    # Ensure inputs are on the same device as their respective models
    mistral_inputs = mistral_tokenizer(prompt, return_tensors="pt").to(mistral_model.device)
    llama_inputs = llama_tokenizer(prompt, return_tensors="pt").to(llama_model.device)

    # Inference
    with torch.no_grad():  # ✅ safer for GPTQ
        mistral_output = mistral_model.generate(
            **mistral_inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95
        )
        
        llama_output = llama_model.generate(
            **llama_inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95
        )

    mistral_response = mistral_tokenizer.decode(mistral_output[0], skip_special_tokens=True)
    llama_response = llama_tokenizer.decode(llama_output[0], skip_special_tokens=True)

    return mistral_response, llama_response


In [5]:
# Cell 6: Evaluation Logic

def evaluate_response(response: str):
    evaluation = {
        "factual_accuracy": "Unknown",
        "hallucination": False,
        "bias_or_stereotype": False,
        "notes": ""
    }

    if "capital of india" in response.lower():
        if "new delhi" in response.lower():
            evaluation["factual_accuracy"] = "High"
        else:
            evaluation["factual_accuracy"] = "Low"
            evaluation["hallucination"] = True
            evaluation["notes"] += "Missed correct capital.\n"

    if any(term in response.lower() for term in ["always", "never", "clearly", "obviously"]):
        evaluation["bias_or_stereotype"] = True
        evaluation["notes"] += "Potential overconfidence or bias in language.\n"

    return evaluation





In [6]:
# Cell 7: Load Models
mistral_pipe = load_mistral_model("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
llama_pipe = load_gptq_model("MaziyarPanahi/Meta-Llama-3-8B-Instruct-GPTQ")


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.


  0%|          | 0/963 [00:00<?, ?w/s]

In [7]:
# Load models and tokenizers
mistral_model, mistral_tokenizer = load_mistral_model("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
llama_model, llama_tokenizer = load_gptq_model("MaziyarPanahi/Meta-Llama-3-8B-Instruct-GPTQ")

# Move both models to the same device
device = "cuda" if torch.cuda.is_available() else "cpu"

mistral_model = mistral_model.to(device)
llama_model = llama_model.to(device)

# Define prompt
prompt = "What is the capital of India?"

# Ask models for their responses
mistral_ans, llama_ans = ask_models(
    prompt,
    mistral_model=mistral_model,
    mistral_tokenizer=mistral_tokenizer,
    llama_model=llama_model,
    llama_tokenizer=llama_tokenizer
)

# Print results and evaluations
print("=== Mistral ===\n", mistral_ans)
print("Eval:", evaluate_response(mistral_ans), "\n")

print("=== LLaMA 3 ===\n", llama_ans)
print("Eval:", evaluate_response(llama_ans), "\n")


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.


  0%|          | 0/1187 [00:00<?, ?w/s]

1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Cell 9: Additional Generation Test
model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
model, tokenizer = load_mistral_model(model_id)

prompt = "Explain quantum computing to a 5-year-old."
response = generate_text(model, tokenizer, prompt)
print(response)


In [None]:
# model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# model, tokenizer = load_mistral_model(model_id)

# prompt = "Explain quantum computing to a 5-year-old."
# generate_text(model, tokenizer, prompt)
print(torch.cuda.is_available())

True


In [None]:
# prompt = "whaty is mern stack ?"
# generate_text(model, tokenizer, prompt)

<s> whaty is mern stack ?
A: MERN (MongoDB, Express.js, React.js, Node.js) is a popular full-stack web development framework that allows developers to build dynamic and interactive web applications quickly and efficiently. It consists of four main technologies: MongoDB for the database, Express.js for the server-side logic, React.js for the client-side user interface, and Node.js for running the entire application on one platform. The MERN stack is often used to build single-page applications (SPAs), mobile applications, and APIs (Application Programming Interfaces).</s>
