In [1]:
!pip install --quiet mistralai weaviate-client PyPDF2 2>/dev/null

In [2]:
import warnings

# Suppress every warning category for the rest of the session to keep cell
# output readable; remove this line when debugging library deprecations.
warnings.simplefilter("ignore")

In [3]:
import os
import re

import PyPDF2

import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Property, DataType

from mistralai import Mistral

# Extracting the questions from the PDF

---

In [4]:
# Boilerplate strings that are blanked out of the extracted text, in this
# order, before it is split into question blocks.
_BOILERPLATE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"Data Science and Artificial Intelligence \(DA\)",
        r"Organizing Institute:.*",
        r"Page\s*\d+(\s*of\s*\d+)?",
    )
)

# One block = the text following a "Q.<n>" marker, up to the next marker that
# starts a line or the end of the text.
_QUESTION_BLOCK_PATTERN = re.compile(
    r"Q\.?\d+\s*[:\-]?\s*(.*?)(?=\nQ\.?\d+\s*[:\-]?|\Z)", re.DOTALL
)

# One option = "(X) text", where the text runs until the next "(X)" label or
# the end of the options string.
_OPTION_PATTERN = re.compile(
    r"\(\s*([A-Z])\s*\)\s*(.*?)(?=\(\s*[A-Z]\s*\)|$)", re.DOTALL
)


def _read_pdf_text(pdf_path):
    """Return the text of all pages, each non-empty page followed by a newline."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
    # Pages for which no text could be extracted (None or "") are skipped.
    return "".join(text + "\n" for text in page_texts if text)


def _parse_question_block(block):
    """Turn one raw question block into a question dict, or None if it is not an MCQ."""
    block = block.strip()

    # Blocks without an "(A)" option, and section instructions such as
    # "... carry ONE mark ...", are not questions.
    if "(A)" not in block:
        return None
    if re.search(r"carry\s+\w+\s+mark", block, flags=re.IGNORECASE):
        return None

    # Only runs of two or more whitespace characters are collapsed; single
    # spaces and single newlines are kept as extracted.
    block = re.sub(r"\s{2,}", " ", block).strip()
    block = re.sub(r"Organizing Institute:.*", "", block, flags=re.IGNORECASE).strip()

    # Everything before the first "(A)" label is the stem, the rest the options.
    split_parts = re.split(r"\(\s*A\s*\)", block, maxsplit=1)
    if len(split_parts) < 2:
        return None

    question_text = split_parts[0].strip()
    options_text = "(A)" + split_parts[1]

    options_matches = _OPTION_PATTERN.findall(options_text)
    if not options_matches:
        return None

    options_dict = {
        letter: re.sub(r"\s{2,}", " ", opt_text).strip()
        for letter, opt_text in options_matches
    }

    return {
        "question": question_text.strip(" -:"),
        "options": options_dict,
    }


def extract_questions_dict(pdf_path):
    """Extract the multiple-choice questions contained in a PDF.

    The text of every page is concatenated, known boilerplate strings are
    removed, and the result is split on "Q.<n>" markers. Blocks that have no
    "(A)" option or that look like marking instructions are dropped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts, one per question, in document order. Each dict has
        two keys: "question" (the question stem as a string) and "options"
        (a dict mapping the option letter to its text). The rest of the
        notebook currently only uses "question" when building the database.
    """
    full_text = _read_pdf_text(pdf_path)
    for pattern in _BOILERPLATE_PATTERNS:
        full_text = pattern.sub("", full_text)

    parsed_blocks = (
        _parse_question_block(block)
        for block in _QUESTION_BLOCK_PATTERN.findall(full_text)
    )
    return [question for question in parsed_blocks if question is not None]

In [5]:
pdf_path = "context_paper.pdf"
ques = extract_questions_dict(pdf_path)  # one dict per question: {"question", "options"}
questions = [entry["question"] for entry in ques]  # question stems only

# Print every parsed question with its options so the extraction can be
# checked by eye.
separator = "\n" + "-" * 50 + "\n"
for number, entry in enumerate(ques, start=1):
    print(f"Question {number}:")
    print(entry["question"])
    for label, option_text in entry["options"].items():
        print(f"  {label}: {option_text}")
    print(separator)

Question 1:
If ‘→’ denotes increasing order of intensity, then the meaning of the words [sick → infirm → moribund] is analogous to [silly → _______ → daft]. Which one of the given options is appropriate to fill the blank?
  A: frown
  B: fawn
  C: vein
  D: vain

--------------------------------------------------

Question 2:
The 15 parts of the given figure are to be painted such that no two adjacent parts with shared boundaries (excluding corners) have the same color. The minimum number of colors required is
  A: 4
  B: 3
  C: 5
  D: 6

--------------------------------------------------

Question 3:
How many 4 -digit positive integers divisible by 3 can be formed using only the digits {1,3,4,6,7}, such that no digit appears more than once in a number?
  A: 24
  B: 48
  C: 72
  D: 12

--------------------------------------------------

Question 4:
The sum of the following infinite series is 2+1
2+1
3+1
4+1
8+1
9+1
16+1
27+⋯
  A: 11/3
  B: 7/2
  C: 13/4
  D: 9/2

----------------------

# Setting up Mistral to create embeddings

---

In [6]:
# The API key is read from the environment so the secret never lives in the
# notebook; a missing variable fails fast with a KeyError.
# NOTE(review): a key was previously hard-coded in this cell. Treat it as
# leaked and rotate it.
api_key = os.environ["MISTRAL_API_KEY"]
embedding_model = "mistral-embed"

mistral_client = Mistral(api_key=api_key)

def get_embeddings(questions):
    """Embed the given texts with the module-level Mistral client.

    Args:
        questions: The texts to embed; passed unchanged as the `inputs`
            argument of `mistral_client.embeddings.create`.

    Returns:
        The `data` attribute of the API response. An empty input returns an
        empty list without contacting the API.
    """
    # Nothing to embed: avoid a pointless (and possibly rejected) request.
    if not questions:
        return []

    response = mistral_client.embeddings.create(
        model=embedding_model,
        inputs=questions
    )
    return response.data

# Setting up Weaviate, the database

---

In [7]:
# Cluster URL and API key come from the environment so that no credentials are
# stored in the notebook; a missing variable fails fast with a KeyError.
# NOTE(review): credentials were previously hard-coded in this cell. Treat the
# old API key as leaked and rotate it.
url = os.environ["WEAVIATE_URL"]
key = os.environ["WEAVIATE_API_KEY"]

weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=url,
    auth_credentials=Auth.api_key(key)
)

if weaviate_client.is_ready():
    print("✅ Successfully connected to Weaviate Cloud!")
else:
    print("❌ Failed to connect. Check your URL and API Key.")

# Drop any previous copy of the collection so every run starts from an empty one.
weaviate_client.collections.delete("Llmexam")

✅ Successfully connected to Weaviate Cloud!


# Making a custom collection type in Weaviate that suits our data

---

In [8]:
collection_name = "Llmexam"

# `exists` asks the server about this one collection instead of downloading
# the configuration of every collection just to test membership.
if not weaviate_client.collections.exists(collection_name):
    weaviate_client.collections.create(
        name=collection_name,
        properties=[
            Property(name="question_text", data_type=DataType.TEXT),
            Property(name="question_embedding", data_type=DataType.NUMBER_ARRAY),
        ],
        # No server-side vectorizer: vectors are supplied explicitly on insert.
        vectorizer_config=None
    )
    print(f"✅ Collection '{collection_name}' created.")
else:
    print(f"⚡ Collection '{collection_name}' already exists.")

✅ Collection 'Llmexam' created.


# Calling everything and storing the embeddings

---

In [9]:
def store_questions(questions, embeddings):
    """Insert each question with its embedding into the `llmexam` collection.

    Relies on the module-level `llmexam` collection handle being defined
    before the function is called.

    Args:
        questions: Question texts.
        embeddings: One entry per question, either an object exposing an
            `.embedding` attribute or the raw vector itself.

    Questions and embeddings are paired with `zip`, so surplus items on either
    side are ignored; the printed count is the number actually inserted.
    """
    stored = 0
    for question, emb in zip(questions, embeddings):
        # Accept both API response items and plain vectors.
        vector = emb.embedding if hasattr(emb, "embedding") else emb
        llmexam.data.insert(
            properties={
                "question_text": question,
                "question_embedding": vector
            },
            vector=vector
        )
        stored += 1
    print(f"✅ {stored} questions stored successfully!")

In [10]:
# Use the same name the collection was created under instead of a differently
# cased literal.
llmexam = weaviate_client.collections.get(collection_name)

# Quick look at the first few questions that are about to be embedded.
print(questions[:5])

embeddings = get_embeddings(questions)
store_questions(questions, embeddings)

print("✅ Successfully stored questions in Weaviate.")

['If ‘→’ denotes increasing order of intensity, then the meaning of the words [sick → infirm → moribund] is analogous to [silly → _______ → daft]. Which one of the given options is appropriate to fill the blank?', 'The 15 parts of the given figure are to be painted such that no two adjacent parts with shared boundaries (excluding corners) have the same color. The minimum number of colors required is', 'How many 4 -digit positive integers divisible by 3 can be formed using only the digits {1,3,4,6,7}, such that no digit appears more than once in a number?', 'The sum of the following infinite series is 2+1\n2+1\n3+1\n4+1\n8+1\n9+1\n16+1\n27+⋯', 'In an election, the share of valid votes received by the four candidates A, B, C, and D is represented by the pie chart shown. The total number of votes cast in the election were 1,15,000, out of which 5,000 were invalid. Based on the data provided, t he total number of valid votes received by the candidates B and C is']
✅ 48 questions stored suc

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint so the tokenizer and the model
# cannot drift apart when the model is swapped.
MODEL_ID = "unsloth/Qwen2.5-14B-Instruct-1M-unsloth-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Step 3: Define a function to retrieve context from Weaviate
def retrieve_context(query_text, class_name="Llmexam", top_k=3):
    """Return the stored questions closest to `query_text`, one per line.

    Args:
        query_text: Text to embed and search with.
        class_name: Name of the Weaviate collection to query.
        top_k: Maximum number of questions to retrieve.

    Returns:
        The `question_text` of each hit, joined with newlines (an empty string
        when there are no hits).
    """
    # Embed the query with the same helper used for the stored questions.
    query_vector = get_embeddings([query_text])[0].embedding

    collection = weaviate_client.collections.get(class_name)
    result = collection.query.near_vector(near_vector=query_vector, limit=top_k)

    # The query result exposes its hits as `.objects`, each with a
    # `.properties` dict keyed by the names used at insert time. It is not a
    # GraphQL-style nested dict.
    return "\n".join(obj.properties["question_text"] for obj in result.objects)

def retrieve_all(class_name="Llmexam"):
    """Return the `question_text` of every object in the collection, one per line."""
    collection = weaviate_client.collections.get(class_name)
    return "\n".join(
        [stored.properties['question_text'] for stored in collection.iterator()]
    )


# Step 4: Define a function to generate a response
def generate_response(prompt, max_length=10000, include_prompt=True):
    """Generate text for `prompt` with the module-level `model` and `tokenizer`.

    Args:
        prompt: The full prompt text.
        max_length: Passed to `model.generate` as `max_length`.
        include_prompt: When True (the default, matching the previous
            behaviour) the decoded text starts with the prompt itself. When
            False only the newly generated continuation is returned.

    Returns:
        The decoded text of the first returned sequence, without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # `early_stopping` is intentionally not passed: it only applies to beam
    # search, which is not enabled here.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2
    )

    generated = outputs[0]
    if not include_prompt:
        # The returned sequence begins with the prompt tokens; drop them.
        generated = generated[len(inputs["input_ids"][0]):]
    return tokenizer.decode(generated, skip_special_tokens=True)

# Step 5: Combine everything
# Instruction given to the model. Alternative for subjective questions:
# query_text = "Generate 3 difficult subjective type questions based on the context paper."
query_text = "Generate 3 mcq type questions based on the context paper."

# Every stored question is used as context. Alternative that retrieves only
# the nearest questions:
# context = retrieve_context(query_text)
context = retrieve_all()

# Prompt layout: context first, then the instruction, then an open "Answer:"
# line for the model to continue.
prompt_template = "\nContext: {context}\n\nQuestion: {query_text}\n\nAnswer:\n"
prompt = prompt_template.format(context=context, query_text=query_text)

# Run the model and show what it produced.
response = generate_response(prompt)
print("LLM Response:", response)



LLM Response: 
Context: Consider the following statement: In adversarial search, 𝛼–𝛽 pruning can be applied to game trees of any depth where 𝛼 is the (m) value choice we have formed so far at any choice point along the path for the MAX player and 𝛽 is the (n) value choice we have formed so far at any choice point along the path for the MIN player. Which ONE of the following choices of (m) and (n) makes the above statement valid?
Consider the matrix 𝑴=[2−1
31]. Which ONE of the following statements is TRUE ?
Let game(ball, rugby) be true if the ball is used in rugby and false otherwise. Let shape(ball, round) be true if the ball is round and false otherwise. Consider the following logical sentences: s1: ∀ball ¬ game(ball, rugby) ⟹shape(ball, round) s2: ∀ball ¬ shape(ball, round) ⟹game(ball, rugby) s3: ∀ball game(ball, rugby) ⟹¬ shape(ball, round) s4: ∀ball shape(ball, round) ⟹¬ game(ball, rugby) Which of the following choices is/are logical representations of the assertion , “All balls 