In [1]:
import fitz
import os
import numpy as np
import json
import os
import torch
from sentence_transformers import SentenceTransformer
from openai import OpenAI

RuntimeError: Directory 'static/' does not exist

In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts the plain text of every page of a PDF file.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: The text of all pages, concatenated in page order.
    """
    # Open the PDF as a context manager so the document (and its underlying
    # file handle) is closed even if extraction fails part-way through.
    with fitz.open(pdf_path) as mypdf:
        # Collect the per-page text and join once, instead of repeatedly
        # growing a string with `+=`.
        page_texts = [
            mypdf[page_num].get_text("text")
            for page_num in range(mypdf.page_count)
        ]

    return "".join(page_texts)  # Return the extracted text

In [None]:
def chunk_text(text, n, overlap):
    """
    Chunks the given text into segments of n characters with overlap.

    Consecutive chunks start (n - overlap) characters apart, so the last
    `overlap` characters of one chunk are repeated at the start of the next.
    The final chunk may be shorter than n.

    Args:
    text (str): The text to be chunked.
    n (int): The number of characters in each chunk. Must be positive.
    overlap (int): The number of overlapping characters between chunks.
        Must satisfy 0 <= overlap < n.

    Returns:
    List[str]: A list of text chunks (empty if text is empty).

    Raises:
    ValueError: If n is not positive or overlap is outside [0, n).
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    # overlap >= n would make the step zero or negative (no forward progress);
    # a negative overlap would make the step larger than n and skip text.
    if not 0 <= overlap < n:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}"
        )

    step = n - overlap  # Distance between the starts of consecutive chunks
    return [text[start:start + n] for start in range(0, len(text), step)]

In [None]:
# OpenAI-compatible client pointed at the OpenRouter endpoint.
# The key is read from the OPENROUTER_API_KEY environment variable so that a
# real secret never has to be pasted into (and saved with) the notebook; the
# original placeholder is kept as the fallback value.
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.environ.get("OPENROUTER_API_KEY", "YOUR_OPENROUTER_KEY"),  # it's free
)

In [None]:
# Read the source document and split it into overlapping character windows
# (1000 characters per chunk, 200 characters shared between neighbours).
pdf_path = "data/AI_Information.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
text_chunks = chunk_text(extracted_text, n=1000, overlap=200)

# Report the chunk count and preview the first chunk
print("Number of Chunks", len(text_chunks))
print("\nFirst text chunk:")
print(text_chunks[0])

In [None]:
# SentenceTransformer instances already loaded in this session, keyed by model
# name, so each model is loaded at most once.
_EMBEDDING_MODELS = {}


def _load_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Returns the cached CPU SentenceTransformer for model_name, loading it on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name, device="cpu")
    return _EMBEDDING_MODELS[model_name]


def create_embeddings(text, model=None):
    """
    Creates embeddings for the given text with a sentence-transformers model.

    The default model is loaded lazily on the first call rather than when the
    function is defined, so defining the function is cheap and has no side
    effects.

    Args:
    text (str or List[str]): The input text (or list of texts) to embed.
    model: The model used for creating embeddings. May be an object exposing
        an `encode(text)` method (e.g. a SentenceTransformer instance), a
        model name (str) to load on CPU, or None for the default
        "all-MiniLM-L6-v2" model on CPU.

    Returns:
    The value returned by `model.encode(text)`. For a SentenceTransformer this
    is presumably a NumPy array by default (one row per input when a list is
    passed) -- confirm against the installed library version.
    """
    if model is None:
        model = _load_embedding_model()
    elif isinstance(model, str):
        model = _load_embedding_model(model)

    # Create embeddings for the input text using the resolved model
    return model.encode(text)

# Embed all text chunks in a single call using the default embedding model
response = create_embeddings(text=text_chunks)


In [None]:
def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): The first vector
    vec2 (np.ndarray): The second vector

    Returns:
    float: The cosine similarity between the two vectors, or 0.0 when either
    vector has zero length (the similarity is undefined in that case).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against division by zero, which would otherwise yield NaN and a
    # NumPy runtime warning for all-zero vectors.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [None]:
def semantic_search(query, text_chunks, embeddings, k=5):
    """
    Performs a semantic search on the text chunks using the given query and embeddings.

    The query is embedded with `create_embeddings`, compared against every
    chunk embedding with `cosine_similarity`, and the chunks with the highest
    scores are returned.

    Args:
    query (str): The query for the semantic search.
    text_chunks (List[str]): A list of text chunks to search through.
    embeddings (sequence of vectors): One embedding per entry of text_chunks,
        in the same order.
    k (int): The number of top relevant text chunks to return. Default is 5.

    Returns:
    List[str]: The top k most relevant text chunks, most similar first
    (fewer if there are fewer than k embeddings; empty if k <= 0).
    """
    # A non-positive k requests nothing; without this guard a negative k
    # would slice from the wrong end of the ranking.
    if k <= 0:
        return []

    # Embed and convert the query once; it is the same for every comparison.
    query_embedding = np.asarray(create_embeddings(query))

    # Calculate the similarity score between the query embedding and each chunk embedding
    similarity_scores = [
        (i, cosine_similarity(query_embedding, np.asarray(chunk_embedding)))
        for i, chunk_embedding in enumerate(embeddings)
    ]

    # Highest similarity first; the sort is stable, so ties keep chunk order.
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return [text_chunks[index] for index, _ in similarity_scores[:k]]

In [None]:
# Load the validation set. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('data/val.json', encoding='utf-8') as f:
    data = json.load(f)

# Use the question of the first validation entry as the search query
query = data[0]['question']

# Retrieve the two chunks most similar to the query
top_chunks = semantic_search(query,text_chunks,response,k=2)

print("Query:", query)

# Show each retrieved chunk with a numbered header
for i,chunk in enumerate(top_chunks):
    print(f"Context {i+1}:\n{chunk}\n =============================")

In [None]:
# Instruction that restricts the assistant to answering from the supplied context only
system_prompt = (
    "You are an AI assistant that strictly answers based on the given context. "
    "If the answer cannot be derived directly from the provided context, "
    "respond with: 'I do not have enough information to answer that.'"
)

def generate_response(system_prompt,user_message,model="google/gemini-2.0-flash-thinking-exp-1219:free"):
    """
    Sends a system prompt and a user message to the chat model and returns the reply.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): Identifier of the chat model to call.
        Default is "google/gemini-2.0-flash-thinking-exp-1219:free".

    Returns:
    The object returned by `client.chat.completions.create`; callers in this
    notebook read the generated text from `.choices[0].message.content`.
    """
    # Conversation sent to the model: the system instruction, then the user turn
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    return client.chat.completions.create(model=model, messages=conversation)

# Build the user message: each retrieved chunk under a numbered header,
# followed by the question itself
user_prompt = "\n".join(
    f"Context {i + 1}:\n{chunk}\n=====================================\n"
    for i, chunk in enumerate(top_chunks)
)
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Ask the model to answer the question from the retrieved context
ai_response = generate_response(system_prompt, user_prompt)

In [None]:
# Grading rubric given to the evaluator model: 1 for a close match, 0 for an
# incorrect answer, 0.5 for a partial match
evaluate_system_prompt = (
    "You are an intelligent evaluation system tasked with assessing the AI assistant's responses. "
    "If the AI assistant's response is very close to the true response, assign a score of 1. "
    "If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. "
    "If the response is partially aligned with the true response, assign a score of 0.5."
)

# Assemble the evaluation message from the query, the model's answer, the
# reference answer of the first validation entry, and the rubric
evaluation_prompt = (
    f"User Query: {query}\n"
    f"AI Response:\n{ai_response.choices[0].message.content}\n"
    f"True Response: {data[0]['ideal_answer']}\n"
    f"{evaluate_system_prompt}"
)

# Ask the model to grade the answer, using the rubric as the system prompt
evaluation_response = generate_response(evaluate_system_prompt, evaluation_prompt)

# Show the evaluator's verdict
print(evaluation_response.choices[0].message.content)