In [1]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
from sklearn.preprocessing import normalize

import ollama

In [2]:
def load_json_data(json_path):
    """Read the UTF-8 encoded JSON file at ``json_path`` and return the parsed value.

    Args:
        json_path: Path of the JSON file to open for reading.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(json_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed


In [3]:
def save_json_data(path, data):
    """Serialize ``data`` as JSON into the file at ``path``, overwriting it.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        path: Destination file path (opened in write mode).
        data: JSON-serializable object to write.
    """
    dump_options = {'indent': 2, 'ensure_ascii': False}
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)

In [4]:
# TF-IDF vector creation
def create_tfidf_vectors_optimized(data):
    """Fit a TF-IDF model over all entries and return it with the document matrix.

    Each entry in ``data`` is modified in place and gains two keys:

    * ``'tfidf_doc'``: ``question``, a space, then ``code`` (the indexed text).
    * ``'query_str'``: ``code``, a newline, then ``question``.

    Args:
        data: Iterable of dicts, each providing ``'question'`` and ``'code'``.

    Returns:
        A ``(vectorizer, normalized_matrix)`` tuple: the fitted
        ``TfidfVectorizer`` (English stop words) and the result of passing its
        ``fit_transform`` output through ``normalize``.
    """
    corpus = []
    for record in data:
        question_text = record['question']
        code_text = record['code']
        record['tfidf_doc'] = ' '.join((question_text, code_text))
        record['query_str'] = '\n'.join((code_text, question_text))
        corpus.append(record['tfidf_doc'])

    vectorizer = TfidfVectorizer(stop_words='english')
    # NOTE(review): TfidfVectorizer L2-normalizes rows by default, so this
    # extra normalize() is probably a no-op -- confirm before removing.
    normalized_matrix = normalize(vectorizer.fit_transform(corpus))
    return vectorizer, normalized_matrix

In [5]:
# Retrieve top documents by similarity
def retrieve_top_documents_optimized(query, query_index, data, vectorizer, normalized_matrix, top_n=5,
                                     exclude_self=False):
    """Return up to ``top_n`` entries of ``data`` ranked by similarity to ``query``.

    The query is vectorized with ``vectorizer``, normalized, and compared with
    every row of ``normalized_matrix`` via a dot product. Only entries with a
    strictly positive similarity are returned, most similar first.

    Args:
        query: Query text to vectorize.
        query_index: Position in ``data`` of the entry the query came from.
            Only used when ``exclude_self`` is true.
        data: Sequence of entries aligned row-for-row with ``normalized_matrix``.
        vectorizer: Fitted object exposing ``transform``.
        normalized_matrix: Document matrix supporting ``.T`` and ``@``.
        top_n: Maximum number of entries to return. Values larger than the
            number of documents are clamped instead of raising.
        exclude_self: When true (and ``query_index`` is not ``None``), the
            entry at ``query_index`` is never returned. Defaults to ``False``,
            which keeps the previous behaviour of allowing it.

    Returns:
        A list of entries from ``data`` (possibly empty).
    """
    query_vec = normalize(vectorizer.transform([query]))
    similarities = (query_vec @ normalized_matrix.T).toarray().flatten()

    if exclude_self and query_index is not None:
        # A negative score can never pass the "> 0" filters below.
        similarities[query_index] = -1

    n_docs = similarities.shape[0]
    k = min(top_n, n_docs)
    if k <= 0 or not np.any(similarities > 0):
        return []

    if k < n_docs:
        # Partial selection of the k best candidates (unordered at this point).
        candidates = np.argpartition(-similarities, k)[:k]
    else:
        # argpartition requires kth < n_docs; with k == n_docs every document
        # is a candidate, so skip the partition step entirely.
        candidates = np.arange(n_docs)

    ranked = candidates[np.argsort(-similarities[candidates])]
    return [data[i] for i in ranked if similarities[i] > 0]

In [6]:
# ---- Config: uncomment exactly one SYSTEM_MODEL line to select the model ---- #
# SYSTEM_MODEL is passed as the `model` argument of ollama.chat.

SYSTEM_MODEL = 'llama3.1:8b'
#SYSTEM_MODEL = 'codellama:7b'
#SYSTEM_MODEL = 'mistral:7b'
#SYSTEM_MODEL = 'deepseek-coder:6.7b'

In [7]:
def generate_response_with_chain_of_thought(query, documents):
    """Ask the configured Ollama model for a short answer to ``query``.

    The retrieved ``documents`` are flattened into a context string of the form
    ``"<question> Hence, <answer>."`` (space-separated) and sent together with
    the query as the user message, alongside a fixed system prompt.

    Note: despite the function name, the prompts request a one-to-five word
    answer without explanations.

    Args:
        query: The question text to answer.
        documents: Iterable of dicts providing ``'question'`` and ``'answer'``.

    Returns:
        The ``['message']['content']`` field of the ``ollama.chat`` response.
    """
    examples = []
    for doc in documents:
        examples.append(f"{doc['question']} Hence, {doc['answer']}.")
    context = ' '.join(examples)

    user_message = "\n\n".join([
        f"Question: {query}",
        f"Context: {context}",
        "Answer in one to five words only.",
    ])

    # Each item becomes one newline-terminated line of the system prompt.
    instruction_lines = (
        "You are a code comprehension assistant. Your task is to analyze the code-related questions and the provided answers "
        "from previous examples, reason based on those, and generate a concise response to the new question.",
        "- Base your answer strictly on the provided question-answer context.",
        "- Your response should be one to five words.",
        "- Do not include explanations or restate the question.",
    )
    system_prompt = "".join(f"{line}\n" for line in instruction_lines)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    response = ollama.chat(model=SYSTEM_MODEL, messages=conversation)
    return response['message']['content']


In [8]:
# Input dataset; its entries are read through the 'question', 'code' and 'answer' keys.
json_path = 'CodeQA_small_dataset.json'
# Destination file for the generated predictions.
# NOTE(review): the name is fixed to "Llama3" although SYSTEM_MODEL is
# configurable -- confirm/rename when running with a different model.
output_path = 'Llama3_CodeQA_predictions.json'

data = load_json_data(json_path)
# Fits the TF-IDF model over every entry; this call also adds the
# 'tfidf_doc' and 'query_str' keys to each entry of `data` in place.
vectorizer, normalized_matrix = create_tfidf_vectors_optimized(data)

In [9]:
# Output records accumulated so far; rewritten to output_path at each checkpoint.
updated_data = []
# Checkpoint interval: the output file is rewritten after every `save_every` entries.
save_every = 25
total_entries = len(data)

for i, entry in enumerate(tqdm(data, desc='Generating predictions')):
    query = entry['query_str']
    processed = i + 1

    # NOTE(review): this call does not exclude the query's own entry from the
    # results, so that entry's question/answer may end up in the prompt
    # context -- confirm this is intended for the evaluation.
    relevant_docs = retrieve_top_documents_optimized(
        query, i, data, vectorizer, normalized_matrix, top_n=5
    )

    # Fallback text is kept unless retrieval found something to prompt with.
    prediction = 'No relevant documents found.'
    if relevant_docs:
        try:
            prediction = generate_response_with_chain_of_thought(query, relevant_docs)
        except Exception as err:
            # Best effort: record the failure as the prediction and keep going.
            prediction = f'Error: {str(err)}'

    entry['prediction'] = prediction
    # Keep only the fields meant for the output file (drops the helper keys).
    updated_data.append(
        {field: entry[field] for field in ("code", "question", "answer", "prediction")}
    )

    reached_checkpoint = processed % save_every == 0
    if reached_checkpoint or processed == total_entries:
        try:
            save_json_data(output_path, updated_data)
        except Exception as err:
            print(f"Warning: Failed to save at entry {processed}: {str(err)}")

# Final save
save_json_data(output_path, updated_data)
print(f"\n Predictions saved to: {output_path}")


Generating predictions:   0%|          | 0/10 [00:00<?, ?it/s]


 Predictions saved to: Llama3_CodeQA_predictions.json
