In [None]:
# Install the two third-party packages imported below that this notebook installs itself:
# faiss (CPU build, used for the vector index) and pytrec_eval (used for the retrieval metrics).
!pip install faiss-cpu
!pip install pytrec_eval

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import json
import os
import pytrec_eval
import csv

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load model and tokenizer from Kaggle input directory
model_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"  # Replace with your actual dataset name and model directory

# Load the tokenizer. No dtype is passed here: `torch_dtype` describes model weights,
# it is not a tokenizer option.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the EOS token as the padding token so that `padding=True` can be used when tokenizing
tokenizer.pad_token = tokenizer.eos_token

# Load the model weights in half precision, move them to the selected device and
# explicitly select inference mode. Assigning the result (instead of leaving a bare
# `model.to(device)` as the last expression) keeps the cell from printing the whole
# model architecture as its output.
model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float16)
model = model.to(device).eval()

In [None]:
def encode_document(document):
    """
    Embed a single text as the L2-normalised hidden state of its last token.

    Relies on the module-level `tokenizer`, `model` and `device`.

    :param document: Text to embed; it is passed through `str()` before tokenization
    :return: 1-D float32 numpy array holding the normalised embedding
    """
    inputs = tokenizer(str(document), padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        # Forward pass for the single document
        outputs = model(**inputs)
        # Only one sequence is in the batch (row 0); take the hidden state at its last position
        embeddings = outputs.last_hidden_state[0][-1]

        # Normalise in float32 rather than in the model's half precision: float16 has
        # limited range/precision for the squared sum, and the default eps of
        # `normalize` (1e-12) is below the smallest positive float16 value.
        embeddings = torch.nn.functional.normalize(embeddings.float(), p=2, dim=0)

    torch.cuda.empty_cache()  # Release cached GPU memory between documents
    return embeddings.cpu().numpy()

In [None]:
# The original dataset is loaded so the target document text can be output after the search

# Location of the input JSON file
path_to_json = '/kaggle/input/financial-times/merged_output.json'

# Parse the whole JSON file into memory
with open(path_to_json, 'r') as file:
    data = json.load(file)

# Map every DOCNO to its TEXT (if a DOCNO repeats, the later entry wins)
documents = {entry["DOCNO"]: entry["TEXT"] for entry in data}


In [None]:
# Load dense vectors from CSV file
def load_vectors_from_csv(file_path):
    """
    Read document ids and their dense vectors from a CSV file.

    The first row is always treated as a header and skipped. In every following row
    the first column is the id and the remaining columns are the vector components.
    Blank rows are ignored.

    :param file_path: Path to the CSV file
    :return: Tuple of (ids, vectors): a list of id strings and a float32 numpy array
             with one row per id (an empty array when the file has no data rows)
    """
    ids = []
    vectors = []
    # newline='' is the file mode the csv module expects for reading
    with open(file_path, 'r', newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from raising StopIteration
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no id to read from it
                continue
            ids.append(row[0])
            vectors.append([float(value) for value in row[1:]])
    return ids, np.array(vectors, dtype=np.float32)

# Load the precomputed document ids and dense vectors
ids, doc_vectors = load_vectors_from_csv('/kaggle/input/llama3-2-1b-instruct-embeddings/llama3_2_1b_instruct__fin_times_embeddings.csv')

# Create a flat inner-product FAISS index. The vector dimension is read straight from the
# numpy array: the index is built and searched on the CPU, so copying the whole embedding
# matrix to the GPU just to look at its shape would only take memory away from the model.
# NOTE: inner product equals cosine similarity only if the stored vectors are already
# L2-normalised, which this notebook assumes and does not re-check.
index = faiss.IndexFlatIP(doc_vectors.shape[1])
# Add vectors to the index; row i of the index corresponds to ids[i]
index.add(doc_vectors)

In [None]:
def retrieve(query_embeddings, top_k=10):
    """
    Retrieve top-k most similar documents for one or more queries.

    Uses the module-level FAISS `index` and the parallel `ids` list.

    :param query_embeddings: Numpy array of query embeddings, either a single 1-D
                             vector or a 2-D array with one query per row
    :param top_k: Number of documents to retrieve per query
    :return: Dictionary mapping the query's row position (as a string) to a
             dictionary of {document id: similarity score}; scores are plain
             Python floats, the value type pytrec_eval expects
    """
    # FAISS works on C-contiguous float32 matrices; this is a no-op for conforming input
    query_embeddings = np.ascontiguousarray(query_embeddings, dtype=np.float32)

    # Ensure query embeddings are 2D
    if query_embeddings.ndim == 1:
        query_embeddings = query_embeddings.reshape(1, -1)

    # Search index
    distances, indices = index.search(query_embeddings, top_k)

    # Convert results to dictionary format for pytrec_eval
    results = {}
    for i, (doc_indices, doc_distances) in enumerate(zip(indices, distances)):
        # Use query index as string key
        results[str(i)] = {
            # float() turns the numpy scalar into a native Python float
            ids[idx]: float(dist)
            for idx, dist in zip(doc_indices, doc_distances)
            # -1 marks an empty result slot (fewer than top_k hits)
            if idx != -1
        }

    return results

def load_qrels(qrels_path):
    """
    Load relevance judgments from a TREC qrels file.

    Each non-blank line must hold four whitespace-separated fields:
    query_id, an ignored field, doc_id and an integer relevance.
    Blank lines are skipped.

    :param qrels_path: Path to qrels file
    :return: Dictionary of relevance judgments: {query_id: {doc_id: relevance}}
    :raises ValueError: If a non-blank line does not have exactly four fields or
                        its relevance is not an integer
    """
    qrels = {}
    with open(qrels_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline at end of file)
                continue
            query_id, _, doc_id, relevance = fields
            qrels.setdefault(query_id, {})[doc_id] = int(relevance)
    return qrels

def load_queries(queries_file):
    """
    Read query texts and query ids from a JSON file.

    The file is expected to contain a sequence of objects. For each object the
    'text' field (default: empty string) and the 'id' field (default: the
    object's position as a string) are collected.

    :param queries_file: Path to queries JSON file
    :return: Tuple of (query_texts, query_ids), both lists in file order
    """
    with open(queries_file, 'r') as handle:
        records = json.load(handle)

    texts = []
    identifiers = []
    for position, record in enumerate(records):
        texts.append(record.get('text', ''))
        identifiers.append(record.get('id', str(position)))

    return texts, identifiers

In [None]:
def compute_aggregated_measures(metrics):
    """
    Summarise per-query metric values across all queries.

    For each tracked metric that appears in at least one query, the mean, median
    and standard deviation over the queries reporting it are stored under
    '<metric>_mean', '<metric>_median' and '<metric>_std'.

    :param metrics: Dictionary of per-query metrics from pytrec_eval
    :return: Dictionary of aggregated metrics
    """
    tracked = (
        'ndcg', 'map', 'recip_rank',
        'P_5', 'P_10',
        'recall_5', 'recall_10',
    )
    reducers = (('mean', np.mean), ('median', np.median), ('std', np.std))

    summary = {}
    for name in tracked:
        # Values of this metric from every query that reports it
        scores = [per_query[name] for per_query in metrics.values() if name in per_query]
        if not scores:
            # Metric absent everywhere: leave it out of the summary entirely
            continue
        for suffix, reduce_fn in reducers:
            summary[f'{name}_{suffix}'] = reduce_fn(scores)

    return summary

In [None]:
# Inputs: the queries to run and the relevance judgments to score against
query_texts, query_ids = load_queries('/kaggle/input/query-and-qrels/queries.json')
qrels = load_qrels('/kaggle/input/query-and-qrels/filtered_data.txt')

# Embed every query with encode_document and collect the vectors into one float32 array
query_embeddings = np.array(
    [encode_document(text) for text in query_texts],
    dtype=np.float32,
)

# Search the index (default top_k)
run = retrieve(query_embeddings)

# retrieve() keys its results by query position; swap in the real query ids
run_with_query_ids = {}
for position, scored_docs in run.items():
    run_with_query_ids[query_ids[int(position)]] = scored_docs

evaluator = pytrec_eval.RelevanceEvaluator(
    qrels,
    {'ndcg', 'map', 'recip_rank', 'P_5', 'P_10', 'recall_5', 'recall_10'},
)

# Coerce ids to str and scores to float before handing the run to the evaluator
corrected_version = {}
for query_id, doc_scores in run_with_query_ids.items():
    corrected_version[str(query_id)] = {
        str(doc_id): float(score) for doc_id, score in doc_scores.items()
    }

metrics = evaluator.evaluate(corrected_version)

print("Aggregated Metrics:")
aggregated_measures = compute_aggregated_measures(metrics)

# Report the aggregates in alphabetical order of their names
for metric, value in sorted(aggregated_measures.items()):
    print(f"{metric}: {value}")

In [None]:

def save_metrics_to_csv(metrics, aggregated_measures, model_name="llama3_2_1b_instruct", base_path="/kaggle/working", mode='w'):
    """
    Write one row of aggregated metrics for a model to `<base_path>/model_metrics.csv`.

    The row is the model name followed by the aggregated values in alphabetical
    order of their metric names. A header row is written first whenever the file
    starts out empty: in write mode, when the file does not exist yet, or when it
    exists but has no content. Failures are reported with a printed message
    instead of being raised.

    :param metrics: Per-query metrics; accepted for interface compatibility, not used here
    :param aggregated_measures: Dictionary of {metric name: aggregated value}
    :param model_name: Label written in the first column of the row
    :param base_path: Directory that holds model_metrics.csv
    :param mode: File mode passed to open(); 'w' overwrites, 'a' appends
    :return: None
    """
    aggregated_metrics_path = f"{base_path}/model_metrics.csv"

    try:
        # An existing but empty file still needs a header, so check its size as well
        needs_header = (
            mode == 'w'
            or not os.path.exists(aggregated_metrics_path)
            or os.path.getsize(aggregated_metrics_path) == 0
        )
        # One fixed ordering shared by the header and the value row keeps columns aligned
        metric_names = sorted(aggregated_measures.keys())

        with open(aggregated_metrics_path, mode, newline='') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(['model_name'] + metric_names)
            writer.writerow([model_name] + [aggregated_measures[name] for name in metric_names])
        print(f"Metrics saved to: {aggregated_metrics_path}")
    except Exception as e:
        # Best-effort save: report the problem without interrupting the notebook
        print(f"Error saving metrics: {e}")

# Usage example: mode='a' appends this run's row to the existing metrics file
# instead of overwriting it.
save_metrics_to_csv(
    metrics,
    aggregated_measures,
    model_name="llama3_2_1b_instruct",
    mode='a',
)
