In [1]:
!pip install faiss-gpu
!pip install pytrec_eval

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308217 sha256=dc461848bb450ae3cb552b29ce926e0da910cb355b419fd61117077a26813212
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Inst

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import json
import os
import pytrec_eval

In [3]:
# Set the environment variable for PyTorch CUDA memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Check if GPU is available and set device accordingly
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

# Load the tokenizer and model
model_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_path)


# Extract the embedding layer
embedding_layer = model.get_input_embeddings()  # works for LLama 3.2
del model

embedding_layer.to(device)

True


Embedding(128256, 2048)

In [5]:
# Encode documents into dense vectors
def encode_documents(documents):
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs['input_ids']).mean(dim=1) # Mean pooling on GPU
        embeddings /= embeddings.norm(dim=1, keepdim=True) # Normalize embeddings

    del inputs
    return embeddings.cpu().numpy()

In [6]:
# original dataset was loaded to output target document after the seach

# Path to your input JSON file
path_to_json = '/kaggle/input/merget-times/merged_output.json'

# Load your JSON data from a file
with open(path_to_json, 'r') as file:
    data = json.load(file)

# Create a dictionary where DOCNO is the key and TEXT is the value
documents = {}
for entry in data:
    key = entry["DOCNO"]
    value = entry["TEXT"]
    documents[key] = value


In [7]:
import csv
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load dense vectors from CSV file
def load_vectors_from_csv(file_path):
    ids = []
    vectors = []
    with open(file_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Skip the header row if it exists
        for row in reader:
            ids.append(row[0])  # Assuming the first column is IDs
            vectors.append([float(value) for value in row[1:]])  # Remaining columns are vector values
    vectors = np.array(vectors, dtype=np.float32)
    return ids, vectors

# Load dense vectors
ids, doc_vectors = load_vectors_from_csv('/kaggle/input/financial-timel-llama3-2-1b-instruct-dense-vectors/document_embeddings_Llama3_2_1b_instruct.csv')

# Move document vectors to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
doc_vectors_tensor = torch.tensor(doc_vectors).to(device)

# Create FAISS index for efficient similarity search (cosine similarity)
gpu_res = faiss.StandardGpuResources()  # Create resources for managing GPU memory
index = faiss.IndexFlatIP(doc_vectors_tensor.shape[1])  # Inner product index (for cosine similarity)

# Transfer the index to GPU and add document vectors directly
gpu_index = faiss.index_cpu_to_gpu(gpu_res, 0, index)  # Transfer index to GPU
gpu_index.add(doc_vectors_tensor.cpu().numpy())  # Add document vectors (needs numpy array)



In [9]:
# Function to perform a search query using GPU index
def search(query, k=5):
    
    query_vector = encode_documents(query)
    
    # Search using FAISS index directly on GPU
    distances, indices = gpu_index.search(query_vector, k)  
    
    return indices[0], distances[0]

In [10]:
def load_qrels(qrels_path):
    """
    Load relevance judgments from TREC qrels file
    
    :param qrels_path: Path to qrels file
    :return: Dictionary of relevance judgments
    """
    qrels = {}
    with open(qrels_path, 'r') as f:
        for line in f:
            # Typical TREC qrels format: query_id 0 doc_id relevance
            query_id, _, doc_id, relevance = line.strip().split()
            if query_id not in qrels:
                qrels[query_id] = {}
            qrels[query_id][doc_id] = int(relevance)
    return qrels

def load_queries(queries_file):
    """
    Load queries from a file
    
    :param queries_file: Path to queries JSON file
    :return: Tuple of (query_texts, query_ids)
    """
    with open(queries_file, 'r') as f:
        queries_data = json.load(f)
    
    # Assuming JSON structure with 'text' and 'id' fields
    query_texts = [query.get('text', '') for query in queries_data]
    query_ids = [query.get('id', str(idx)) for idx, query in enumerate(queries_data)]

    return query_texts, query_ids

In [11]:
def retrieve(query_embeddings, top_k=10):
        """
        Retrieve top-k most similar documents for multiple queries
        
        :param query_embeddings: Numpy array of query embeddings
        :param top_k: Number of documents to retrieve
        :return: Dictionary of results for pytrec_eval
        """
        # Ensure query embeddings are 2D
        if query_embeddings.ndim == 1:
            query_embeddings = query_embeddings.reshape(1, -1)
        
        # Search index
        distances, indices = gpu_index.search(query_embeddings, top_k)
        
        # Convert results to dictionary format for pytrec_eval
        results = {}
        for i, (doc_indices, doc_distances) in enumerate(zip(indices, distances)):
            # Use query index as string key
            query_key = str(i)
            results[query_key] = {
                #ids[idx]: float(1 / (1 + dist)) 
                ids[idx]: dist
                for idx, dist in zip(doc_indices, doc_distances) 
                if idx != -1
            }
        
        return results

In [12]:
def compute_aggregated_measures(metrics):
    """
    Manually compute aggregated measures across queries
    
    :param metrics: Dictionary of per-query metrics from pytrec_eval
    :return: Dictionary of aggregated metrics
    """
    aggregated_metrics = {}
    
    # Metrics to aggregate
    metric_keys = [
        'ndcg', 'map', 'recip_rank', 
        'P_5', 'P_10', 'P_20', 
        'recall_5', 'recall_10', 'recall_20'
    ]
    
    for metric in metric_keys:
        # Collect all values for this metric
        metric_values = []
        for query_metrics in metrics.values():
            if metric in query_metrics:
                metric_values.append(query_metrics[metric])
        
        # Compute aggregation methods
        if metric_values:
            aggregated_metrics[f'{metric}_mean'] = np.mean(metric_values)
            aggregated_metrics[f'{metric}_median'] = np.median(metric_values)
            aggregated_metrics[f'{metric}_std'] = np.std(metric_values)
    
    return aggregated_metrics

In [34]:
query_texts, query_ids = load_queries('/kaggle/input/query-and-qrels/queries.json')

qrels = load_qrels('/kaggle/input/query-and-qrels/filtered_data.txt')

query_embeddings = encode_documents(query_texts)

run = retrieve(query_embeddings)

#print(run)

run_with_query_ids = {
        query_ids[int(k)]: v for k, v in run.items()
    }


evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, 
        {
            'ndcg', 'map', 'recip_rank', 
            'P_5', 'P_10', 'P_20', 
            'recall_5', 'recall_10', 'recall_20'
        }
    )

corrected_version = {
    str(query_id): {str(doc_id): 1 for doc_id, score in doc_scores.items()}
    for query_id, doc_scores in run_with_query_ids.items()
}

#print(run_with_query_ids)

metrics = evaluator.evaluate(corrected_version)

print("Aggregated Metrics:")
aggregated_measures = compute_aggregated_measures(
        metrics
    )

for metric, value in sorted(aggregated_measures.items()):
        print(f"{metric}: {value}")



Aggregated Metrics:
P_10_mean: 0.006711409395973154
P_10_median: 0.0
P_10_std: 0.02990910061094925
P_20_mean: 0.003355704697986577
P_20_median: 0.0
P_20_std: 0.014954550305474625
P_5_mean: 0.013422818791946308
P_5_median: 0.0
P_5_std: 0.0598182012218985
map_mean: 0.004852357827458591
map_median: 0.0
map_std: 0.042137814799734566
ndcg_mean: 0.008454156342145521
ndcg_median: 0.0
ndcg_std: 0.056357643355608514
recall_10_mean: 0.005572675687898926
recall_10_median: 0.0
recall_10_std: 0.04271952271257743
recall_20_mean: 0.005572675687898926
recall_20_median: 0.0
recall_20_std: 0.04271952271257743
recall_5_mean: 0.005572675687898926
recall_5_median: 0.0
recall_5_std: 0.04271952271257743
recip_rank_mean: 0.030425055928411632
recip_rank_median: 0.0
recip_rank_std: 0.1507961909801089


In [23]:
# Example search query
queries = ["Which bank decrease the mortage rate 11.5 per cent to 10.95", #FT921-11403
          "Who is the minister of international economy in hungary at 1990", #FT911-226
          "The situation and potential of Turkish/Turkey banks", #FT922-6909
           "Who is the other bank that cooperates with Austrian Airlines in Europe?", #FT922-6946
           "managing director of Renison Goldfields in 1992", #FT923-13976
           "who will become to chief execute London-quoted",#FT923-14206
           "how many people expected to on careers services at 92 march",#FT923-14215
           "How does Lawler define subsidiarity, and how is it applied in high-involvement organizations",#FT924-12138
           "What alternative solution does M. C. Kennedy propose to manage the fiscal situation without increasing taxes",#FT931-7937
           "How do international bond funds differ from gilt unit trusts in terms of risks and returns" #FT931-8107
          ]

exacts = ["FT921-11403", "FT911-226", "FT922-6909", "FT922-6946", 
          "FT923-13976", "FT923-14206", "FT923-14215", "FT924-12138", "FT931-7937", "FT931-8107"]

i = 0
for query in queries:
    print(query)
    print(exacts[i])
    indices, distances = search(query, 5)
    #print(indices)
    #print(distances)
    # Retrieve and display results safely
    for idx in range(len(indices)):
        print(f"Document: {ids[indices[idx]]} Distance: {distances[idx]}")
        #ind = ids[indices[idx]].replace(" ", "")
        if ids[indices[idx]] == exacts[i]:
            print("Exact document found in ranked document list")
        # the code below also prints document content
        # print(f"Document: {ids[indices[idx]]} Distance: {distances[idx]} content: {documents[ids[indices[idx]]]}")  
    i += 1
    



Which bank decrease the mortage rate 11.5 per cent to 10.95
FT921-11403
Document: FT932-14632 Distance: 0.8512439131736755
Document: FT933-10336 Distance: 0.850742757320404
Document: FT933-8586 Distance: 0.849536120891571
Document: FT943-9027 Distance: 0.8494161367416382
Document: FT933-12337 Distance: 0.8490382432937622
Who is the minister of international economy in hungary at 1990
FT911-226
Document: FT932-3841 Distance: 0.7925577759742737
Document: FT942-526 Distance: 0.7920131683349609
Document: FT934-13423 Distance: 0.7915689945220947
Document: FT944-8638 Distance: 0.7912970185279846
Document: FT941-7543 Distance: 0.7905198931694031
The situation and potential of Turkish/Turkey banks
FT922-6909
Document: FT943-95 Distance: 0.5412183403968811
Document: FT932-2641 Distance: 0.5361122488975525
Document: FT922-2836 Distance: 0.5327141284942627
Document: FT941-10738 Distance: 0.5316784977912903
Document: FT921-9052 Distance: 0.5314403772354126
Who is the other bank that cooperates wit