In [2]:
!pip uninstall torchvision
!pip install torchvision

SyntaxError: invalid syntax (2358569611.py, line 1)

In [1]:
from tqdm import tqdm
import json
import ir_datasets
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import pipeline
import torch
from huggingface_hub import login

# Never commit access tokens to a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable instead. When the variable is unset, `login`
# receives None and asks for the token interactively.
# NOTE(review): the token that was previously hard-coded here should be revoked.
api_key = os.environ.get("HF_TOKEN")
login(token=api_key)

# Select the torch device string and report the choice
def get_device():
    """Return "cuda" when torch reports an available CUDA device, else "cpu".

    Prints the selected device; for CUDA the current device's name is included.
    """
    if not torch.cuda.is_available():
        print("Using CPU")
        return "cpu"

    gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())
    print(f"Using GPU: {gpu_name}")
    return "cuda"

# Resolve the device once; later cells read this module-level `device` when
# loading the SentenceTransformer model and the text-generation pipeline.
device = get_device()


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
operator torchvision::nms does not exist

In [None]:
# SECTION 1: DATASET LOADING AND PREPARATION
# Load dataset (the log message names the dataset that is actually loaded)
print("Loading the TREC-COVID (CORD-19) dataset...")
dataset = ir_datasets.load("cord19/trec-covid")

# Prepare documents and queries: keep only the fields used downstream
# (doc_id + abstract for documents, query_id + title for queries).
print("Preparing documents and queries...")
all_docs = [{"doc_id": doc.doc_id, "abstract": doc.abstract} for doc in dataset.docs_iter()]
all_queries = [{"query_id": query.query_id, "title": query.title} for query in dataset.queries_iter()]

# Whitespace tokenization of every abstract; this is the corpus handed to BM25.
tokenized_docs = [doc['abstract'].split() for doc in all_docs]

In [None]:
# SECTION 2: EMBEDDINGS GENERATION
# Load or generate embeddings
def generate_embeddings():
    """Return ``(doc_embeddings, query_embeddings)`` for the loaded corpus.

    If both CSV cache files exist in the working directory they are loaded;
    otherwise the document abstracts and query titles are encoded with the
    "all-MiniLM-L6-v2" SentenceTransformer model and the result is written to
    the cache files.

    Reads the module-level ``all_docs``, ``all_queries`` and ``device``.

    Returns:
        Tuple of two 2-D arrays: one row per document and one row per query,
        in the same order as ``all_docs`` / ``all_queries``.
    """
    doc_cache = "trec_covid_doc_embeddings.csv"
    query_cache = "trec_covid_query_embeddings.csv"

    if os.path.exists(doc_cache) and os.path.exists(query_cache):
        # NOTE: cache files produced before the encode() inputs were fixed
        # (see below) do not contain text embeddings and must be deleted so
        # they are regenerated.
        print("Loading precomputed embeddings...")
        doc_embeddings = pd.read_csv(doc_cache).values
        query_embeddings = pd.read_csv(query_cache).values
    else:
        print("Generating embeddings with SentenceTransformer...")
        model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

        # encode() must receive the text itself. Passing the raw dicts would
        # not embed the abstract/title (sentence-transformers reads only the
        # first value of a dict input, here the ID — verify for the installed
        # version).
        doc_texts = [doc["abstract"] for doc in all_docs]
        query_texts = [query["title"] for query in all_queries]
        doc_embeddings = model.encode(doc_texts, batch_size=32, show_progress_bar=True)
        query_embeddings = model.encode(query_texts, batch_size=32, show_progress_bar=True)

        # Save embeddings for future use
        pd.DataFrame(doc_embeddings).to_csv(doc_cache, index=False)
        pd.DataFrame(query_embeddings).to_csv(query_cache, index=False)

    return doc_embeddings, query_embeddings

doc_embeddings, query_embeddings = generate_embeddings()

In [None]:
# SECTION 3: RETRIEVAL IMPLEMENTATION
# BM25 Sparse Retrieval
def bm25_retrieve(query, bm25, tokenized_docs, top_k=5):
    """Rank documents for ``query`` with a BM25 index.

    Args:
        query: Query string; it is tokenized by whitespace.
        bm25: Object exposing ``get_scores(tokens)`` that returns one score
            per document as a NumPy array.
        tokenized_docs: Accepted for interface compatibility; not used here.
        top_k: Number of best-scoring documents to return.

    Returns:
        ``(indices, scores)``: document positions ordered from highest to
        lowest score, and the matching scores.
    """
    all_scores = bm25.get_scores(query.split())
    ascending = np.argsort(all_scores)
    # Keep the last top_k (highest-scoring) entries, then flip to best-first.
    best_first = ascending[-top_k:]
    best_first = best_first[::-1]
    return best_first, all_scores[best_first]

# Dense Retrieval
def dense_retrieve(query_embedding, doc_embeddings, top_k=5):
    """Rank documents by cosine similarity to a query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        doc_embeddings: Document embeddings, one row per document.
        top_k: Number of most similar documents to return.

    Returns:
        ``(indices, similarities)``: row positions in ``doc_embeddings``
        ordered from most to least similar, and the matching similarities.
    """
    sims = cosine_similarity([query_embedding], doc_embeddings)[0]
    ascending = np.argsort(sims)
    # Keep the last top_k (most similar) entries, then flip to best-first.
    best_first = ascending[-top_k:]
    best_first = best_first[::-1]
    return best_first, sims[best_first]
# Rank Fusion Retrieval
def _min_max_normalize(scores):
    """Rescale ``scores`` linearly to [0, 1]; all zeros when every score is equal."""
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        # A constant score vector carries no ranking signal. Returning zeros
        # avoids the 0/0 division that would fill the result with NaN.
        return np.zeros_like(scores)
    return (scores - lowest) / spread


def fusion_retrieve(dense_query_embedding, doc_embeddings, query, top_k=5, alpha=0.5):
    """Score fusion of BM25 and dense retrieval, reusing the existing retrievers.

    Both retrievers score every document; each score vector is min-max
    normalized and the two are combined as
    ``alpha * dense + (1 - alpha) * sparse``.

    Reads the module-level ``bm25`` and ``tokenized_docs``.

    Args:
        dense_query_embedding: Embedding vector of the query.
        doc_embeddings: Document embeddings, one row per document.
        query: Query string for BM25.
        top_k: Number of documents to return.
        alpha: Weight of the dense score; ``1 - alpha`` weights the BM25 score.

    Returns:
        ``(indices, scores)``: document positions ordered from highest to
        lowest combined score, and the matching combined scores.
    """
    n_docs = len(doc_embeddings)
    sparse_indices, sparse_scores = bm25_retrieve(query, bm25, tokenized_docs, top_k=n_docs)
    dense_indices, dense_scores = dense_retrieve(dense_query_embedding, doc_embeddings, top_k=n_docs)

    # Scatter the ranked scores back to document order so both vectors align.
    all_sparse_scores = np.zeros(n_docs)
    all_dense_scores = np.zeros(n_docs)
    all_sparse_scores[sparse_indices] = sparse_scores
    all_dense_scores[dense_indices] = dense_scores

    # Normalize the scores. This is safe when a retriever gives every document
    # the same score, e.g. BM25 when no query term occurs in the corpus.
    all_sparse_scores = _min_max_normalize(all_sparse_scores)
    all_dense_scores = _min_max_normalize(all_dense_scores)

    # Combine the scores
    combined_scores = alpha * all_dense_scores + (1 - alpha) * all_sparse_scores

    # Take the top-k results, best first
    top_k_indices = np.argsort(combined_scores)[-top_k:][::-1]
    return top_k_indices, combined_scores[top_k_indices]

# Cascading Retrieval
def cascade_retrieve(dense_query_embedding, doc_embeddings, query, initial_k=100, final_k=5, dense_threshold=0.7):
    """Two-stage retrieval: BM25 candidate selection, then dense re-ranking.

    Reads the module-level ``bm25`` and ``tokenized_docs``.

    Args:
        dense_query_embedding: Embedding vector of the query.
        doc_embeddings: Document embeddings, one row per document (must
            support indexing with an integer index array).
        query: Query string for BM25.
        initial_k: Number of BM25 candidates passed to the dense stage.
        final_k: Number of documents to return.
        dense_threshold: Kept for interface compatibility. The selection rule
            is "the best ``final_k`` candidates above the threshold, falling
            back to the best ``final_k`` overall when too few qualify"; both
            cases pick the ``final_k`` highest dense scores, so the threshold
            does not change the result.

    Returns:
        ``(indices, scores)``: positions in the original corpus ordered from
        highest to lowest dense similarity, and the matching similarities.
    """
    # Stage 1: use BM25 to get the initial candidates
    initial_indices, _ = bm25_retrieve(query, bm25, tokenized_docs, top_k=initial_k)
    if len(initial_indices) == 0:
        return initial_indices, np.array([])

    # Stage 2: re-rank the candidates with dense retrieval.
    # dense_retrieve returns scores sorted best-first together with the
    # candidate positions they belong to. That order must be kept: without it
    # the sorted scores would be paired with the BM25 ordering instead of the
    # documents they were computed for.
    candidate_embeddings = doc_embeddings[initial_indices]
    candidate_order, dense_scores = dense_retrieve(
        dense_query_embedding, candidate_embeddings, top_k=len(initial_indices)
    )

    # Map candidate positions back to positions in the original corpus
    reranked_indices = initial_indices[candidate_order]

    final_indices = reranked_indices[:final_k]
    final_scores = dense_scores[:final_k]

    return final_indices, final_scores

# Build the BM25 index over the whitespace-tokenized abstracts. fusion_retrieve
# and cascade_retrieve read this module-level `bm25` when they are called.
bm25 = BM25Okapi(tokenized_docs)

# Run retrieval experiments
def _format_results(indices, scores):
    """Pair each retrieved document's doc_id with its score as JSON-serializable dicts."""
    return [{"doc_id": all_docs[idx]['doc_id'], "score": float(score)}
            for idx, score in zip(indices, scores)]


def run_retrieval_experiments():
    """Run all four retrieval strategies on every query and save the results.

    Reads the module-level ``all_queries``, ``query_embeddings``,
    ``doc_embeddings``, ``all_docs``, ``bm25`` and ``tokenized_docs``.
    Writes the results to ``retrieval_results.json`` in the working directory.

    Returns:
        Dict with keys "sparse", "dense", "rank_fusion" and "cascade"; each
        maps to a list with one ``{"query": ..., "results": [...]}`` entry per
        query, where every result holds a ``doc_id`` and a float ``score``.

    Raises:
        ValueError: if the number of query embeddings differs from the number
            of queries (zip would otherwise silently drop the surplus).
    """
    if len(all_queries) != len(query_embeddings):
        raise ValueError(
            f"Got {len(query_embeddings)} query embeddings for {len(all_queries)} queries"
        )

    results = {
        "sparse": [],
        "dense": [],
        "rank_fusion": [],
        "cascade": []
    }
    print("Running retrieval experiments on all queries...")

    for query, query_embedding in tqdm(zip(all_queries, query_embeddings), total=len(all_queries)):
        query_text = query['title'] if isinstance(query, dict) else query

        # One (indices, scores) pair per strategy, keyed like `results`.
        retrieved = {
            "sparse": bm25_retrieve(query_text, bm25, tokenized_docs),
            "dense": dense_retrieve(query_embedding, doc_embeddings),
            "rank_fusion": fusion_retrieve(query_embedding, doc_embeddings, query_text),
            "cascade": cascade_retrieve(query_embedding, doc_embeddings, query_text),
        }
        for strategy, (indices, scores) in retrieved.items():
            results[strategy].append(
                {"query": query, "results": _format_results(indices, scores)}
            )

    print("Saving results...")
    with open("retrieval_results.json", "w") as f:
        json.dump(results, f, indent=4)
    print("Retrieval results saved to retrieval_results.json")

    return results

# Bind the return value so the notebook does not echo the whole results dict
# as the cell's output.
retrieval_results = run_retrieval_experiments()

In [None]:
# SECTION 4: QA WITH LANGUAGE MODEL
# QA for one query, comparing the answers produced with different retrieved contexts.
QUERY_INDEX = 3  # 1-based position of the query in all_queries
query_position = QUERY_INDEX - 1
query = all_queries[query_position]
query_text = query['title'] if isinstance(query, dict) else query
# The embedding must come from the same position as the query text; indexing
# with QUERY_INDEX directly would pair this text with the next query's embedding.
query_embedding = query_embeddings[query_position]

# One temperature for every run so the retrieval strategy is the only variable
# between the generated answers.
GENERATION_TEMPERATURE = 0.1


def _build_prompt(question, context_docs=None):
    """Build the QA prompt, optionally preceded by the retrieved documents."""
    question_part = f"Question:\n{question}\n\nAnswer:"
    if context_docs is None:
        return question_part
    context = "\n".join(context_docs)
    return f"Context:\n{context}\n\n{question_part}"


def _generate(prompt):
    """Run the text-generation pipeline on ``prompt`` and return the generated text."""
    return lm_pipeline(prompt,
                       max_new_tokens=150,
                       temperature=GENERATION_TEMPERATURE,
                       truncation=False)[0]["generated_text"]


# Retrieval calls
dense_top_k_indices, dense_top_k_scores = dense_retrieve(query_embedding, doc_embeddings)
rank_top_k_indices, rank_top_k_scores = fusion_retrieve(
    query_embedding,
    doc_embeddings,
    query_text
)
cascading_top_k_indices, cascading_top_k_scores = cascade_retrieve(
    query_embedding,
    doc_embeddings,
    query_text
)

# Get retrieved documents
dense_retrieved_docs = [all_docs[idx]['abstract'] for idx in dense_top_k_indices]
rank_retrieved_docs = [all_docs[idx]['abstract'] for idx in rank_top_k_indices]
cascading_retrieved_docs = [all_docs[idx]['abstract'] for idx in cascading_top_k_indices]

# Load the language model once and reuse it for every prompt
lm_pipeline = pipeline("text-generation",
                      model="meta-llama/Llama-3.2-1B",
                      device=0 if device == "cuda" else -1)

# --- QUESTION-ANSWER USING DENSE
# The prompt carries the query text, not the repr of the query dict.
prompt = _build_prompt(query_text, dense_retrieved_docs)
print(f"Length of the prompt: {len(prompt.split())} words")

# Generate response
response = _generate(prompt)
print("Generated Response:")
print(response)

# ---- QUESTION-ANSWER WITH NO CONTEXT PROVIDED WITH RAG----
prompt = _build_prompt(query_text)
response = _generate(prompt)
print("\n\n NO CONTEXT")
# The generated text echoes the prompt; keep only what follows "Answer:".
response = response.split("Answer:")[1].strip()
print(response)
print("fine")

# ---- QUESTION-ANSWER USING RANK FUSION ----
print("\n\n RANK FUSION CONTEXT")
# Same prompt template as the other runs, including the "Answer:" cue.
prompt = _build_prompt(query_text, rank_retrieved_docs)
print(f"Length of the prompt: {len(prompt.split())} words")

# Generate response
response = _generate(prompt)
print("Generated Response:")
print(response)
print("fine")

# ---- QUESTION-ANSWER USING CASCADING RETRIEVAL ----
print("\n\n Generating response with cascading context")
prompt = _build_prompt(query_text, cascading_retrieved_docs)
print(f"Length of the prompt: {len(prompt.split())} words")

# Generate response
response = _generate(prompt)
print("Generated Response:")
print(response)
print("fine")