In [None]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from annoy import AnnoyIndex
import faiss
import json

# Read the metadata snapshot, which stores one JSON object per line
file_path = 'arxiv-metadata-oai-snapshot.json'  # Replace with your actual file path

data = []
with open(file_path, 'r', encoding='utf-8') as snapshot:
    for raw_line in snapshot:
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            # Best-effort load: lines that are not valid JSON are skipped
            continue
        data.append(record)

# One DataFrame row per successfully parsed record
df = pd.DataFrame(data)

# Clean the abstract text
def clean_text(text):
    """Normalise a piece of abstract text for embedding.

    Removes lowercase LaTeX-style commands together with their braced
    argument (e.g. ``\\emph{...}``), strips every character that is neither
    a word character nor whitespace, and lowercases what remains.

    Args:
        text: The raw text. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned, lowercased string, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        without_commands = re.sub(r'\\[a-z]+\{.*?\}', '', text)
        without_punctuation = re.sub(r'[^\w\s]', '', without_commands)
        return without_punctuation.lower()
    return ""

# Store a normalised copy of each abstract next to the raw text
df['cleaned_abstract'] = df['abstract'].map(clean_text)

# SciBERT checkpoint: tokenizer plus encoder, with the encoder switched to
# evaluation mode because it is only used for inference here
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()

# Generate embeddings (batch-safe)
def get_embeddings(texts, batch_size=8):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Each text is represented by the mean of its token vectors from the
    model's last hidden layer. The mean is weighted by the attention mask so
    that padding tokens do not contribute; a text's embedding therefore does
    not depend on which other texts share its batch.

    Args:
        texts: Sequence of strings to embed. Inputs longer than 512 tokens
            are truncated by the tokenizer.
        batch_size: Number of texts tokenized and passed through the model
            at a time.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input the
        result has shape ``(0, model.config.hidden_size)``.
    """
    # len() rather than truthiness so array-like inputs are handled too;
    # np.vstack would raise on an empty list of batches.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run the inputs on whatever device the model currently lives on.
    device = next(model.parameters()).device

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
        encoded = encoded.to(device)
        with torch.no_grad():
            outputs = model(**encoded)

        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; trailing axis added so the mask
        # broadcasts across the hidden dimension.
        mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled = token_sum / token_count

        # .cpu() is required before .numpy() when the model is on an accelerator.
        all_embeddings.append(pooled.cpu().numpy())
    return np.vstack(all_embeddings)

# Limit to 1000 entries for performance (optional).
# clean_text() turns a missing abstract into "" rather than NaN, so dropna()
# on the cleaned column would not remove such rows. Filter on blank text
# instead: an empty string gives the model nothing meaningful to embed.
has_text = df['cleaned_abstract'].fillna('').str.strip().ne('')
sample_df = df[has_text].head(1000).reset_index(drop=True)
abstracts = sample_df['cleaned_abstract'].tolist()
embeddings = get_embeddings(abstracts)

# Both indexes share the embedding width
embedding_dim = embeddings.shape[1]

# Faiss flat L2 index over every embedding (Faiss expects float32 input)
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embeddings.astype(np.float32))

# Annoy index using angular distance; item ids are the row positions in
# `embeddings`, and the forest is built with 10 trees
annoy_index = AnnoyIndex(embedding_dim, 'angular')
for i, emb in enumerate(embeddings):
    annoy_index.add_item(i, emb)
annoy_index.build(10)

# Query
query = "Differential calculation in quantum chromodynamics for photon pairs"
query_embedding = get_embeddings([clean_text(query)])

# Step 1: broad retrieval - approximate nearest neighbours from the Annoy index
annoy_results = annoy_index.get_nns_by_vector(query_embedding[0], 5)
print("LSH Broad Retrieval (Annoy) Indices:", annoy_results)

# Step 2: Faiss fine ranking within the filtered results.
# The candidate vectors are added to a small exact index, and the QUERY vector
# is searched against it. An index that holds no vectors can only return -1
# placeholders, so the candidates must be added before searching.
final_indices = []
if annoy_results:
    filtered_embeddings = embeddings[annoy_results].astype(np.float32)
    rerank_index = faiss.IndexFlatL2(embeddings.shape[1])
    rerank_index.add(filtered_embeddings)
    _, fine_indices = rerank_index.search(
        query_embedding.astype(np.float32), len(annoy_results)
    )

    # Map positions within the candidate list back to original paper indices.
    # Faiss pads missing neighbours with -1; skip those so they are not
    # misread as "last element" by Python's negative indexing.
    final_indices = [annoy_results[i] for i in fine_indices[0] if i >= 0]

recommended_titles = sample_df.iloc[final_indices]['title'].tolist()

print("\nTop Recommended Papers:")
for idx, title in enumerate(recommended_titles, 1):
    print(f"{idx}. {title}")