In [1]:
import json
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Load Data ---
# Both inputs are decoded as UTF-8 explicitly. Without an encoding argument,
# open() falls back to the platform's locale encoding, which can mis-decode
# or reject non-ASCII characters in the JSON text on non-UTF-8 systems.
with open("task4b_train.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("task4b_papers.json", encoding="utf-8") as f:
    papers_data = json.load(f)

# --- Prepare Paper Documents ---
# paper_ids and doc_texts are parallel lists: doc_texts[i] is the lowercased
# "title abstract" text of the paper whose cord_uid is paper_ids[i].
paper_ids = []
doc_texts = []

for paper in papers_data:
    paper_ids.append(paper["cord_uid"])
    # dict.get's default only applies when the key is absent. A key that is
    # present but holds null/None (or any other non-string value) would make
    # the string concatenation below raise TypeError, so treat it as empty.
    title = paper.get("title")
    abstract = paper.get("abstract")
    if not isinstance(title, str):
        title = ""
    if not isinstance(abstract, str):
        abstract = ""
    text = title + " " + abstract
    doc_texts.append(text.lower())

# --- TF-IDF Vectorization ---
# Fit the vocabulary and IDF weights on the paper texts, dropping English
# stop words. fit_transform returns the weighted matrix for doc_texts, and
# the fitted vectorizer is kept so other text can be mapped into the same
# feature space with vectorizer.transform.
vectorizer = TfidfVectorizer(stop_words="english")
doc_matrix = vectorizer.fit_transform(doc_texts)

# --- Retrieve Top-5 Documents per Tweet ---
# For each tweet: map its lowercased text into the fitted TF-IDF space, score
# it against doc_matrix, and record the ids of the five best-scoring papers.
results = []

for entry in tqdm(train_data):
    tweet_id = entry["id"]
    query = entry["tweet"].lower()

    scores = cosine_similarity(vectorizer.transform([query]), doc_matrix).flatten()

    # argsort orders ascending, so reverse it to get best-first order before
    # truncating to the first five positions.
    ranked = np.argsort(scores)[::-1]
    best_ids = [paper_ids[pos] for pos in ranked[:5]]

    results.append({"id": tweet_id, "doc_ids": best_ids})

# --- Save Submission File ---
# Serialize the list of {"id", "doc_ids"} records as indented JSON, then
# report the output filename on stdout.
with open("submission_tfidf.json", mode="w") as out_file:
    json.dump(results, out_file, indent=2)

print("✅ TF-IDF submission saved as submission_tfidf.json")


FileNotFoundError: [Errno 2] No such file or directory: 'task4b_train.json'