In [1]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keybert
Successfully installed keybert-0.9.0


In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# path used by the main cell points below /content/drive/MyDrive.
# Re-run this cell and follow the interactive prompts to authenticate
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import os
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from transformers import pipeline
import nltk
import re
# Fetch the NLTK "punkt" tokenizer data when this cell runs.
# NOTE(review): no visible cell calls nltk afterwards - confirm this download is still needed.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# ============================
# STEP 1: Load Dataset
# ============================
def load_arxiv_data(file_path, max_entries=200000):
    """Load paper metadata from a JSON-lines file (one JSON object per line).

    Args:
        file_path: Path to the JSON-lines file.
        max_entries: Maximum number of lines to read from the start of the
            file. Lines that end up skipped (blank, or missing a title or
            abstract) still count towards this limit. ``None`` reads the
            whole file.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``authors``,
        ``id`` and ``categories``. Records without a title or an abstract are
        dropped; the other three fields default to ``'N/A'`` when absent.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), total=max_entries, desc="Loading data"):
            if max_entries is not None and i >= max_entries:
                break
            # A blank line (e.g. a trailing newline at end of file) is not a
            # record; json.loads would raise on it.
            if not line.strip():
                continue
            paper = json.loads(line)
            if 'title' in paper and 'abstract' in paper:
                data.append({
                    'title': paper['title'],
                    'abstract': paper['abstract'],
                    'authors': paper.get('authors', 'N/A'),
                    'id': paper.get('id', 'N/A'),
                    'categories': paper.get('categories', 'N/A')
                })
    return data


In [5]:

# ============================
# STEP 2: Preprocess & Embed
# ============================
def embed_texts(papers, model):
    """Encode every paper as a single "<title>. <abstract>" document.

    Args:
        papers: Iterable of dicts providing ``'title'`` and ``'abstract'``.
        model: Object exposing ``encode(texts, show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` returns for the list of documents, which is
        built in the same order as ``papers``.
    """
    documents = []
    for entry in papers:
        # Title and abstract are joined so one vector represents both.
        documents.append(". ".join((entry['title'], entry['abstract'])))
    return model.encode(documents, show_progress_bar=True)


In [6]:

# ============================
# STEP 3: Query Cleaning
# ============================
def clean_text(text):
    """Normalise a free-text query before it is embedded.

    Apostrophes are removed, every other run of characters outside
    ``[a-zA-Z0-9]`` (punctuation, symbols, whitespace) is replaced by a single
    space, and the result is stripped and lower-cased.

    Punctuation is replaced by a space rather than deleted so that adjacent
    words stay separate: "state-of-the-art" becomes "state of the art",
    not "stateoftheart".

    Args:
        text: The raw query string.

    Returns:
        The normalised query; an empty string if nothing alphanumeric remains.
    """
    # Drop apostrophes first so contractions/possessives stay one token.
    text = re.sub("['\u2019]", "", text)
    # Any other non-alphanumeric run acts as a word boundary.
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    return text.strip().lower()


In [7]:

# ============================
# STEP 4: Search Function
# ============================
def search_papers(query, papers, paper_embeddings, model, top_k=5):
    """Return the papers whose embeddings are most similar to the query.

    Args:
        query: Free-text query; normalised with ``clean_text`` before encoding.
        papers: Sequence of paper dicts, index-aligned with
            ``paper_embeddings`` (row ``i`` belongs to ``papers[i]``).
        paper_embeddings: Embeddings of ``papers`` as accepted by
            ``cosine_similarity``.
        model: Object exposing ``encode(list_of_texts)``; used to embed the
            query.
        top_k: Maximum number of results. ``None`` returns every paper.

    Returns:
        A list of result dicts ordered by decreasing cosine similarity, each
        with ``title``, ``abstract``, ``similarity`` (float), ``authors``,
        ``categories``, ``id`` and ``keywords`` (``[]`` when the paper has
        none). The list is empty when the cleaned query is empty, the corpus
        is empty, or ``top_k`` is not positive.
    """
    query = clean_text(query)
    # Nothing meaningful to rank: a query with no usable text, no corpus, or a
    # non-positive top_k (a negative value would otherwise slice "all but the
    # last k" results).
    if not query or len(papers) == 0 or (top_k is not None and top_k <= 0):
        return []

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, paper_embeddings)[0]
    # argsort is ascending, so reverse it before taking the best top_k.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for idx in top_indices:
        paper = papers[idx]
        results.append({
            'title': paper['title'],
            'abstract': paper['abstract'],
            'similarity': float(similarities[idx]),
            'authors': paper.get('authors', 'N/A'),
            'categories': paper.get('categories', 'N/A'),
            # Identifier and keywords let a caller locate/describe the paper.
            'id': paper.get('id', 'N/A'),
            'keywords': paper.get('keywords', []),
        })
    return results


In [8]:

# ============================
# STEP 5: Keyword Extraction
# ============================
def extract_keywords(papers, kw_model=None, top_n=5):
    """Attach extracted keywords to every paper, in place.

    Args:
        papers: Iterable of paper dicts. Each one gains a ``'keywords'`` entry
            holding a list of keyword strings.
        kw_model: Object exposing ``extract_keywords(text, top_n=...)`` that
            returns ``(keyword, score)`` pairs. Defaults to a freshly
            constructed ``KeyBERT()``; pass an existing instance to avoid
            building a new one on every call.
        top_n: Maximum number of keywords requested per paper.

    Returns:
        The same ``papers`` object that was passed in.
    """
    if kw_model is None:
        kw_model = KeyBERT()
    # One model call per paper; the progress bar makes long runs observable.
    for paper in tqdm(papers, desc="Extracting keywords"):
        abstract = paper.get('abstract') or ""
        if not abstract.strip():
            # Nothing to extract from; skip the model call entirely.
            paper['keywords'] = []
            continue
        keywords = kw_model.extract_keywords(abstract, top_n=top_n)
        # Keep only the keyword text, dropping the score.
        paper['keywords'] = [kw[0] for kw in keywords]
    return papers


In [9]:

# ============================
# MAIN EXECUTION
# ============================
if __name__ == "__main__":
    # Path to the arXiv metadata file, read as JSON lines (one record per line).
    dataset_path = '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'  # <-- CHANGE THIS TO YOUR FILE

    print("Loading dataset...")
    papers = load_arxiv_data(dataset_path, max_entries=200000)

    print("Generating embeddings...")
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embed_texts(papers, embed_model)

    print("Extracting keywords")
    papers = extract_keywords(papers)

    # Interactive search loop: runs until the user types 'exit' or the input
    # stream ends / the cell is interrupted.
    while True:
        try:
            query = input("\nEnter your research query (or 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # No more input, or the user interrupted the prompt: leave quietly
            # instead of surfacing a traceback.
            break
        # Tolerate surrounding whitespace so "exit " also quits.
        query = query.strip()
        if query.lower() == "exit":
            break
        if not query:
            # An empty line carries nothing to search for; ask again.
            continue
        results = search_papers(query, papers, embeddings, embed_model, top_k=5)
        for i, res in enumerate(results):
            print(f"\nResult {i+1}:")
            print(f"Title: {res['title']}")
            print(f"Authors: {res['authors']}")
            print(f"Categories: {res['categories']}")
            print(f"Similarity Score: {res['similarity']:.4f}")
            print(f"Abstract: {res['abstract']}")
            # Printed only when a result carries a 'summary' entry.
            if 'summary' in res:
                print(f"Summary: {res['summary']}")


Loading dataset...


Loading data: 100%|██████████| 200000/200000 [00:08<00:00, 24816.18it/s]


Generating embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6250 [00:00<?, ?it/s]

Extracting keywords

Enter your research query (or 'exit' to quit): optimization 

Result 1:
Title: An ellipsoidal branch and bound algorithm for global optimization
Authors: William Hager, Dzung Phan
Categories: math.OC
Similarity Score: 0.5667
Abstract:   A branch and bound algorithm is developed for global optimization. Branching
in the algorithm is accomplished by subdividing the feasible set using
ellipses. Lower bounds are obtained by replacing the concave part of the
objective function by an affine underestimate. A ball approximation algorithm,
obtained by generalizing of a scheme of Lin and Han, is used to solve the
convex relaxation of the original problem. The ball approximation algorithm is
compared to SEDUMI as well as to gradient projection algorithms using randomly
generated test problems with a quadratic objective and ellipsoidal constraints.


Result 2:
Title: Apply Ant Colony Algorithm to Search All Extreme Points of Function
Authors: Chao-Yang Pang, Hui Liu, Xia Li, Y

In [12]:
import pandas as pd

# Build a DataFrame from the list of paper dicts and keep only the columns
# the export needs, in exactly this order.
df = pd.DataFrame(papers).loc[:, ["title", "authors", "keywords"]]

# Write the selection to disk without the positional index column.
df.to_csv("papers.csv", index=False)

print("✅ papers.csv file created successfully.")

✅ papers.csv file created successfully.
