In [17]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# Re-run this cell and follow the interactive prompts to authenticate.
# If the drive is already mounted, drive.mount() only reports that; pass
# force_remount=True to force a remount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import json
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Step 1: Load the dataset
def load_arxiv_data(file_path, max_entries=20000):
    """Load paper metadata from a JSON Lines file (one JSON object per line).

    Args:
        file_path: Path to the JSON Lines metadata file.
        max_entries: Maximum number of lines to read from the start of the
            file. This caps lines *read*, not records *returned*: blank lines
            and records lacking 'title' or 'abstract' count toward the limit
            but are left out of the result.

    Returns:
        A list of dicts with the keys 'title', 'abstract', 'authors', 'id'
        and 'categories'. The last three fall back to 'N/A' when the source
        record does not have them.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # default, so non-ASCII titles/abstracts load the same way everywhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), total=max_entries, desc="Loading data"):
            if i >= max_entries:
                break
            line = line.strip()
            if not line:
                # A blank line (e.g. a stray trailing newline) is not a record;
                # json.loads would raise on it.
                continue
            paper = json.loads(line)
            if 'title' in paper and 'abstract' in paper:
                data.append({
                    'title': paper['title'],
                    'abstract': paper['abstract'],
                    'authors': paper.get('authors', 'N/A'),
                    'id': paper.get('id', 'N/A'),
                    'categories': paper.get('categories', 'N/A')
                })

    return data

In [25]:
# Step 2: Prepare text corpus
def prepare_corpus(data):
    """Build the text corpus: one "<title> <abstract>" string per entry.

    Args:
        data: Iterable of dicts, each with string 'title' and 'abstract' values.

    Returns:
        A list of strings in the same order as ``data``.
    """
    corpus = []
    for record in data:
        # Title and abstract are joined by a single space.
        corpus.append(record['title'] + " " + record['abstract'])
    return corpus

# Step 3: Embed the corpus
def get_embeddings(corpus, model_name='all-MiniLM-L6-v2'):
    """Encode the corpus with a SentenceTransformer model.

    Args:
        corpus: The texts to embed.
        model_name: Name passed to ``SentenceTransformer`` to select the model.

    Returns:
        A ``(embeddings, model)`` tuple: the result of encoding ``corpus`` and
        the model instance that produced it, so callers can reuse the model
        for encoding queries.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(corpus, show_progress_bar=True), encoder



In [26]:
# Step 4: Semantic search
def search(query, corpus_embeddings, model, data, top_k=5):
    """Print the papers whose embeddings are most similar to the query.

    Args:
        query: Free-text search string.
        corpus_embeddings: Embeddings of the corpus, one row per entry of
            ``data`` and in the same order.
        model: Object with an ``encode(list_of_texts)`` method, used to embed
            the query.
        data: List of paper dicts with the keys 'title', 'authors',
            'categories', 'abstract' and 'id'.
        top_k: Maximum number of results to print. Values <= 0 print no
            results; values larger than the corpus print every paper.

    Returns:
        None. Results are written to stdout, best match first.
    """
    if top_k <= 0 or len(data) == 0:
        # Nothing can be shown. Bail out early: a [-0:] slice would select
        # *every* index, and an empty corpus cannot be scored at all.
        print("\n🔍 Top Results:")
        print("No results.")
        return

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
    # Descending order of similarity, then keep the first top_k. Slicing from
    # the front avoids the negative-zero slice pitfall of argsort()[-top_k:].
    top_indices = similarities.argsort()[::-1][:top_k]

    print("\n🔍 Top Results:")
    for idx in top_indices:
        paper = data[idx]
        print("\n-------------------------------")
        print(f"📄 Title: {paper['title']}")
        print(f"👨‍🔬 Authors: {paper['authors']}")
        print(f"🏷 Categories: {paper['categories']}")
        # Only the first 500 characters of the abstract are shown.
        print(f"📑 Abstract: {paper['abstract'][:500]}...")
        print(f"🔗 ID: {paper['id']}")


In [27]:
# Main Function
def main(json_path='/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json',
         max_entries=10000):
    """Run the interactive semantic-search loop over arXiv metadata.

    Loads up to ``max_entries`` lines from the metadata file, embeds each
    paper's title and abstract, then repeatedly prompts for a query and prints
    the best matches until the user types 'exit' (or input ends).

    Args:
        json_path: Location of the arXiv metadata JSON Lines file.
        max_entries: Maximum number of lines to read from the file.

    Returns:
        None. Returns early, after printing a message, if ``json_path`` does
        not exist.
    """
    # Load dataset
    if not os.path.exists(json_path):
        # Name the path that was actually checked so the user knows where the
        # file is expected.
        print(f"Dataset file not found. Please place the arXiv metadata snapshot at '{json_path}'.")
        return

    print("🔄 Loading and preparing data...")
    data = load_arxiv_data(json_path, max_entries=max_entries)
    corpus = prepare_corpus(data)

    print("🧠 Creating embeddings...")
    corpus_embeddings, model = get_embeddings(corpus)

    # Search loop
    while True:
        try:
            query = input("\nEnter your search query (or type 'exit' to quit): ").strip()
        except EOFError:
            # No more input available (e.g. non-interactive run): stop cleanly.
            break
        if query.lower() == 'exit':
            break
        if not query:
            # A blank query carries no meaning to search for; prompt again.
            continue
        search(query, corpus_embeddings, model, data)

if __name__ == "__main__":
    main()

🔄 Loading and preparing data...


Loading data: 100%|██████████| 10000/10000 [00:00<00:00, 48032.89it/s]


🧠 Creating embeddings...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]


Enter your search query (or type 'exit' to quit): deep learning in healthcare

🔍 Top Results:

-------------------------------
📄 Title: Multi-Dimensional Recurrent Neural Networks
👨‍🔬 Authors: Alex Graves, Santiago Fernandez, Juergen Schmidhuber
🏷 Categories: cs.AI cs.CV
📑 Abstract:   Recurrent neural networks (RNNs) have proved effective at one dimensional
sequence learning tasks, such as speech and online handwriting recognition.
Some of the properties that make RNNs suitable for such tasks, for example
robustness to input warping, and the ability to access contextual information,
are also desirable in multidimensional domains. However, there has so far been
no direct way of applying RNNs to data with more than one spatio-temporal
dimension. This paper introduces multi-di...
🔗 ID: 0705.2011

-------------------------------
📄 Title: Automatic Detection of Pulmonary Embolism using Computational
  Intelligence
👨‍🔬 Authors: Simon Scurrell, Tshilidzi Marwala and David Rubin
🏷 Categories: