<a href="https://colab.research.google.com/github/Lelouchlamperougexd/article_experiment/blob/main/new_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install datasets pandas numpy torch tqdm faiss-cpu sentence-transformers transformers openpyxl kaggle

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import faiss
from sentence_transformers import SentenceTransformer
import json
import os

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.

In [2]:
# Set up Kaggle API and download dataset
print("Downloading ArXiv dataset from Kaggle...")
os.environ["KAGGLE_CONFIG_DIR"] = "/content"

# Download dataset
!kaggle datasets download -d cornell-university/arxiv --unzip

# Path to dataset file
FILE_PATH = "arxiv-metadata-oai-snapshot.json"

Downloading ArXiv dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/cornell-university/arxiv
License(s): CC0-1.0
Downloading arxiv.zip to /content
 98% 1.38G/1.40G [00:08<00:00, 202MB/s]
100% 1.40G/1.40G [00:08<00:00, 180MB/s]


In [3]:
# Load dataset from JSON
ARXIV_COLUMNS = ['id', 'title', 'abstract', 'categories', 'update_date']


def load_arxiv_metadata(path, columns=ARXIV_COLUMNS):
    """Read a JSON-lines metadata file, keeping only ``columns``.

    Each non-blank line is parsed as one JSON object. Only the requested keys
    are retained per record (a missing key becomes ``None``), so fields that
    are never used downstream do not accumulate in memory.

    Args:
        path: Path to the JSON-lines file.
        columns: Keys to keep from every record.

    Returns:
        pandas.DataFrame with exactly ``columns``, one row per record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing the whole load
            record = json.loads(line)
            records.append({col: record.get(col) for col in columns})
    return pd.DataFrame(records, columns=list(columns))


print("Loading ArXiv dataset...")
try:
    # Keep only necessary columns
    docs_arxiv_df = load_arxiv_metadata(FILE_PATH).rename(columns={'id': 'doc_id'})

    # Process full text; a missing title or abstract contributes an empty
    # string rather than turning the whole value into NaN
    docs_arxiv_df['full_text'] = (
        docs_arxiv_df['title'].fillna("") + ". " + docs_arxiv_df['abstract'].fillna("")
    )

    # Convert date
    docs_arxiv_df['update_date'] = pd.to_datetime(docs_arxiv_df['update_date'], errors='coerce')

    # Remove NaT dates
    docs_arxiv_df = docs_arxiv_df.dropna(subset=['update_date'])

    # Split into old (before 2020) and new (2021+); rows dated within 2020 fall
    # into neither set
    old_docs_df = docs_arxiv_df[docs_arxiv_df['update_date'] < '2020-01-01']
    new_docs_df = docs_arxiv_df[docs_arxiv_df['update_date'] >= '2021-01-01']

    print(f"Loaded {len(docs_arxiv_df)} total documents.")
    print(f"Filtered {len(old_docs_df)} old papers and {len(new_docs_df)} new papers.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Fall back to two independent empty frames that still carry the expected
    # columns, so later column access does not fail with KeyError
    _fallback_columns = ['doc_id', 'title', 'abstract', 'categories', 'update_date', 'full_text']
    old_docs_df = pd.DataFrame(columns=_fallback_columns)
    new_docs_df = pd.DataFrame(columns=_fallback_columns)


Loading ArXiv dataset...
Loaded 2677526 total documents.
Filtered 1536774 old papers and 966884 new papers.


In [None]:
# Create FAISS index for old papers
print("Creating FAISS index for old papers...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Compact transformer model
old_texts = old_docs_df['full_text'].tolist()
old_doc_ids = old_docs_df['doc_id'].tolist()

# Fail early with a clear message instead of an opaque shape/stack error below
if not old_texts:
    raise ValueError("No old papers to index; check that the dataset loaded correctly.")

# Encode documents into embeddings. Requesting NumPy output avoids building one
# corpus-sized tensor that would only be copied to NumPy afterwards.
old_embeddings = model.encode(
    old_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)
# FAISS operates on C-contiguous float32 matrices
old_embeddings_np = np.ascontiguousarray(old_embeddings, dtype=np.float32)

# Normalize for cosine similarity (inner product of unit vectors)
faiss.normalize_L2(old_embeddings_np)
index = faiss.IndexFlatIP(old_embeddings_np.shape[1])
index.add(old_embeddings_np)

Creating FAISS index for old papers...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Function to determine novelty

def evaluate_novelty(new_text, top_k=5):
    """Evaluate novelty of a new article by comparing with old papers.

    Encodes ``new_text`` with the module-level ``model``, L2-normalizes the
    embedding, and searches the module-level FAISS ``index`` for the ``top_k``
    highest inner-product matches.

    Args:
        new_text: Text of the article to score.
        top_k: Number of nearest indexed papers to average over.

    Returns:
        ``1 - mean(similarity of the matches found)``; a higher score means
        more novel. NaN when the search returns no matches at all.
    """
    new_embedding = model.encode([new_text], convert_to_numpy=True)
    # FAISS operates on C-contiguous float32 matrices
    new_embedding = np.ascontiguousarray(new_embedding, dtype=np.float32)
    faiss.normalize_L2(new_embedding)
    scores, indices = index.search(new_embedding, k=top_k)

    # When fewer than top_k neighbours exist, FAISS pads the result with label
    # -1 and a sentinel score; averaging those would corrupt the similarity.
    found_scores = scores[indices >= 0]
    if found_scores.size == 0:
        return float("nan")

    avg_similarity = np.mean(found_scores)
    novelty_score = 1 - avg_similarity  # Higher score means more novel
    return novelty_score

In [None]:
# Score every new paper against the index of old papers
print("Evaluating novelty of new papers...")
novelty_results = []
paper_fields = zip(new_docs_df['doc_id'], new_docs_df['title'], new_docs_df['full_text'])
for paper_id, paper_title, paper_text in tqdm(paper_fields, total=len(new_docs_df)):
    # One evaluate_novelty call per paper
    novelty_score = evaluate_novelty(paper_text)
    novelty_results.append(
        {
            'doc_id': paper_id,
            'title': paper_title,
            'novelty_score': novelty_score,
        }
    )

In [None]:
# Convert to DataFrame and sort by novelty.
# Naming the columns explicitly keeps sort_values from raising KeyError when
# novelty_results is empty (an empty list alone yields a frame with no columns).
novelty_df = pd.DataFrame(
    novelty_results,
    columns=['doc_id', 'title', 'novelty_score'],
).sort_values(by='novelty_score', ascending=False)
print("Top 10 most novel papers:")
print(novelty_df.head(10))

In [None]:
# Write the ranked novelty scores to a CSV file (row index omitted)
novelty_df.to_csv(
    "novelty_results.csv",
    index=False,
)
print("Saved novelty results to novelty_results.csv")