In [20]:
import arxiv
import datetime
import pandas as pd
import time

# Get the current year and compute the last 10 years
current_year = datetime.datetime.now().year
start_year = current_year - 10  # Get the year from 10 years ago
start_date = f"{start_year}0101"  # Format: YYYYMMDD
end_date = f"{current_year}1231"  # Format: YYYYMMDD

# Define search query with a date range for the last 10 years
query = f"(artificial intelligence OR deep learning OR machine learning OR transformers) AND submittedDate:[{start_date} TO {end_date}]"

# Initialize result list
papers = []

# Pagination settings
max_results_per_query = 100  # arXiv allows up to 100 results per request
total_results = 500  # Adjust as needed
start_index = 0  # Start from first result

while start_index < total_results:
    try:
        # Fetch papers
        search = arxiv.Search(
            query=query,
            max_results=max_results_per_query,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        # Process results
        batch_papers = []
        for result in search.results():
            batch_papers.append({
                "title": result.title,
                "abstract": result.summary,
                "published_date": result.published.date(),
                "categories": ", ".join(result.categories),
                "arxiv_url": result.entry_id
            })

        # Check if results are empty
        if not batch_papers:
            print(f"⚠️ No results found for start index {start_index}. Skipping.")
            break  # Exit loop if no results are returned

        papers.extend(batch_papers)
        print(f"✅ Retrieved {len(batch_papers)} papers (Start index: {start_index})")

        # Move to the next batch
        start_index += max_results_per_query
        time.sleep(2)  # Avoid hitting API rate limits

    except Exception as e:
        print(f"❌ Error at start index {start_index}: {e}")
        print("🔄 Retrying after a short delay...")
        time.sleep(5)  # Wait before retrying

# Save results to CSV
df = pd.DataFrame(papers)
df.to_csv("ai_research_papers_last_10_years.csv", index=False)

print(f"\n✅ Completed! Collected {len(df)} AI research papers from {start_year} to {current_year}.")

  for result in search.results():


✅ No more results found after 0 papers. Fetching complete!

✅ Completed! Collected 0 AI research papers and saved them to 'all_ai_research_papers.csv'.


In [16]:
df.head()  # Display the first few rows of the dataset

Unnamed: 0,title,abstract,published_date,categories,arxiv_url
0,Analyzing Interference from Static Cellular Co...,The problem of base station cooperation has re...,2015-01-30,"cs.IT, math.IT",http://arxiv.org/abs/1502.00033v1
1,SHOE: Supervised Hashing with Output Embeddings,We present a supervised binary encoding scheme...,2015-01-30,cs.CV,http://arxiv.org/abs/1502.00030v1
2,Unconventional phase selection in high-driven ...,Phase selection in deeply undercooled liquids ...,2015-01-30,cond-mat.mtrl-sci,http://arxiv.org/abs/1502.00023v1
3,Characterizing Transiting Planet Atmospheres t...,[Abridged] We have only been able to comprehen...,2015-01-30,astro-ph.EP,http://arxiv.org/abs/1502.00004v1
4,"Threshold corrections, generalised prepotentia...",We continue our study of one-loop integrals as...,2015-01-30,"hep-th, math.NT",http://arxiv.org/abs/1502.00007v2


In [28]:
import arxiv
import pandas as pd
import time

# Define the AI research query
query = "cat:cs.AI AND (artificial intelligence OR deep learning OR machine learning OR neural networks OR supervised learning OR unsupervised learning OR semi-supervised learning OR reinforcement learning OR self-supervised learning OR few-shot learning OR zero-shot learning OR contrastive learning OR multi-modal learning OR transfer learning OR generative models OR autoencoders OR diffusion models OR attention mechanisms OR representation learning OR adversarial learning OR optimization algorithms OR neurosymbolic AI OR causality in AI OR graph-based learning OR computational neuroscience OR meta-learning OR scalable AI OR knowledge distillation OR neural compression OR transformers OR large language models OR prompt engineering OR in-context learning OR LLMs OR GPT OR BERT OR T5 OR vision transformers OR graph neural networks OR diffusion models OR GANs OR VAE OR AI fairness OR explainable AI OR federated learning OR privacy-preserving AI OR quantum machine learning OR neuromorphic computing OR AI ethics OR human-AI collaboration OR robotics and AI OR autonomous vehicles OR AI for healthcare OR AI for finance OR AI for scientific discovery OR AI for cybersecurity OR adversarial robustness OR AI for business forecasting OR AI for industrial automation OR AI for climate change OR AI for social good OR trustworthy AI OR causal inference OR explainable reinforcement learning) AND submittedDate:[20150101 TO 20251231]"

# Initialize result storage
papers = []
batch_size = 100  # Maximum allowed per request
file_path = "all_ai_research_papers.csv"

# Initialize the arXiv API client
client = arxiv.Client()

# Function to save results progressively
def save_to_csv(data, file_path, first_save=False):
    df = pd.DataFrame(data)
    mode = 'w' if first_save else 'a'  # 'w' creates a new file, 'a' appends data
    header = first_save  # Only include headers if creating a new file
    df.to_csv(file_path, mode=mode, header=header, index=False)

# Fetch papers continuously until no more are available
first_save = True  # Flag to track first save operation

try:
    search = arxiv.Search(
        query=query,
        max_results=batch_size,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Fetch results using the updated method
    batch_papers = []
    for result in client.results(search):  # FIX: Use `Client.results()`
        batch_papers.append({
            "title": result.title,
            "abstract": result.summary,
            "published_date": result.published.date(),
            "categories": ", ".join(result.categories),
            "arxiv_url": result.entry_id
        })

    # Save batch to CSV
    if batch_papers:
        save_to_csv(batch_papers, file_path, first_save)
        papers.extend(batch_papers)
        print(f"✅ Retrieved {len(batch_papers)} papers (Total: {len(papers)})")
    else:
        print(f"✅ No more results found. Fetching complete!")

except Exception as e:
    print(f"❌ Error occurred: {e}")
    print("🔄 Retrying after a short delay...")
    time.sleep(5)

print(f"\n✅ Completed! Collected {len(papers)} AI research papers and saved them to '{file_path}'.")

✅ Retrieved 100 papers (Total: 100)

✅ Completed! Collected 100 AI research papers and saved them to 'all_ai_research_papers.csv'.


In [29]:
import arxiv
import pandas as pd
import time

# Define the AI research query
query = "cat:cs.AI AND (artificial intelligence OR deep learning OR machine learning OR neural networks OR supervised learning OR unsupervised learning OR semi-supervised learning OR reinforcement learning OR self-supervised learning OR few-shot learning OR zero-shot learning OR contrastive learning OR multi-modal learning OR transfer learning OR generative models OR autoencoders OR diffusion models OR attention mechanisms OR representation learning OR adversarial learning OR optimization algorithms OR neurosymbolic AI OR causality in AI OR graph-based learning OR computational neuroscience OR meta-learning OR scalable AI OR knowledge distillation OR neural compression OR transformers OR large language models OR prompt engineering OR in-context learning OR LLMs OR GPT OR BERT OR T5 OR vision transformers OR graph neural networks OR diffusion models OR GANs OR VAE OR AI fairness OR explainable AI OR federated learning OR privacy-preserving AI OR quantum machine learning OR neuromorphic computing OR AI ethics OR human-AI collaboration OR robotics and AI OR autonomous vehicles OR AI for healthcare OR AI for finance OR AI for scientific discovery OR AI for cybersecurity OR adversarial robustness OR AI for business forecasting OR AI for industrial automation OR AI for climate change OR AI for social good OR trustworthy AI OR causal inference OR explainable reinforcement learning) AND submittedDate:[20150101 TO 20251231]"

# Initialize result storage
papers = []
batch_size = 100  # Maximum allowed per request
file_path = "all_ai_research_papers.csv"

# Function to save results progressively
def save_to_csv(data, file_path, first_save=False):
    df = pd.DataFrame(data)
    mode = 'w' if first_save else 'a'  # 'w' creates a new file, 'a' appends data
    header = first_save  # Only include headers if creating a new file
    df.to_csv(file_path, mode=mode, header=header, index=False)

# Fetch papers continuously until no more are available
first_save = True  # Flag to track first save operation

while True:
    try:
        search = arxiv.Search(
            query=query,
            max_results=batch_size,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        batch_papers = []
        for result in search.results():
            batch_papers.append({
                "title": result.title,
                "abstract": result.summary,
                "published_date": result.published.date(),
                "categories": ", ".join(result.categories),
                "arxiv_url": result.entry_id
            })

        # If no new papers are found, stop fetching
        if not batch_papers:
            print(f"✅ No more results found. Fetching complete!")
            break

        # Save batch to CSV
        save_to_csv(batch_papers, file_path, first_save)
        first_save = False  # After first save, switch to append mode

        # Append batch to in-memory list
        papers.extend(batch_papers)
        print(f"✅ Retrieved {len(batch_papers)} papers (Total: {len(papers)})")

        # Prevent hitting API rate limits
        time.sleep(2)

    except Exception as e:
        print(f"❌ Error occurred: {e}")
        print("🔄 Retrying after a short delay...")
        time.sleep(5)

print(f"\n✅ Completed! Collected {len(papers)} AI research papers and saved them to '{file_path}'.")

  for result in search.results():


✅ Retrieved 100 papers (Total: 100)
✅ Retrieved 100 papers (Total: 200)
✅ Retrieved 100 papers (Total: 300)
✅ Retrieved 100 papers (Total: 400)
✅ Retrieved 100 papers (Total: 500)
✅ Retrieved 100 papers (Total: 600)
✅ Retrieved 100 papers (Total: 700)
✅ Retrieved 100 papers (Total: 800)
✅ Retrieved 100 papers (Total: 900)
✅ Retrieved 100 papers (Total: 1000)
✅ Retrieved 100 papers (Total: 1100)
✅ Retrieved 100 papers (Total: 1200)
✅ Retrieved 100 papers (Total: 1300)
✅ Retrieved 100 papers (Total: 1400)
✅ Retrieved 100 papers (Total: 1500)
✅ Retrieved 100 papers (Total: 1600)
✅ Retrieved 100 papers (Total: 1700)
✅ Retrieved 100 papers (Total: 1800)
✅ Retrieved 100 papers (Total: 1900)
✅ Retrieved 100 papers (Total: 2000)
✅ Retrieved 100 papers (Total: 2100)
✅ Retrieved 100 papers (Total: 2200)
✅ Retrieved 100 papers (Total: 2300)
✅ Retrieved 100 papers (Total: 2400)
✅ Retrieved 100 papers (Total: 2500)
✅ Retrieved 100 papers (Total: 2600)
✅ Retrieved 100 papers (Total: 2700)
✅ Retrieve

KeyboardInterrupt: 

In [8]:
import json
import os
import pandas as pd

# Define AI-related keywords
keywords = [
    "artificial intelligence", "deep learning", "machine learning", "neural networks",
    "supervised learning", "unsupervised learning", "semi-supervised learning",
    "reinforcement learning", "self-supervised learning", "few-shot learning",
    "zero-shot learning", "contrastive learning", "multi-modal learning",
    "transfer learning", "generative models", "autoencoders", "diffusion models",
    "attention mechanisms", "representation learning", "adversarial learning",
    "optimization algorithms", "neurosymbolic AI", "causality in AI",
    "graph-based learning", "computational neuroscience", "meta-learning",
    "scalable AI", "knowledge distillation", "neural compression", "transformers",
    "large language models", "prompt engineering", "in-context learning",
    "LLMs", "GPT", "BERT", "T5", "vision transformers", "graph neural networks",
    "GANs", "VAE", "AI fairness", "explainable AI", "federated learning",
    "privacy-preserving AI", "quantum machine learning", "neuromorphic computing",
    "AI ethics", "human-AI collaboration", "robotics and AI", "autonomous vehicles",
    "AI for healthcare", "AI for finance", "AI for scientific discovery",
    "AI for cybersecurity", "adversarial robustness", "AI for business forecasting",
    "AI for industrial automation", "AI for climate change", "AI for social good",
    "trustworthy AI", "causal inference", "explainable reinforcement learning"
]

# Initialize list to store filtered results
filtered_papers = []

main_directory = os.path.join('/', *os.getcwd().split('/')[:-1])
data_folder = os.path.join(main_directory, 'data')

# Open the JSON file and process line by line
with open(f"{data_folder}/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as file:
    for line in file:
        # Parse JSON line
        paper = json.loads(line)

        # Extract necessary fields
        categories = paper.get("categories", "")
        abstract = paper.get("abstract", "").lower()
        title = paper.get("title", "")
        authors = paper.get("authors", "")
        doi = paper.get("doi", "")
        versions = paper.get("versions", [])

        # Extract first submission date
        submitted_date = None
        if isinstance(versions, list) and len(versions) > 0:
            submitted_date = versions[0].get("created", None)

        # Convert to datetime format and extract year
        if submitted_date:
            submitted_year = pd.to_datetime(submitted_date, errors="coerce").year
        else:
            continue  # Skip papers with no date

        # Apply filters: Category, Abstract, and Date
        if "cs.AI" in categories and any(keyword in abstract for keyword in keywords) and 2015 <= submitted_year <= 2025:
            filtered_papers.append({
                "Title": title,
                "Abstract": abstract,
                "Year": submitted_year,
                "Authors": authors,
                "DOI": doi
            })

        # Optional: Stop after processing a certain number of papers (for testing)
        # if len(filtered_papers) >= 50000: break

# Convert to DataFrame
filtered_df = pd.DataFrame(filtered_papers)

# Save to CSV
filtered_df.to_csv("filtered_arxiv_AI_papers.csv", index=False)

# Print summary
print(f"Total AI-related papers saved: {len(filtered_df)}")
print("Filtered data saved as 'filtered_arxiv_AI_papers.csv'")

Total AI-related papers saved: 58584
Filtered data saved as 'filtered_arxiv_AI_papers.csv'


In [9]:
filtered_df.head()  # Display the first few rows of the filtered dataset

Unnamed: 0,Title,Abstract,Year,Authors,DOI
0,The Information-theoretic and Algorithmic Appr...,we survey concepts at the frontier of resear...,2015,"Nicolas Gauvrit, Hector Zenil, Jesper Tegn\'er",
1,Statistical-mechanical analysis of pre-trainin...,"in this paper, we present a statistical-mech...",2015,Masayuki Ohzeki,10.7566/JPSJ.84.034003
2,Injury risk prediction for traffic accidents i...,this study describes the experimental applic...,2015,Christian S. Perone,
3,Delving Deep into Rectifiers: Surpassing Human...,rectified activation units (rectifiers) are ...,2015,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",
4,A Quantum Production Model,the production system is a theoretical model...,2015,Lu\'is Tarrataca and Andreas Wichert,10.1007/s11128-011-0241-2


In [2]:
import json
import os
import pandas as pd

# Define AI-related keywords
keywords = [
    "artificial intelligence", "deep learning", "machine learning", "neural networks",
    "supervised learning", "unsupervised learning", "semi-supervised learning",
    "reinforcement learning", "self-supervised learning", "few-shot learning",
    "zero-shot learning", "contrastive learning", "multi-modal learning",
    "transfer learning", "generative models", "autoencoders", "diffusion models",
    "attention mechanisms", "representation learning", "adversarial learning",
    "optimization algorithms", "neurosymbolic AI", "causality in AI",
    "graph-based learning", "computational neuroscience", "meta-learning",
    "scalable AI", "knowledge distillation", "neural compression", "transformers",
    "large language models", "prompt engineering", "in-context learning",
    "LLMs", "GPT", "BERT", "T5", "vision transformers", "graph neural networks",
    "GANs", "VAE", "AI fairness", "explainable AI", "federated learning",
    "privacy-preserving AI", "quantum machine learning", "neuromorphic computing",
    "AI ethics", "human-AI collaboration", "robotics and AI", "autonomous vehicles",
    "AI for healthcare", "AI for finance", "AI for scientific discovery",
    "AI for cybersecurity", "adversarial robustness", "AI for business forecasting",
    "AI for industrial automation", "AI for climate change", "AI for social good",
    "trustworthy AI", "causal inference", "explainable reinforcement learning"
]

# Initialize list to store filtered results
filtered_papers = []

main_directory = os.path.join('/', *os.getcwd().split('/')[:-1])
data_folder = os.path.join(main_directory, 'data')

# Open the JSON file and process line by line
with open(f"{data_folder}/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as file:
    for line in file:
        # Parse JSON line
        paper = json.loads(line)

        # Extract necessary fields
        categories = paper.get("categories", "")
        abstract = paper.get("abstract", "").lower()
        title = paper.get("title", "")
        authors = paper.get("authors", "")
        doi = paper.get("doi", "")
        versions = paper.get("versions", [])

        # Extract first submission date
        submitted_date = None
        if isinstance(versions, list) and len(versions) > 0:
            submitted_date = versions[0].get("created", None)

        # Convert to datetime format and extract year
        if submitted_date:
            submitted_year = pd.to_datetime(submitted_date, errors="coerce").year
        else:
            continue  # Skip papers with no date

        # Apply filters: Category, Abstract, and Date
        if "cs.AI" in categories and 2015 <= submitted_year <= 2025:
            filtered_papers.append({
                "Title": title,
                "Abstract": abstract,
                "Year": submitted_year,
                "Authors": authors,
                "DOI": doi
            })

        # Optional: Stop after processing a certain number of papers (for testing)
        # if len(filtered_papers) >= 50000: break

# Convert to DataFrame
filtered_df = pd.DataFrame(filtered_papers)

# Save to CSV
filtered_df.to_csv("all_arxiv_AI_papers.csv", index=False)

# Print summary
print(f"Total AI-related papers saved: {len(filtered_df)}")
print("Filtered data saved as 'filtered_arxiv_AI_papers.csv'")

Total AI-related papers saved: 110272
Filtered data saved as 'filtered_arxiv_AI_papers.csv'


In [3]:
filtered_df.head()  # Display the first few rows of the filtered dataset

Unnamed: 0,Title,Abstract,Year,Authors,DOI
0,Ultimate Intelligence Part I: Physical Complet...,we propose that solomonoff induction is comp...,2015,"Eray \""Ozkural",
1,Hostile Intent Identification by Movement Patt...,"in the recent years, the problem of identify...",2015,"Souham Biswas, Manisha J. Nene",10.13140/2.1.4429.7281
2,A Novel Design of a Parallel Machine Learnt Ge...,the generational garbage collection involves...,2015,Vasanthakumar Soundararajan,
3,Constraint-based sequence mining using constra...,the goal of constraint-based sequence mining...,2015,Benjamin Negrevergne and Tias Guns,
4,On the Relationship between Sum-Product Networ...,"in this paper, we establish some theoretical...",2015,"Han Zhao, Mazen Melibari and Pascal Poupart",
