## Trying it out with the open acces API arXiv:

In [11]:
from bs4 import BeautifulSoup
import requests
import csv

# Define the API endpoint and query parameters
api_url = 'http://export.arxiv.org/api/query'
query_params = {
    'search_query': 'cat:cs.AI',
    'start': 0,
    'max_results': 2000,
}

# Initialize a list to store publication data
publications = []

# Make API requests in batches
while query_params['start'] < query_params['max_results']:
    # Make the API request
    response = requests.get(api_url, params=query_params)

    # Check the response status
    if response.status_code == 200:
        # Parse the HTML response using BeautifulSoup with 'html.parser'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract publication information
        for entry in soup.find_all('entry'):
            title = entry.find('title').text.strip()
            abstract = entry.find('summary').text.strip()

            # Check if 'arxiv:comment' element exists before accessing 'text'
            arxiv_comment = entry.find('arxiv:comment')
            if arxiv_comment:
                citations = arxiv_comment.text.strip()
            else:
                citations = 'N/A'

            # Append publication data to the list
            publications.append([title, abstract, citations])

        # Update the 'start' parameter for the next batch
        query_params['start'] += len(soup.find_all('entry'))
    else:
        print(f"API request failed with status code {response.status_code}")
        break

# Specify the CSV file name
csv_filename = 'publications.csv'

# Write the data to the CSV file
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Title', 'Abstract', 'Citations'])
    # Write the publication data
    csv_writer.writerows(publications)

print(f"Data saved to '{csv_filename}'")


Data saved to 'publications.csv'


This seems to work

In [12]:
import pandas as pd

In [13]:
df = pd.read_csv("publications.csv")

In [14]:
df

Unnamed: 0,Title,Abstract,Citations
0,Dynamic Backtracking,Because of their occasional need to return to ...,See http://www.jair.org/ for an online appendi...
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,See http://www.jair.org/ for any accompanying ...
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,See http://www.jair.org/ for any accompanying ...
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,See http://www.jair.org/ for any accompanying ...
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,See http://www.jair.org/ for an online appendi...
...,...,...,...
2195,Machine Generalization and Human Categorizatio...,In designing an intelligent system that must b...,Appears in Proceedings of the First Conference...
2196,Exact Reasoning Under Uncertainty,This paper focuses on designing expert systems...,Appears in Proceedings of the First Conference...
2197,The Estimation of Subjective Probabilities via...,Theoretically as well as experimentally it is ...,Appears in Proceedings of the First Conference...
2198,A Cure for Pathological Behavior in Games that...,The traditional approach to choosing moves in ...,Appears in Proceedings of the First Conference...
