<a href="https://colab.research.google.com/github/MatthewK84/LinkedIn-Learning-Journey/blob/main/ARXIV_Conformal_Prediction_Revised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install arxiv --user

In [1]:
import pandas as pd
import arxiv

In [2]:
# Optional: keep warnings (e.g. library deprecation notices) out of the cell output.
# Be aware that this hides every warning category, not only deprecations.
import warnings
warnings.simplefilter('ignore')

In [3]:
def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame, newest first.

    Parameters
    ----------
    query : str
        Search string passed to ``arxiv.Search``.
    max_results : int, default 10
        Maximum number of results requested, passed to ``arxiv.Search``.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``date_published``, ``title``,
        ``authors`` (list of author names), ``summary``, ``url`` (PDF link)
        and ``category`` (primary category). Rows are sorted by
        ``date_published`` in descending order and the index is reset to
        0..n-1. When the search yields nothing, an empty frame with the same
        columns is returned instead of raising.

    Notes
    -----
    A result whose fields cannot be read is reported with a printed message
    and skipped entirely, so the output never contains half-filled rows.
    Errors raised by the search iterator itself are not caught here.
    """
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']
    records = []

    search = arxiv.Search(query=query, max_results=max_results)

    for result in search.results():

        try:
            # Build the whole record first; it is only kept if every field was readable.
            record = {
                'title': result.title,
                'date_published': result.published,
                'authors': [a.name for a in result.authors],
                'summary': result.summary,
                'url': result.pdf_url,
                'category': result.primary_category,
                # there are more fields that can be added; add as many as you need
            }
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
            # stops a long-running fetch.
            print('weird arxiv error')
            continue

        records.append(record)

    # Passing `columns` fixes the column order and keeps the frame well-formed
    # even when `records` is empty.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    df = df.sort_values('date_published', ascending=False).reset_index(drop=True)
    return df

In [4]:
# Rough timings observed by the author:
#   - 1000 articles: about 30 seconds
#   - 10000 articles: takes a while; put the laptop down and walk away
#   - more than 10000: not tried yet

# Known issue: requests beyond 10000 results are currently finicky and still need debugging.
# The failure seems to happen inside the arxiv library's search, so it may be a library problem.

# No obvious throttling has been noticed so far, but throttling may still be what causes the issue.

query = 'Conformal Prediction'

max_results = 5100

df = search_arxiv(query, max_results)
# Preview the first rows; search_arxiv returns them sorted newest first.
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-03 09:01:37+00:00,Fast ellipsoidal conformal and quasi-conformal...,[Gary P. T. Choi],Surface parameterization plays a fundamental r...,http://arxiv.org/pdf/2311.01788v1,cs.CG
1,2023-11-02 17:59:30+00:00,Conformal Policy Learning for Sensorimotor Con...,"[Huang Huang, Satvik Sharma, Antonio Loquercio...",This paper focuses on the problem of detecting...,http://arxiv.org/pdf/2311.01457v1,cs.RO
2,2023-11-02 17:58:37+00:00,Investigating the Behavior of Diffusion Models...,"[Daniel Rothchild, Andrew S. Rosen, Eric Taw, ...",We present an investigation into diffusion mod...,http://arxiv.org/pdf/2311.01491v1,physics.chem-ph
3,2023-11-02 01:56:03+00:00,From O(3) to Cubic CFT: Conformal Perturbation...,"[Junchen Rong, Ning Su]",The Cubic CFT can be understood as the O(3) in...,http://arxiv.org/pdf/2311.00933v1,hep-th
4,2023-11-01 18:37:07+00:00,Conformalized Deep Splines for Optimal and Eff...,"[Nathaniel Diamant, Ehsan Hajiramezanali, Tomm...",Uncertainty estimation is critical in high-sta...,http://arxiv.org/pdf/2311.00774v1,cs.LG


In [5]:
# (rows, columns) of the full search result
df.shape

(5100, 6)

In [6]:
# Destination for the full, unfiltered search result.
outfile = '/content/sample_data/conformal_prediction_data.csv'

# index=False leaves the DataFrame's row index out of the file.
# NOTE: the `authors` lists are written in their string form (e.g. "['A', 'B']"),
# so they are plain strings, not lists, when the CSV is read back.
df.to_csv(outfile, index=False)

In [37]:
import csv

# Input: the CSV exported from the full search result.
# Output: a copy that contains only the rows whose category is in the list below.
input_file_path = '/content/sample_data/conformal_prediction_data.csv'
output_file_path = '/content/sample_data/conformal_prediction_data_filtered_final.csv'


# arXiv primary categories to keep (computer science, statistics, economics
# and quantitative finance); rows with any other category are dropped.
included_categories = [
    'cs.CG', 'cs.RO', 'cs.LG', 'stat.ME', 'cs.NE', 'stat.ML',
    'cs.IT', 'cs.AI', 'cs.SD', 'econ.GN', 'cs.CV', 'cs.CL', 'cs.GR', 'cs.IR', 'cs.NI','cs.SE', 'cs.DC', 'cs.SI', 'cs.OH', 'cs.CE',
    'cs.HC', 'stat.AP', 'cs.CY', 'q-fin.ST','cs.ET', 'q-fin.GN', 'stat.OT', 'cs.LO', 'cs.CR', 'cs.PF','q-fin.RM', 'cs.DB', 'econ.EM', 'cs.SY', 'cs.MS', 'stat.CO'
]

# Hash-based lookup for the per-row membership test.
wanted_categories = frozenset(included_categories)

# The handles are named src/dst so that the `outfile` path string defined by the
# export cell is not rebound to a (closed) file object by this cell.
with open(input_file_path, mode='r', newline='', encoding='utf-8') as src, \
        open(output_file_path, mode='w', newline='', encoding='utf-8') as dst:
    reader = csv.DictReader(src)
    fieldnames = reader.fieldnames  # Reuse the input header for the output file

    writer = csv.DictWriter(dst, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # Copy the row through only when its category is one of the wanted ones
        if row['category'] in wanted_categories:
            writer.writerow(row)

In [38]:
# Reload the filtered CSV to inspect the result of the category filter.
# NOTE(review): after the CSV round trip `authors` holds the string form of each
# list (see the quoted names in the preview), and `date_published` is presumably
# a plain string because no date parsing is requested — confirm before relying on either.
Conformal_Final = pd.read_csv('/content/sample_data/conformal_prediction_data_filtered_final.csv')
Conformal_Final.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-03 09:01:37+00:00,Fast ellipsoidal conformal and quasi-conformal...,['Gary P. T. Choi'],Surface parameterization plays a fundamental r...,http://arxiv.org/pdf/2311.01788v1,cs.CG
1,2023-11-02 17:59:30+00:00,Conformal Policy Learning for Sensorimotor Con...,"['Huang Huang', 'Satvik Sharma', 'Antonio Loqu...",This paper focuses on the problem of detecting...,http://arxiv.org/pdf/2311.01457v1,cs.RO
2,2023-11-01 18:37:07+00:00,Conformalized Deep Splines for Optimal and Eff...,"['Nathaniel Diamant', 'Ehsan Hajiramezanali', ...",Uncertainty estimation is critical in high-sta...,http://arxiv.org/pdf/2311.00774v1,cs.LG
3,2023-10-30 18:28:50+00:00,GPCR-BERT: Interpreting Sequential Design of G...,"['Seongwon Kim', 'Parisa Mollaei', 'Akshay Ant...",With the rise of Transformers and Large Langua...,http://arxiv.org/pdf/2310.19915v1,cs.LG
4,2023-10-30 07:41:42+00:00,D4Explainer: In-Distribution GNN Explanations ...,"['Jialin Chen', 'Shirley Wu', 'Abhijit Gupta',...",The widespread deployment of Graph Neural Netw...,http://arxiv.org/pdf/2310.19321v1,cs.LG


In [39]:
# (rows, columns) remaining after the category filter
Conformal_Final.shape

(626, 6)