<a href="https://colab.research.google.com/github/MatthewK84/LinkedIn-Learning-Journey/blob/main/ARXIV_Time_Series_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install arxiv --user

In [1]:
import pandas as pd
import arxiv

In [2]:
# Optional: silence warnings (e.g. deprecation notices) so they do not clutter the analysis output.
# NOTE: no category is given, so this 'ignore' filter hides every warning, not only deprecations.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame.

    Parameters
    ----------
    query : str
        Search string handed to ``arxiv.Search``.
    max_results : int, default 10
        Maximum number of results requested from ``arxiv.Search``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully read result with the columns
        ``date_published``, ``title``, ``authors`` (list of author names),
        ``summary``, ``url`` (the result's ``pdf_url``) and ``category``
        (the result's ``primary_category``). Rows are sorted by
        ``date_published``, newest first, with a fresh 0..n-1 index.
        When the search yields nothing, an empty frame with the same
        columns is returned instead of raising.

    Notes
    -----
    A result whose fields cannot be read is reported on stdout and skipped,
    so the frame never contains half-filled rows. Exceptions raised while
    iterating ``search.results()`` itself are not caught here and propagate
    to the caller.
    """
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']
    records = []

    search = arxiv.Search(query=query, max_results=max_results)

    for result in search.results():
        try:
            # Build the whole record first so a failure on any field drops
            # the record entirely rather than leaving a partial row behind.
            record = {
                'title': result.title,
                'date_published': result.published,
                'authors': [a.name for a in result.authors],
                'summary': result.summary,
                'url': result.pdf_url,
                'category': result.primary_category,
                # there are more fields that can be added; add as many as you need
            }
        except Exception as exc:
            # Best effort: keep going, but say what went wrong. Catching
            # Exception (not a bare except) lets Ctrl-C still stop a long run.
            print(f'weird arxiv error: {exc!r}')
            continue

        records.append(record)

    # Passing columns= fixes the column order and keeps the schema for empty results.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    df = df.sort_values('date_published', ascending=False).reset_index(drop=True)
    return df

In [4]:
# Author's timing notes:
# - at 1000 articles, it takes about 30 seconds
# - at 10000 articles, it takes a while; put the laptop down and walk away
# - I haven't tried beyond 10000 yet

# Known issue: at the moment this is finicky beyond 10000 results and needs debugging.
# It could also be a problem with the arxiv library, as the failure seems to be inside the search.

# So far I haven't noticed obvious throttling, but throttling may be causing the issue.

query = 'Time Series'

max_results = 5100

df = search_arxiv(query, max_results)
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-02 17:58:09+00:00,Time Series Anomaly Detection using Diffusion-...,"[Ioana Pintilie, Andrei Manolache, Florin Brad]",Diffusion models have been recently used for a...,http://arxiv.org/pdf/2311.01452v1,cs.LG
1,2023-11-02 17:55:41+00:00,Deep Double Descent for Time Series Forecastin...,"[Valentino Assandri, Sam Heshmati, Burhaneddin...","Deep learning models, particularly Transformer...",http://arxiv.org/pdf/2311.01442v1,cs.LG
2,2023-11-02 17:26:49+00:00,Castor: Causal Temporal Regime Structure Learning,"[Abdellah Rahmani, Pascal Frossard]",The task of uncovering causal relationships am...,http://arxiv.org/pdf/2311.01412v1,cs.LG
3,2023-11-02 09:41:49+00:00,Dynamically Maintaining the Persistent Homolog...,"[Sebastiano Cultrera di Montesano, Herbert Ede...",We present a dynamic data structure for mainta...,http://arxiv.org/pdf/2311.01115v1,cs.DS
4,2023-11-01 17:45:52+00:00,What User Behaviors Make the Differences Durin...,"[Shahin Doroudian, Zekun Wu, Aidong Lu]",The understanding of visual analytics process ...,http://arxiv.org/pdf/2311.00690v1,cs.HC


In [5]:
# Row and column counts of the fetched results.
df.shape

(5100, 6)

In [6]:
# Persist the fetched results as CSV. The "arxix" spelling in the file name is reused
# verbatim by the cells that read this file back, so rename it everywhere or not at all.
outfile = '/content/sample_data/arxix_time_series_data.csv'

# index=False: do not write the DataFrame index as a column.
df.to_csv(outfile, index=False)

In [7]:
# Reload the saved CSV. As the preview shows, `authors` comes back as the text form of
# each list (e.g. "['A', 'B']") rather than as a Python list.
Time_Series = pd.read_csv('/content/sample_data/arxix_time_series_data.csv')
Time_Series.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-02 17:58:09+00:00,Time Series Anomaly Detection using Diffusion-...,"['Ioana Pintilie', 'Andrei Manolache', 'Florin...",Diffusion models have been recently used for a...,http://arxiv.org/pdf/2311.01452v1,cs.LG
1,2023-11-02 17:55:41+00:00,Deep Double Descent for Time Series Forecastin...,"['Valentino Assandri', 'Sam Heshmati', 'Burhan...","Deep learning models, particularly Transformer...",http://arxiv.org/pdf/2311.01442v1,cs.LG
2,2023-11-02 17:26:49+00:00,Castor: Causal Temporal Regime Structure Learning,"['Abdellah Rahmani', 'Pascal Frossard']",The task of uncovering causal relationships am...,http://arxiv.org/pdf/2311.01412v1,cs.LG
3,2023-11-02 09:41:49+00:00,Dynamically Maintaining the Persistent Homolog...,"['Sebastiano Cultrera di Montesano', 'Herbert ...",We present a dynamic data structure for mainta...,http://arxiv.org/pdf/2311.01115v1,cs.DS
4,2023-11-01 17:45:52+00:00,What User Behaviors Make the Differences Durin...,"['Shahin Doroudian', 'Zekun Wu', 'Aidong Lu']",The understanding of visual analytics process ...,http://arxiv.org/pdf/2311.00690v1,cs.HC


In [8]:
# Number of papers per primary arXiv category.
Time_Series['category'].value_counts()

cs.LG              1444
stat.ME             483
math.ST             264
stat.ML             248
physics.data-an     178
                   ... 
physics.app-ph        1
q-fin.EC              1
q-bio.SC              1
math.SG               1
math.LO               1
Name: category, Length: 140, dtype: int64

In [9]:
import csv

# Source CSV written earlier in the notebook, and the destination for the filtered copy.
input_file_path = '/content/sample_data/arxix_time_series_data.csv'
output_file_path = '/content/sample_data/time_series_filtered.csv'


# Categories to keep: the four most frequent ones in the category counts.
included_categories = ['cs.LG', 'stat.ME', 'math.ST', 'stat.ML']

# Stream rows from the input to the output, keeping only the wanted categories.
# The handles are named src_file/dst_file so the notebook-level `outfile` path
# string is not rebound to a (closed) file object by this cell.
# newline='' on both files lets the csv module handle line endings and
# embedded newlines inside quoted fields itself.
with open(input_file_path, mode='r', newline='', encoding='utf-8') as src_file, \
        open(output_file_path, mode='w', newline='', encoding='utf-8') as dst_file:
    reader = csv.DictReader(src_file)
    fieldnames = reader.fieldnames  # Reuse the input header for the output file

    writer = csv.DictWriter(dst_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        row for row in reader if row['category'] in included_categories
    )

In [11]:
# Load the category-filtered CSV and preview the first rows.
Time_Series_Final = pd.read_csv('/content/sample_data/time_series_filtered.csv')
Time_Series_Final.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-02 17:58:09+00:00,Time Series Anomaly Detection using Diffusion-...,"['Ioana Pintilie', 'Andrei Manolache', 'Florin...",Diffusion models have been recently used for a...,http://arxiv.org/pdf/2311.01452v1,cs.LG
1,2023-11-02 17:55:41+00:00,Deep Double Descent for Time Series Forecastin...,"['Valentino Assandri', 'Sam Heshmati', 'Burhan...","Deep learning models, particularly Transformer...",http://arxiv.org/pdf/2311.01442v1,cs.LG
2,2023-11-02 17:26:49+00:00,Castor: Causal Temporal Regime Structure Learning,"['Abdellah Rahmani', 'Pascal Frossard']",The task of uncovering causal relationships am...,http://arxiv.org/pdf/2311.01412v1,cs.LG
3,2023-11-01 13:44:45+00:00,Retrieval-Based Reconstruction For Time-series...,"['Maxwell A. Xu', 'Alexander Moreno', 'Hui Wei...",The success of self-supervised contrastive lea...,http://arxiv.org/pdf/2311.00519v1,cs.LG
4,2023-11-01 01:23:59+00:00,WinNet:time series forecasting with a window-e...,"['Wenjie Ou', 'Dongyue Guo', 'Zheng Zhang', 'Z...","Recently, Transformer-based methods have signi...",http://arxiv.org/pdf/2311.00214v1,cs.LG


In [12]:
# Row and column counts after filtering.
Time_Series_Final.shape

(2439, 6)