<a href="https://colab.research.google.com/github/MNoichl/tttms_public/blob/main/procuring_arxiv_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary packages


In [None]:
!pip install xmltodict
!pip install flatten_dict
!pip install boto3
!pip install kaggle

# Load packages and link Google Drive

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

import re

import tqdm

import tarfile
import glob
import os

from shutil import copyfile, copy, copy2
import shutil
import xmltodict

import gzip

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Download arXiv metadata from Kaggle

In [None]:

# censored for review before publication
# Write the Kaggle API token to ~/.kaggle/kaggle.json.
import json
import os

api_token = {"username":"USERNAME","key":"PRIVAT_KEY"}

# Resolve the home directory instead of assuming /root, and tolerate the
# directory already existing so this cell can be re-run without errors.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_token_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_token_path, 'w') as file:
    json.dump(api_token, file)

# Owner-only read/write, the same permissions as `chmod 600`.
os.chmod(kaggle_token_path, 0o600)

In [None]:
# Fetch the Cornell-University/arxiv dataset with the kaggle command-line tool
# (the next cell expects the result as arxiv.zip in the working directory).
!kaggle datasets download Cornell-University/arxiv

In [None]:
import zipfile

# Unpack the downloaded archive into the arxiv_metadata folder.
with zipfile.ZipFile('arxiv.zip', 'r') as metadata_archive:
    metadata_archive.extractall('arxiv_metadata')

In [None]:
# Parse the metadata snapshot, which holds one JSON record per line.
# The file is opened in a `with` block so the handle is closed once the
# whole snapshot has been read (or if parsing fails part-way through).
json_lines = []
with open(r'arxiv_metadata/arxiv-metadata-oai-snapshot.json', 'r', encoding='utf-8') as snapshot_file:
    for line in tqdm.tqdm_notebook(snapshot_file):
        json_lines.append(json.loads(line))

In [None]:
selected_years = ['2019','2020','2021']

# Raw string so that \d reaches the regex engine unchanged instead of being
# treated as an (invalid) string escape; compiled once for the whole scan.
_year_pattern = re.compile(r'\d{4}')


def _first_version_year(article):
    """Return the first four-digit run in the 'created' field of the record's
    first version, or None when that field contains no such run."""
    match = _year_pattern.search(article['versions'][0]['created'])
    return match.group(0) if match else None


# Keep only the records whose first version was created in one of the selected years.
articles_within_years = [x for x in tqdm.tqdm_notebook(json_lines) if _first_version_year(x) in selected_years]

In [None]:
del json_lines # to conserve RAM

In [None]:
# Preview the first ten selected records as a table.
pd.DataFrame(articles_within_years[0:10])

# Download arXiv full texts from Amazon S3 (AWS)

In [None]:
!rm -rf config.ini


# censored for review before publication
with open('config.ini', 'w') as f:
    f.write("""[DEFAULT]
ACCESS_KEY = ACCESS_KEY
SECRET_KEY =SECRET_KEY""")

The following code is adapted/taken from a tutorial by Brienna Herold, https://towardsdatascience.com/how-to-bulk-access-arxiv-full-text-preprints-58026e19e8ef

In [None]:
import boto3, configparser

s3resource = None

def setup():
    """Creates S3 resource & sets configs to enable download."""

    print('Connecting to Amazon S3...')

    # Securely import configs from private config file
    configs = configparser.SafeConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region arxiv bucket is in
    )

if __name__ == '__main__':
    # Executed only when this code runs as the main module:
    # create the S3 resource & set configs.
    setup()

In [None]:
import boto3, configparser, os, botocore

def download_file(key):
    """
    Downloads given filename from source bucket to destination directory.

    The file is requested from the ``arxiv`` bucket (requester pays) and is
    saved locally under the same relative path as its key. The local parent
    directory of that path is created if it does not exist yet.

    Parameters
    ----------
    key : str
        Name of file to download

    Raises
    ------
    botocore.exceptions.ClientError
        For any client error other than a 404. A 404 is reported with a
        printed message and the function returns normally.
    """

    # Ensure the directory the file will be written to exists (this is
    # 'src' for keys such as 'src/arXiv_src_manifest.xml')
    target_dir = os.path.dirname(key)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Download file
    print('\nDownloading s3://arxiv/{} to {}...'.format(key, key))

    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv', 
            Key=key,  # name of file to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer':'requester'})
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
        else:
            # Anything else (e.g. rejected credentials) must not pass
            # silently, or later cells fail on the missing file instead.
            raise

if __name__ == '__main__':
    # Executed only when this code runs as the main module:
    # download the manifest file, keeping its src/ relative path.
    manifest_key = 'src/arXiv_src_manifest.xml'
    download_file(manifest_key)

In [None]:
# Read the downloaded manifest as text, then parse the XML with xmltodict.
with open('src/arXiv_src_manifest.xml', 'r') as manifest_file:
    data = manifest_file.read()

xml_content = xmltodict.parse(data)


In [None]:
# Tabulate the manifest's file entries; the download loop further down
# uses the 'yymm' and 'filename' columns of this frame.
manifest_frame = pd.DataFrame(xml_content['arXivSRC']['file'])

In [None]:
def rm_make(dir):
  """Recreate ``dir`` as an empty directory.

  Any existing directory tree at ``dir`` is deleted first; failures while
  deleting (for instance because ``dir`` does not exist yet) are ignored.
  The directory is then created, together with any missing parents.

  Parameters
  ----------
  dir : str
      Path of the directory to reset.
  """
  # ignore_errors keeps the best-effort removal of the original code
  # without a bare `except` that would also hide unrelated problems.
  shutil.rmtree(dir, ignore_errors=True)
  # Pure-Python replacement for the `mkdir` shell call: works for paths
  # containing spaces and for nested paths.
  os.makedirs(dir, exist_ok=True)


def return_dir_paths(path):
  """Return the paths of all files found anywhere below ``path``.

  Each entry is the containing directory, as reported by ``os.walk``,
  joined with the file name. Directories themselves are not listed.
  """
  return [
    os.path.join(current_dir, file_name)
    for current_dir, _subdirs, file_names in os.walk(path)
    for file_name in file_names
  ]

In [None]:
# Folder that receives the collected .tex files, one sub-folder per archive.
# NOTE(review): the folder name says 2019_2022 while selected_years lists
# 2019-2021 - confirm which range is intended.
output_directory = 'drive/MyDrive/arxiv_2019_2022_tex_collection_fourth_run'

In [None]:
# Build one code per month of every selected year:
# the last two digits of the year followed by the zero-padded month.
all_query_codes = []
for year in tqdm.tqdm_notebook(selected_years):
  year_suffix = year[-2:]
  for month in tqdm.tqdm_notebook(range(1,13)):
    all_query_codes.append('{}{:02d}'.format(year_suffix, month))

In [None]:
# For every manifest entry with a matching yymm code: download the bundle,
# unpack it, and copy the .tex files of each contained archive into
# output_directory/<archive name without .gz>/.
for query_code in ['2012']: # all_query_codes[3:]:
  print('Downloading code: ', query_code)
  this_years_queries = manifest_frame[manifest_frame["yymm"] == query_code]
  for ix, row in tqdm.tqdm_notebook(this_years_queries.iterrows()):
    key = row["filename"]
    print('    Downloading file: ', key)
    try:
      # start from an empty src folder so only the current bundle is kept on disk
      rm_make("src")
      s3resource.meta.client.download_file(
        Bucket="arxiv",
        Key=key,  # name of key to download from
        Filename=key,  # path to file to download to
        ExtraArgs={"RequestPayer": "requester"},
      )
    except botocore.exceptions.ClientError as e:
      if e.response["Error"]["Code"] == "404":
        print("ERROR: " + key + " does not exist in arxiv bucket")
      else:
        # Other client errors used to be dropped without any message.
        print("ERROR: could not download " + key + ": " + str(e))
      # nothing was downloaded for this entry, move on to the next one
      continue

    # extract the whole package to temporary directory
    rm_make("temporary_files")
    rm_make("temporary_files_extracted")
    # NOTE(review): extractall() writes whatever member paths the archive
    # declares; consider an extraction filter if the archives are not trusted.
    # "r:" opens the bundle as an uncompressed tar; `with` closes the handle
    # even when extraction fails.
    with tarfile.open(key, "r:") as tar:
      tar.extractall(path="temporary_files")

    # loop over all extracted files, copying the .tex files to our output directory, into their own folder named after their arxiv id
    all_files = return_dir_paths("temporary_files")
    print('    Extracting file: ', key)
    skipped_files = 0
    for this_file in tqdm.tqdm_notebook(all_files):
      this_file_path = this_file.split("/")[-1].replace(".gz", "")
      try:
        if this_file[-2:] == "gz":
          extraction_dir = "temporary_files_extracted/" + this_file_path
          with tarfile.open(this_file) as tar:
            tar.extractall(path=extraction_dir)
          for extracted_file in return_dir_paths(extraction_dir):
            if extracted_file.endswith(".tex"):
              # create the per-archive folder up front instead of waiting
              # for the copy to fail first
              target_dir = output_directory + "/" + this_file_path
              os.makedirs(target_dir, exist_ok=True)
              copy(
                extracted_file,
                target_dir + "/" + extracted_file.split("/")[-1],
              )

      except (KeyboardInterrupt, SystemExit):
        print("program stopped manually")
        raise
      except Exception:
        # Best effort: a file that cannot be opened as a tar archive, or
        # whose contents cannot be copied, is skipped. Counting the skips
        # keeps them visible without flooding the output.
        skipped_files += 1

    if skipped_files:
      print('    Skipped {} file(s) that could not be unpacked or copied'.format(skipped_files))
