In [423]:
import requests 
import os
from datetime import datetime
from operator import itemgetter
import pandas as pd
import numpy as np
import re

In [424]:
API_URL = 'http://65.108.50.112:5000/data/arxiv'

In [425]:
# Get data from API
r = requests.get(url=API_URL)

# Convert json to dataframe
j = r.json()
df = pd.json_normalize(j, ["result"])
print(len(df))

10


In [426]:
df.head()

Unnamed: 0,abstract,authors,authors_parsed,categories,comments,doi,id,journal-ref,license,report-no,submitter,title,update_date,versions
0,In this contribution we go through the devel...,Paolo Di Vecchia,"[[Di Vecchia, Paolo, ]]",hep-th,"Latex 60 pages, 2 figures, uses svmult.cls. Co...",,704.0101,"Lect.NotesPhys.737:59-118,2008",,NORDITA-2007-13,Paolo Di Vecchia pdv,The birth of string theory,2008-11-26,"[{'created': 'Sun, 1 Apr 2007 19:30:02 GMT', '..."
1,We prove a duality theorem for certain grade...,"Marc Chardin, Steven Dale Cutkosky, Juergen He...","[[Chardin, Marc, ], [Cutkosky, Steven Dale, ],...",math.AC math.AG,18 pages,,704.0102,,,,Steven Dale Cutkosky,Duality and Tameness,2007-05-23,"[{'created': 'Sun, 1 Apr 2007 19:40:22 GMT', '..."
2,The physical consistency of the match of pie...,Gianluca Gemelli,"[[Gemelli, Gianluca, ]]",gr-qc,26 pages.,10.1007/s10773-007-9450-y,704.0103,"Int.J.Theor.Phys.46:3312-3330,2007",,,Gianluca Gemelli,Generalized regularly discontinuous solutions ...,2008-11-26,"[{'created': 'Sun, 1 Apr 2007 19:43:30 GMT', '..."
3,Given an orientable weakly self-dual manifol...,"Giovanni Gaiffi, Michele Grassi","[[Gaiffi, Giovanni, ], [Grassi, Michele, ]]",math.DG math.RT,"16 pages, no figures",,704.0104,,,,Michele Grassi,"A geometric realization of sl(6,C)",2007-05-23,"[{'created': 'Sun, 1 Apr 2007 19:52:31 GMT', '..."
4,We show that there is an hierarchy of inters...,Michael Entov and Leonid Polterovich,"[[Entov, Michael, ], [Polterovich, Leonid, ]]",math.SG,Significant corrections and changes in the par...,10.1112/S0010437X0900400X,704.0105,,http://arxiv.org/licenses/nonexclusive-distrib...,,Michael Entov,Rigid subsets of symplectic manifolds,2014-01-14,"[{'created': 'Sun, 1 Apr 2007 19:57:30 GMT', '..."


In [427]:
# Dropping the license, report-no, comments, journal-ref, submitter, categories
df.drop(['license', 'report-no', 'comments', 'journal-ref', 'submitter', 'categories'], axis=1, inplace=True)

In [428]:
# Rename the id column to arxiv_id
df = df.rename(columns={"id": "arxiv_id"})

In [429]:
# df.head()

In [430]:
# Dropping withdrawn papers using information from the ‘abstract’ column after 
# which ‘abstract’ column is dropped.
print(len(df))
p = re.compile('\s+(This|The) (paper|submission|manuscript) (has been|is being|is) withdrawn')
df = df.loc[df['abstract'].apply(p.match).isnull()]
df.drop(['abstract'], axis=1, inplace=True)
print(len(df))

10
10


In [431]:
# Dropping papers with empty authors
print(len(df))
df.dropna(subset=['authors'])
print(len(df))

10
10


In [432]:
# Removing records where both title and authors are duplicates.
# Prefer to keep the records with DOIs; if no DOIs, prefer newer update dates.
# After duplicates are removed, drop the authors and update_date columns as they are no longer needed.

print(f'Ridu enne duplikaatide droppimist: {len(df)}')
df = df.sort_values(by=['doi','update_date']).drop_duplicates(subset=(['title','authors']), keep='last')
print(f'Ridu peale duplikaatide droppimist: {len(df)}')

df.drop(['authors'], axis=1, inplace=True)
df.drop(['update_date'], axis=1, inplace=True)

Ridu enne duplikaatide droppimist: 10
Ridu peale duplikaatide droppimist: 5


In [433]:
# Getting the latest version number from versions field and dropping the original ‘versions’ column
df['latest_version'] = df['versions'].apply(itemgetter(-1))
df['latest_version_nr'] = [d.get('version') for d in df.latest_version]
df.drop(['versions'], axis=1, inplace=True)
df.drop(['latest_version'], axis=1, inplace=True)
#df.head()


In [434]:
# Separating authors and authors’ affiliations from the authors_parsed field

def affiliations(authors_parsed):
    affiliations = []

    for p in authors_parsed:
        try:
            end = p.index("")
            affiliations.append(list(filter(len, p[end:])))
        except ValueError:
            affiliations.append([])
    return affiliations

def authors_to_df(authors_parsed_list):
    first_names = []
    last_names = []
    ids = []
    
    for line in authors_parsed_list:
        first_name = [e[1] for e in line]
        last_name = [e[0] for e in line]
        id = [e[0] + "_" + e[1] + "_" for e in line]
        id = [i.replace(' ' ,'_') for i in id]
        first_names.append(first_name)
        last_names.append(last_name)
        ids.append(id)

    df = pd.DataFrame()
    df['author_id'] = ids
    df['first_name'] = first_names
    df['last_name'] = last_names

    return df

In [435]:
# Separating authors and authors’ affiliations from the authors_parsed field into a new dataframe
authors = authors_to_df(df['authors_parsed'])
authors['affiliations'] = df['authors_parsed'].map(affiliations).values

In [436]:
# Add author_ids into the original dataframe and drop 'authors_parsed' column
df['author_id'] = authors['author_id'].values
df.drop(['authors_parsed'], axis=1, inplace=True)

In [437]:
# Explode the authors table so that there is one row per author
authors = authors.apply(pd.Series.explode)

In [438]:
authors

Unnamed: 0,author_id,first_name,last_name,affiliations
0,Gemelli_Gianluca_,Gianluca,Gemelli,[]
1,Entov_Michael_,Michael,Entov,[]
1,Polterovich_Leonid_,Leonid,Polterovich,[]
2,Chardin_Marc_,Marc,Chardin,[]
2,Cutkosky_Steven_Dale_,Steven Dale,Cutkosky,[]
2,Herzog_Juergen_,Juergen,Herzog,[]
2,Srinivasan_Hema_,Hema,Srinivasan,[]
3,Gaiffi_Giovanni_,Giovanni,Gaiffi,[]
3,Grassi_Michele_,Michele,Grassi,[]
4,Di_Vecchia_Paolo_,Paolo,Di Vecchia,[]


In [439]:
df

Unnamed: 0,doi,arxiv_id,title,latest_version_nr,author_id
7,10.1007/s10773-007-9450-y,704.0103,Generalized regularly discontinuous solutions ...,v1,[Gemelli_Gianluca_]
9,10.1112/S0010437X0900400X,704.0105,Rigid subsets of symplectic manifolds,v2,"[Entov_Michael_, Polterovich_Leonid_]"
6,,704.0102,Duality and Tameness,v1,"[Chardin_Marc_, Cutkosky_Steven_Dale_, Herzog_..."
8,,704.0104,"A geometric realization of sl(6,C)",v1,"[Gaiffi_Giovanni_, Grassi_Michele_]"
5,,704.0101,The birth of string theory,v1,[Di_Vecchia_Paolo_]
