# ARXIV Scholar

In [90]:
import pandas as pd
import csv
import json
import ast
import requests

In [85]:
#Clean dataset

def contains_number(string):
    """Tell whether `string` holds at least one digit character.

    Returns True as soon as a character for which `str.isdigit()` is true is
    found, and False when there is none (including for an empty string).
    """
    for char in string:
        if char.isdigit():
            return True
    return False

# Load the raw dump. The encoding is passed explicitly because JSON text is
# UTF-8, while open() would otherwise use the platform's locale encoding.
with open('./data/arxivData.json', 'r', encoding='utf-8') as dataset:
    papers = json.load(dataset)

# Normalise every paper in place: derive 'authors', 'link', 'date' and 'tags',
# then drop the raw fields they were built from.
for paper in papers:
    # authors: 'author' is a string holding a Python-literal list of dicts;
    # keep only each entry's 'name'.
    paper['authors'] = [
        author['name'] for author in ast.literal_eval(paper['author'])
    ]

    # link: 'link' is a string holding a list of dicts; replace it with the
    # href of the entry whose rel is 'alternate'. If there is no such entry the
    # original string is left untouched; if there are several, the last wins.
    for link in ast.literal_eval(paper['link']):
        if link['rel'] == 'alternate':
            paper['link'] = link['href']

    # date: year-month-day joined as-is (components are not zero-padded).
    paper['date'] = '-'.join(
        str(paper[part]) for part in ('year', 'month', 'day')
    )

    # tags: keep the 'term' of every tag, skipping terms that contain a digit.
    paper['tags'] = [
        tag['term']
        for tag in ast.literal_eval(paper['tag'])
        if not contains_number(tag['term'])
    ]

    # Remove the raw fields that are no longer needed.
    for key in ('author', 'id', 'year', 'month', 'day', 'tag'):
        paper.pop(key)



In [86]:
def getTitle(tag, timeout=10):
    """Look up the human-readable title of an arXiv listing page.

    Downloads ``https://arxiv.org/list/<tag>/recent`` and extracts the text of
    the first ``<h1>`` element that appears after the "dlpage" marker in the
    returned HTML.

    Args:
        tag: Identifier inserted into the listing URL (e.g. 'cs.AI').
        timeout: Seconds to wait for the server; without a timeout
            ``requests.get`` may block indefinitely.

    Returns:
        The heading text with surrounding whitespace stripped. When the
        response is not successful, or the expected markers are missing from
        the page, ``tag`` itself is returned unchanged so that callers never
        receive an arbitrary slice of HTML.

    Raises:
        Whatever ``requests.get`` raises for connection failures or timeouts.
    """
    URL = 'https://arxiv.org/list/'
    url = URL + tag + '/recent'

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Unknown tag, rate limiting, server error: nothing to scrape.
        return tag
    text = response.text

    # str.find returns -1 when the marker is absent; each step is checked so a
    # missing marker cannot silently turn into a bogus slice.
    dlPageIndex = text.find("dlpage")
    if dlPageIndex == -1:
        return tag

    h1Index = text.find("<h1>", dlPageIndex + 1)
    if h1Index == -1:
        return tag

    h1FinalIndex = text.find("</h1>", h1Index)
    if h1FinalIndex == -1:
        return tag

    # strip() removes whitespace around the heading without dropping a real
    # character when the heading has no trailing newline.
    return text[h1Index + len("<h1>"):h1FinalIndex].strip()

# Cache mapping each raw tag to the title fetched for it, so that getTitle is
# called at most once per distinct tag.
tagsDict = {}

# Count of getTitle calls actually performed (i.e. cache misses).
nReqs = 0

for paper in papers:
    urlTags = paper['tags']
    newTags = []

    for tag in urlTags:
        try:
            title = tagsDict[tag]
        except KeyError:
            # First time this tag is seen: fetch its title and remember it.
            title = tagsDict[tag] = getTitle(tag)
            nReqs += 1
        newTags.append(title)

    # Replace the paper's raw tags with their titles.
    paper['tags'] = newTags

print("Number of requests made: ", nReqs)

Number of requests made:  161


In [87]:
# Columns written to the CSV, in output order.
field_names = ['link', 'summary', 'title', 'authors', 'date', 'tags']

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows appear on platforms that translate '\n' to '\r\n'.
# encoding='utf-8' makes the file independent of the platform's locale
# encoding and matches the default encoding pandas.read_csv uses.
with open('dateset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    # List-valued fields ('authors', 'tags') are written as their str() form.
    writer.writerows(papers)

In [93]:
# Reload the exported CSV and inspect its structure and completeness.
df = pd.read_csv('dateset.csv')

# Column dtypes and non-null counts (printed by info itself).
df.info()

# Number of missing values per column; shown as the cell's output.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41000 entries, 0 to 40999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   link     41000 non-null  object
 1   summary  41000 non-null  object
 2   title    41000 non-null  object
 3   authors  41000 non-null  object
 4   date     41000 non-null  object
 5   tags     41000 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


link       0
summary    0
title      0
authors    0
date       0
tags       0
dtype: int64

In [92]:
# Display the DataFrame; the notebook renders a truncated preview of its rows.
df

Unnamed: 0,link,summary,title,authors,date,tags
0,http://arxiv.org/abs/1802.00209v1,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Ques...,"['Ahmed Osman', 'Wojciech Samek']",2018-2-1,"['Artificial Intelligence', 'Computation and L..."
1,http://arxiv.org/abs/1603.03827v1,Recent approaches based on artificial neural n...,Sequential Short-Text Classification with Recu...,"['Ji Young Lee', 'Franck Dernoncourt']",2016-3-12,"['Computation and Language', 'Artificial Intel..."
2,http://arxiv.org/abs/1606.00776v2,We introduce the multiresolution recurrent neu...,Multiresolution Recurrent Neural Networks: An ...,"['Iulian Vlad Serban', 'Tim Klinger', 'Gerald ...",2016-6-2,"['Computation and Language', 'Artificial Intel..."
3,http://arxiv.org/abs/1705.08142v2,Multi-task learning is motivated by the observ...,Learning what to share between loosely related...,"['Sebastian Ruder', 'Joachim Bingel', 'Isabell...",2017-5-23,"['Machine Learning', 'Artificial Intelligence'..."
4,http://arxiv.org/abs/1709.02349v2,We present MILABOT: a deep reinforcement learn...,A Deep Reinforcement Learning Chatbot,"['Iulian V. Serban', 'Chinnadhurai Sankar', 'M...",2017-9-7,"['Computation and Language', 'Artificial Intel..."
...,...,...,...,...,...,...
40995,http://arxiv.org/abs/1404.4702v2,We study the complexity of learning and approx...,Nearly Tight Bounds on $\ell_1$ Approximation ...,"['Vitaly Feldman', 'Pravesh Kothari', 'Jan Von...",2014-4-18,"['Machine Learning', 'Data Structures and Algo..."
40996,http://arxiv.org/abs/1404.5421v1,We consider the problem of multiple users targ...,Concurrent bandits and cognitive radio networks,"['Orly Avner', 'Shie Mannor']",2014-4-22,"['Machine Learning', 'Multiagent Systems']"
40997,http://arxiv.org/abs/1404.5899v1,"In this paper, we compare and analyze clusteri...",A Comparison of Clustering and Missing Data Me...,"['Ran Zhao', 'Deanna Needell', 'Christopher Jo...",2014-4-22,"['Numerical Analysis', 'Machine Learning']"
40998,http://arxiv.org/abs/1404.6369v1,Cylindrical algebraic decomposition(CAD) is a ...,Applying machine learning to the problem of ch...,"['Zongyan Huang', 'Matthew England', 'David Wi...",2014-4-25,"['Symbolic Computation', 'Machine Learning']"
