In [1]:
import json
import time
import urllib
import xml.etree.ElementTree as ET

import dateutil.parser
import xmltodict
from fuzzywuzzy import fuzz

In [2]:
def if_same_person(first, second, threshold=75):
    """Decide whether two name strings plausibly refer to the same person.

    The names are scored with ``fuzz.token_set_ratio`` and count as a match
    when the score is strictly greater than ``threshold``.

    Args:
        first: First name string to compare.
        second: Second name string to compare.
        threshold: Score that must be exceeded for a match. Defaults to 75,
            the cut-off that used to be hard-coded.

    Returns:
        True when the similarity score exceeds ``threshold``, else False.
    """
    similarity = fuzz.token_set_ratio(first, second)
    return similarity > threshold

In [3]:
def extract_year(time_string):
    """Return the year of the date/time described by ``time_string``.

    The string is parsed with ``dateutil.parser.parse`` and the ``year``
    attribute of the parsed result is returned.
    """
    parsed_moment = dateutil.parser.parse(time_string)
    return parsed_moment.year

In [4]:
def build_url(first_name, last_name, max_results=1000):
    """Build the arXiv API query URL for an author search.

    The author is searched as ``<last_name>_<first initial>``; for example
    ``build_url("Ian", "Agol")`` queries ``au:Agol_I``.

    Args:
        first_name: Given name; only its first non-blank character is used.
            When it is empty or blank the search falls back to the surname
            alone instead of raising ``IndexError``.
        last_name: Surname; surrounding whitespace is removed.
        max_results: Value of the ``max_results`` query parameter. Defaults
            to 1000, the value that used to be hard-coded.

    Returns:
        The query URL as a string.
    """
    base = "http://export.arxiv.org/api/query?search_query=au:"
    author = last_name.strip()
    # Slicing (rather than indexing) yields "" for an empty first name.
    initial = first_name.strip()[:1]
    if initial:
        author += "_" + initial
    return base + author + "&max_results=" + str(max_results)

In [5]:
def get_list(src):
    """Convert a parsed arXiv Atom feed into a list of paper records.

    Args:
        src: The feed as nested mappings, with the entries stored under
            ``src['feed']['entry']``. That value may be a list of entries,
            a single entry (a bare mapping), or missing altogether.

    Returns:
        A list with one dict per entry, holding:
            ``authors``       -- the entry's ``author`` value, unchanged (a
                                 mapping for one author, a list for several),
            ``title``         -- every author name followed by ``"; "``, then
                                 the entry title,
            ``id``            -- the entry's ``id``,
            ``year``          -- the year parsed from ``published``,
            ``collaborators`` -- a fresh empty list.
        A feed without entries yields an empty list.
    """
    entries = src['feed'].get('entry') or []
    # A feed holding exactly one entry arrives as a bare mapping, not a list.
    if not isinstance(entries, list):
        entries = [entries]

    papers = []
    for entry in entries:
        authors = entry['author']
        # The same list-or-single ambiguity applies to the authors; treating
        # both shapes alike also gives single-author titles the "; " separator.
        author_list = authors if isinstance(authors, list) else [authors]
        name_prefix = "".join(person['name'] + "; " for person in author_list)

        papers.append({
            'authors': authors,
            'title': name_prefix + entry['title'],
            'id': entry['id'],
            'year': dateutil.parser.parse(entry['published']).year,
            'collaborators': [],
        })
    return papers

In [6]:
def get_papers(first_name, last_name):
    """Fetch one author's arXiv papers, keyed by paper id.

    Builds the query URL with ``build_url``, prints it, downloads the feed,
    parses it with ``xmltodict`` and turns it into records via ``get_list``.

    Args:
        first_name: Author's given name.
        last_name: Author's surname.

    Returns:
        A dict mapping each paper's ``id`` to its record. The lookup is
        best-effort: if anything goes wrong a message is printed and the
        papers gathered so far (usually none) are returned.
    """
    papers_by_id = {}
    try:
        url = build_url(first_name, last_name)
        print(url)

        response = urllib.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

        # Round-trip through JSON; presumably this normalises the parsed
        # feed to plain dict/list values before it is handed to get_list.
        feed = json.loads(json.dumps(xmltodict.parse(data)))

        for paper in get_list(feed):
            papers_by_id[paper['id']] = paper
    except Exception as err:
        # `Exception` rather than a bare except, so that interrupting the
        # kernel still stops a long crawl; the failure is reported, not hidden.
        print("get_papers failed for %s %s: %r" % (first_name, last_name, err))
    return papers_by_id

## Load Profile

In [7]:
# Read the member profiles from profile.json (in the working directory) into
# old_profile_base; later cells iterate over its 'items' list.
with open("profile.json") as f:
    old_profile_base = json.load(f)

In [8]:
def make_mapping_from_id_to_person(base):
    """Index the profile records in ``base['items']`` by ``member_id``.

    Returns a dict from ``member_id`` to the record itself (not a copy). If
    two records share a ``member_id`` the later one wins.
    """
    return dict((person['member_id'], person) for person in base['items'])

#### Build paper list for each professor

In [9]:
# Papers gathered per member: member_id -> {paper id -> paper record}.
paper_base = dict()

In [None]:
# Query arXiv once per profile entry and store the result under the member id.
# arXiv asks API clients to pause about three seconds between consecutive
# requests, hence the sleep after each lookup.
for person in old_profile_base['items']:
    pid = person['member_id']
    paper_base[pid] = get_papers(person['name'], person['surname'])
    # Single-argument print gives the same "name surname count" line on
    # Python 2 and Python 3.
    print("%s %s %d" % (person['name'], person['surname'], len(paper_base[pid])))
    time.sleep(3)

http://export.arxiv.org/api/query?search_query=au:Agol_I&max_results=1000
Ian Agol 30
http://export.arxiv.org/api/query?search_query=au:Andersen_J&max_results=1000
Jørgen Andersen 304
http://export.arxiv.org/api/query?search_query=au:Athreya_J&max_results=1000
Jayadev Athreya 28
http://export.arxiv.org/api/query?search_query=au:Baraglia_D&max_results=1000
David Baraglia 18
http://export.arxiv.org/api/query?search_query=au:Basmajian_A&max_results=1000
Ara Basmajian 5
http://export.arxiv.org/api/query?search_query=au:Belolipetsky_M&max_results=1000
Mikhail Belolipetsky 26
http://export.arxiv.org/api/query?search_query=au:Benoist_Y&max_results=1000
Yves Benoist 5
http://export.arxiv.org/api/query?search_query=au:Biquard_O&max_results=1000
Olivier Biquard 22
http://export.arxiv.org/api/query?search_query=au:Boden_H&max_results=1000
Hans Boden 24
http://export.arxiv.org/api/query?search_query=au:Boileau_M&max_results=1000
Michel Boileau 26
http://export.arxiv.org/api/query?search_query=au:B

In [None]:
# Show everything collected so far (member_id -> papers); the output can be
# very large.
paper_base

# Merge databases

In [None]:
# Load the existing paper database from papers.json.
with open("papers.json") as f:
    old_paper_base = json.load(f)
    
# Load the member profiles again; this rebinds old_profile_base.
with open("profile.json") as f:
    old_profile_base = json.load(f)

In [None]:
# Scratch output of one member's name.
print("Oscar Garcia-Prada")

In [None]:
def get_unused_min_id(base):
    """Return an id that no paper in ``base['papers']`` uses yet.

    The result is one more than the largest existing ``id``.

    Args:
        base: Mapping whose ``'papers'`` value is an iterable of records,
            each carrying a numeric ``'id'``.

    Returns:
        ``max(id) + 1``, or 0 when there are no papers.
        NOTE(review): 0 as the first id is an assumption -- confirm against
        how ids are numbered in papers.json.
    """
    ids = [paper['id'] for paper in base['papers']]
    if not ids:
        return 0
    return max(ids) + 1

In [None]:
def make_mapping_from_id_to_person(base):
    """Build a ``member_id`` -> profile record lookup from ``base['items']``.

    This cell defines the same function as an earlier cell of the notebook;
    whichever cell ran last is the definition in effect. Records are stored
    by reference, and a repeated ``member_id`` keeps the last record seen.
    """
    return {record['member_id']: record for record in base['items']}

In [None]:
def make_all_unique_pairs(mapping):
    """List every unordered pair of distinct keys of ``mapping``.

    Each pair appears once, as a tuple ``(earlier_key, later_key)`` following
    the mapping's iteration order. Fewer than two keys give an empty list.

    Args:
        mapping: Any mapping (only its keys are used).

    Returns:
        A list of 2-tuples of keys.
    """
    # Materialise the keys: on Python 3 `dict.keys()` is a view and cannot be
    # indexed or sliced.
    keys = list(mapping)
    return [
        (left, right)
        for position, left in enumerate(keys)
        for right in keys[position + 1:]
    ]

In [None]:
# Filled in by the "find papers" cell:
#   temp_paper_base          paper id -> paper record
#   temp_paper_collaborator  paper id -> list of member ids
temp_paper_base = dict()
temp_paper_collaborator = dict()

In [None]:
# find papers
#
# Collect every paper shared by two profile members, together with the ids of
# the members involved.  get_papers() takes a single author (first name,
# surname) and returns a dict keyed by paper id, so it cannot be called with
# two people at once; instead the per-member results already stored in
# paper_base are intersected pair by pair (no further network requests).
# NOTE(review): confirm that "appears in both members' arXiv results" is the
# intended definition of a joint paper.

id2person = make_mapping_from_id_to_person(old_profile_base)
pairs = make_all_unique_pairs(id2person)

# Make sure every member carries a list of collaborators.
for person in id2person.values():
    if not isinstance(person.get('gear_collaborators'), list):
        person['gear_collaborators'] = []

for first_id, second_id in pairs:
    first_papers = paper_base.get(first_id, {})
    second_papers = paper_base.get(second_id, {})

    for pid in first_papers:
        if pid not in second_papers:
            continue
        temp_paper_base[pid] = first_papers[pid]
        # Keep each member id at most once per paper.
        members = temp_paper_collaborator.setdefault(pid, [])
        for member_id in (first_id, second_id):
            if member_id not in members:
                members.append(member_id)

# Number of distinct shared papers found.
print(len(temp_paper_base))

In [None]:
# main
#
# For every pair of members with at least one shared paper, record each
# member in the other's 'gear_collaborators' list.  get_papers() accepts one
# author only, so shared papers are detected by intersecting the per-member
# results already stored in paper_base.
# NOTE(review): confirm that "appears in both members' arXiv results" is the
# intended definition of a collaboration.

id2person = make_mapping_from_id_to_person(old_profile_base)
pairs = make_all_unique_pairs(id2person)

# Make sure every member carries a list of collaborators.
for person in id2person.values():
    if not isinstance(person.get('gear_collaborators'), list):
        person['gear_collaborators'] = []

for first_id, second_id in pairs:
    first_papers = paper_base.get(first_id, {})
    second_papers = paper_base.get(second_id, {})
    if not any(pid in second_papers for pid in first_papers):
        continue

    p0 = id2person[first_id]
    p1 = id2person[second_id]
    # Append only when missing, so re-running the cell adds no duplicates and
    # the existing order of each list is kept.
    if second_id not in p0['gear_collaborators']:
        p0['gear_collaborators'].append(second_id)
    if first_id not in p1['gear_collaborators']:
        p1['gear_collaborators'].append(first_id)


In [None]:
# Scratch: download the raw response of one fixed author query (Bradlow_S)
# for inspection in the following cells.
data = urllib.urlopen("http://export.arxiv.org/api/query?search_query=au:Bradlow_S&max_results=1000").read()

In [None]:
# Show the raw response body.
data

In [None]:
# Parse the XML with xmltodict, then round-trip the result through JSON
# (presumably to end up with plain dict/list values -- TODO confirm).
hold = json.loads(json.dumps(xmltodict.parse(data)))

In [None]:
# Show the parsed feed.
hold

In [None]:
# Run the parsed feed through get_list to inspect the extracted records.
get_list(hold)

In [None]:
def extract_year(time_string):
    """Parse ``time_string`` with ``dateutil.parser.parse`` and return its year.

    This cell repeats a definition that already appears earlier in the
    notebook; whichever cell ran last is the one in effect.
    """
    moment = dateutil.parser.parse(time_string)
    year = moment.year
    return year