In [1]:
import xml.etree.ElementTree as ET
from fuzzywuzzy import fuzz
import json
import xmltodict
import urllib
import dateutil.parser

In [2]:
def if_same_person(first, second):
    ser = fuzz.token_set_ratio(first, second)    
    return ser > 75

In [3]:
def extract_year(time_string):
    return dateutil.parser.parse(time_string).year

In [4]:
def build_url(first_name, last_name):
    url = "http://export.arxiv.org/api/query?search_query=au:"
    name = last_name + "_" + first_name[0]
    url += name + "&max_results=1000"
    return url

In [5]:
def get_list(src):
    ret = []
    #try:
    for entry in src['feed'][u'entry']:

        unit = {}

        # get authors
        unit['authors'] = entry['author']

        # get authors' name strings
        if type(entry['author']) is list:
            name_list = [ p['name'] for p in entry['author'] ]
            name_list_string = "; ".join(name_list) + "; "
        else:
            name_list_string = entry['author']['name']
        # get title, w/ name strings attached in the front
        unit['title'] = name_list_string + entry['title']

        # get article id
        unit['id'] = entry['id']

        # get date
        unit['year'] = dateutil.parser.parse(entry['published']).year

        unit['collaborators'] = []
        ret.append(unit)
    #except:
    #    pass
    return ret

In [6]:
def get_papers(first_name, last_name):
    
    ret = {}
    try:
        url = build_url(first_name, last_name)
        print url
        data = urllib.urlopen(url).read()
        hold = json.loads(json.dumps(xmltodict.parse(data)))
        papers = get_list(hold)
    
    
        for paper in papers:
            pid = paper['id']
            ret[pid] = paper
    except:
        pass
    return ret

## Load Profile

In [7]:
with open("profile.json") as f:
    old_profile_base = json.load(f)

In [8]:
def make_mapping_from_id_to_person(base):
    mapping = {}
    for p in base['items']:
        mapping[p['member_id']] = p
    return mapping

#### Build paper list for each professor

In [9]:
paper_base = {}

In [None]:
for person in old_profile_base['items']:
    pid = person['member_id']
    
    paper_base[pid] = get_papers(person['name'], person['surname'])
    print person['name'], person['surname'], len(paper_base[pid])

In [None]:
paper_base

# Merge databases

In [None]:
with open("papers.json") as f:
    old_paper_base = json.load(f)
    
with open("profile.json") as f:
    old_profile_base = json.load(f)

In [None]:
print "Oscar Garcia-Prada"

In [None]:
def get_unused_min_id(base):
    ids = [p['id'] for p in base['papers']]
    return max(idx)+1

In [None]:
def make_mapping_from_id_to_person(base):
    mapping = {}
    for p in base['items']:
        mapping[p['member_id']] = p
    return mapping

In [None]:
def make_all_unique_pairs(mapping):
    ret = []
    keys = mapping.keys()
    for i in range(0, len(keys)-1):
        for j in range(i+1, len(keys)):
            ret.append((keys[i],keys[j]))
    return ret

In [None]:
temp_paper_base = {}
temp_paper_collaborator = {}

In [None]:
# find papers

id2person = make_mapping_from_id_to_person(old_profile_base)
pairs = make_all_unique_pairs(id2person)

for person in id2person.values():
    if type(person['gear_collaborators']) is not list:
        person['gear_collaborators'] = []
        
for pair in pairs:
    
    if(len(temp_paper_base)%10 == 0):
        print len(temp_paper_base)
        
    p0 = id2person[pair[0]]
    p1 = id2person[pair[1]]
    papers = get_papers(p0['surname'], p0['name'], p1['surname'], p1['name'])
    
    for p in papers:
        pid = p['id']
        temp_paper_base[pid] = p
        if pid in temp_paper_collaborator:
            temp_paper_collaborator[pid].extend([pair[0], pair[1]])
            temp_paper_collaborator[pid] = list(set(temp_paper_collaborator[pid]))
        else:
            temp_paper_collaborator[pid] = [pair[0], pair[1]]

In [None]:
# main

id2person = make_mapping_from_id_to_person(old_profile_base)
pairs = make_all_unique_pairs(id2person)

for person in id2person.values():
    if type(person['gear_collaborators']) is not list:
        person['gear_collaborators'] = []
        
for pair in pairs:
    p0 = id2person[pair[0]]
    p1 = id2person[pair[1]]
    papers = get_papers(p0['surname'], p0['name'], p1['surname'], p1['name'])
    if len(papers) > 0:
        
        p0['gear_collaborators'].append(pair[1])
        p0['gear_collaborators'] = list(set(p0['gear_collaborators']))
        p1['gear_collaborators'].append(pair[0])
        p1['gear_collaborators'] = list(set(p1['gear_collaborators']))


In [10]:
data = urllib.urlopen("http://export.arxiv.org/api/query?search_query=au:Bradlow_S&max_results=1000").read()

In [11]:
data

'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dau%3ABradlow_S%26id_list%3D%26start%3D0%26max_results%3D1000" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=au:Bradlow_S&amp;id_list=&amp;start=0&amp;max_results=1000</title>\n  <id>http://arxiv.org/api/pG4cmntU0oTaAlHarLLoqS4Jr+s</id>\n  <updated>2016-03-29T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">24</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1000</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/alg-geom/9506008v1</id>\n    <updated>1995-06-05T23:50:03Z</updated>\n    <published>1995-06-05T23:50:03Z</published>\n    <title>Hermitian-Einstein Inequa

In [12]:
hold = json.loads(json.dumps(xmltodict.parse(data)))

In [13]:
hold

{u'feed': {u'@xmlns': u'http://www.w3.org/2005/Atom',
  u'entry': [{u'arxiv:comment': {u'#text': u'AMSTeX v2.1, to appear in Int.J.Math',
     u'@xmlns:arxiv': u'http://arxiv.org/schemas/atom'},
    u'arxiv:primary_category': {u'@scheme': u'http://arxiv.org/schemas/atom',
     u'@term': u'alg-geom',
     u'@xmlns:arxiv': u'http://arxiv.org/schemas/atom'},
    u'author': {u'name': u'Steven B. Bradlow'},
    u'category': [{u'@scheme': u'http://arxiv.org/schemas/atom',
      u'@term': u'alg-geom'},
     {u'@scheme': u'http://arxiv.org/schemas/atom', u'@term': u'math.AG'}],
    u'id': u'http://arxiv.org/abs/alg-geom/9506008v1',
    u'link': [{u'@href': u'http://arxiv.org/abs/alg-geom/9506008v1',
      u'@rel': u'alternate',
      u'@type': u'text/html'},
     {u'@href': u'http://arxiv.org/pdf/alg-geom/9506008v1',
      u'@rel': u'related',
      u'@title': u'pdf',
      u'@type': u'application/pdf'}],
    u'published': u'1995-06-05T23:50:03Z',
    u'summary': u'Unstable holomorphic bundles

In [14]:
get_list(hold)

[{'authors': {u'name': u'Steven B. Bradlow'},
  'collaborators': [],
  'id': u'http://arxiv.org/abs/alg-geom/9506008v1',
  'title': u'Steven B. BradlowHermitian-Einstein Inequalities and Harder-Narasimhan Filtration',
  'year': 1995},
 {'authors': [{u'name': u'S. Bradlow'},
   {u'name': u'G. Daskalopoulos'},
   {u'name': u'R. Wentworth'}],
  'collaborators': [],
  'id': u'http://arxiv.org/abs/alg-geom/9304001v2',
  'title': u'S. Bradlow; G. Daskalopoulos; R. Wentworth; Birational Equivalences of Vortex Moduli',
  'year': 1993},
 {'authors': [{u'name': u'Steven Bradlow'}, {u'name': u'Oscar Garcia-Prada'}],
  'collaborators': [],
  'id': u'http://arxiv.org/abs/alg-geom/9602010v2',
  'title': u'Steven Bradlow; Oscar Garcia-Prada; Non-abelian monopoles and vortices',
  'year': 1996},
 {'authors': [{u'name': u'Steven Bradlow'}, {u'name': u'Graeme Wilkin'}],
  'collaborators': [],
  'id': u'http://arxiv.org/abs/1308.1460v1',
  'title': u'Steven Bradlow; Graeme Wilkin; Morse Theory, Higgs fie

In [15]:
def extract_year(time_string):
    return dateutil.parser.parse(time_string).year