# Web scraping, Stage 10

Re-query every author on Google Scholar to retrieve their per-year citation counts and research interests.

In [1]:
import numpy as np
import pandas as pd
import json
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException
from tqdm.notebook import tqdm

In [2]:
def _fetch_author_with_counts(scholar_id):
    """Look up one author by Scholar id and fill in the 'counts' section."""
    author = scholarly.search_author_id(scholar_id)
    return scholarly.fill(author, sections=['counts'])


def fill_in_citations_interests(item):
    """Attach research interests and per-year citations to every author of one university.

    Each author in ``item['authors']`` is looked up through ``scholarly`` by
    its ``'scholar_id'``. On success the author dict is updated in place with
    the keys ``'interests'`` and ``'cites_per_year'``. If the lookup hits
    ``MaxTriesExceededException`` a fresh free proxy is installed and the
    lookup is retried once. Authors whose lookup still fails are reported and
    left unmodified, so no stale data from another author is ever written.

    Parameters
    ----------
    item : dict
        A university record with the keys ``'university'`` (str) and
        ``'authors'`` (list of dicts carrying ``'scholar_id'`` and ``'name'``).

    Returns
    -------
    None
        The author dicts inside ``item`` are mutated in place.
    """
    progress_label = ('Retrieving citations and interests' +
                      ' for University of {}'.format(item['university'].capitalize()))

    for a in tqdm(item['authors'], total=len(item['authors']), desc=progress_label):
        try:
            try:
                author = _fetch_author_with_counts(a['scholar_id'])
            except MaxTriesExceededException:
                print('Query failed for author {} with id {}.'.format(a['name'], a['scholar_id']),
                      'and University of', item['university'] , 'Generating new proxy')
                # The new proxy is installed globally, so it also serves the
                # authors that follow.
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Retrying')
                # Retry with the same section as the first attempt; only
                # 'counts' provides 'cites_per_year'.
                author = _fetch_author_with_counts(a['scholar_id'])
            # Read both values before touching the record so a missing key
            # cannot leave the author half-updated.
            interests = author['interests']
            cites_per_year = author['cites_per_year']
        except Exception as err:
            # Best effort: report and move on, but never fall through to the
            # assignment below with an unbound or stale `author`.
            print('Query for author with id {} failed.'.format(a['scholar_id']), repr(err))
            continue

        a['interests'] = interests
        a['cites_per_year'] = cites_per_year

In [3]:
# Load the per-university author records produced by stage 8. The context
# manager closes the file handle as soon as the JSON has been parsed.
with open('../stage8/uni_authors.json', encoding='utf-8') as f:
    authors = json.load(f)

In [4]:
# Enrich every university record in place: each call performs one scholarly
# lookup per author and adds 'interests' and 'cites_per_year' to that author.
for uni in authors:
    fill_in_citations_interests(uni)

Retrieving citations and interests for University of Oulu:   0%|          | 0/87 [00:00<?, ?it/s]

Retrieving citations and interests for University of Bochum:   0%|          | 0/36 [00:00<?, ?it/s]

Retrieving citations and interests for University of Porto:   0%|          | 0/66 [00:00<?, ?it/s]

Retrieving citations and interests for University of Bordeaux:   0%|          | 0/36 [00:00<?, ?it/s]

Retrieving citations and interests for University of Lodz:   0%|          | 0/51 [00:00<?, ?it/s]

Retrieving citations and interests for University of Thessaloniki:   0%|          | 0/64 [00:00<?, ?it/s]

Verify that the `interests` and `cites_per_year` keys have been added to the author records.

In [5]:
# Spot-check the first author of the first university: the displayed keys
# should now include 'interests' and 'cites_per_year'.
authors[0]['authors'][0].keys()

dict_keys(['scholar_id', 'name', 'affiliation', 'gender', 'role', 'url_picture', 'coauthors', 'i10index', 'i10index5y', 'hindex', 'hindex5y', 'citedby', 'citedby5y', 'num_publications', 'interests', 'cites_per_year'])

Save the enriched author records to a JSON file.

In [8]:
# Serialise the whole structure first, then write it out in a single call.
serialised_authors = json.dumps(authors)
with open('uni_authors.json', 'w') as f:
    f.write(serialised_authors)