# Web scraping, Stage 4

Retrieve the h-index, i10-index, and citation count (`citedby`) for each author.

Import the required libraries.

In [1]:
import numpy as np
import pandas as pd
import json
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException
from tqdm.notebook import tqdm

## Retrieve index information

In [2]:
# Load the per-university author lists from the stage2 directory.
# The encoding is given explicitly because JSON text is UTF-8, whereas the
# default used by open() depends on the platform locale.
with open('../stage2/uni_authors.json', encoding='utf-8') as f:
    authors_json = json.load(f)

In [3]:
def _fetch_author_profile(scholar_id):
    """Look up one author by scholar id and fill the sections used for metrics."""
    author = scholarly.search_author_id(scholar_id)
    return scholarly.fill(author, sections=['indices', 'coauthors', 'publications'])


def fill_in_author_metrics(item):
    """Add citation metrics to every author of one university, in place.

    Each author dict in ``item['authors']`` gains the keys ``i10index``,
    ``i10index5y``, ``hindex``, ``hindex5y``, ``citedby``, ``citedby5y`` and
    ``num_publications`` (the length of the filled publication list).

    Authors that already carry all of these keys are skipped, so calling the
    function again after a failure resumes where it stopped instead of
    downloading every profile a second time.
    NOTE(review): this also means metrics already present in the input are
    not refreshed -- confirm the input never ships with stale values.

    Args:
        item: Dict with a ``'university'`` string and an ``'authors'`` list;
            every author is a dict with at least ``'name'`` and
            ``'scholar_id'``.

    Raises:
        Exception: Whatever the retry attempt raises (including a second
            ``MaxTriesExceededException``) propagates to the caller.
        KeyError: If a fetched profile lacks one of the metric fields.
    """
    metric_keys = ('i10index', 'i10index5y', 'hindex', 'hindex5y',
                   'citedby', 'citedby5y')
    filled_keys = metric_keys + ('num_publications',)

    progress = tqdm(
        item['authors'],
        total=len(item['authors']),
        desc='Retrieving metrics for University of {}'.format(item['university'].capitalize()))
    for a in progress:
        # Already processed in an earlier (partially failed) call.
        if all(key in a for key in filled_keys):
            continue

        try:
            author = _fetch_author_profile(a['scholar_id'])
        except MaxTriesExceededException:
            print('Query failed for author {} with id {}.'.format(a['name'], a['scholar_id']),
                  'Generating new proxy')
            # Switch to a fresh free proxy and try this author once more.
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)
            print('Retrying')
            author = _fetch_author_profile(a['scholar_id'])

        # Read every value before writing any, so a missing field cannot
        # leave the author record half-updated.
        metrics = {key: author[key] for key in metric_keys}
        metrics['num_publications'] = len(author['publications'])
        a.update(metrics)

In [4]:
# Process one university at a time, retrying with a fresh proxy until the
# whole university has been filled in.
for item in tqdm(authors_json):
    while True:
        try:
            fill_in_author_metrics(item)
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still stop this otherwise endless
            # retry loop. The cause is printed so a persistent error (as
            # opposed to a blocked proxy) is visible instead of silently
            # looping.
            print('Retrying for university of', item['university'])
            print('Reason: {!r}'.format(exc))
            print('Generating new proxy')
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)
        else:
            break

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving metrics for University of Oulu:   0%|          | 0/87 [00:00<?, ?it/s]

Retrieving metrics for University of Bochum:   0%|          | 0/36 [00:00<?, ?it/s]

Retrieving metrics for University of Porto:   0%|          | 0/66 [00:00<?, ?it/s]

Retrieving metrics for University of Bordeaux:   0%|          | 0/36 [00:00<?, ?it/s]

Retrieving metrics for University of Lodz:   0%|          | 0/51 [00:00<?, ?it/s]

In [5]:
# Persist the author records, now including the retrieved metrics, to
# uni_authors.json in the current working directory.
serialized_authors = json.dumps(authors_json)
with open('uni_authors.json', 'w') as out_file:
    out_file.write(serialized_authors)