# Web scraping, Stage 7

Retrieving coauthor data

In [1]:
import numpy as np
import pandas as pd
import json
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException
from tqdm.notebook import tqdm

In [2]:
# Profile sections requested from scholarly for every coauthor.
_AUTHOR_SECTIONS = ['indices', 'coauthors', 'publications']


def _fetch_author(scholar_id):
    """Look up one author by Scholar id and fill the sections used below."""
    author = scholarly.search_author_id(scholar_id)
    return scholarly.fill(author, sections=_AUTHOR_SECTIONS)


def _summarize_author(author):
    """Flatten a filled author record into the dict that gets stored.

    'url_picture', 'citedby' and 'citedby5y' are optional and fall back to
    '' / 0 / 0; every other key is required, and a missing one raises
    KeyError.
    """
    return {
        'scholar_id': author['scholar_id'],
        'name': author['name'],
        'affiliation': author['affiliation'],
        'url_picture': author.get('url_picture', ''),
        'i10index': author['i10index'],
        'i10index5y': author['i10index5y'],
        'hindex': author['hindex'],
        'hindex5y': author['hindex5y'],
        'citedby': author.get('citedby', 0),
        'citedby5y': author.get('citedby5y', 0),
        'num_publications': len(author['publications']),
        'coauthors': [coauth['scholar_id'] for coauth in author['coauthors']],
    }


def retrieve_coauthors(coauthors):
    """Fetch a profile summary for each coauthor id.

    Each id is queried once. If that query raises
    MaxTriesExceededException, a new free proxy is installed and the query
    is retried a single time. Any other error (or a failing retry) puts the
    id on the failed list and the loop moves on to the next id.

    Parameters
    ----------
    coauthors : sized iterable of str
        Scholar ids to look up; must support ``len()``.

    Returns
    -------
    successful : list of dict
        One summary dict per retrieved author (see ``_summarize_author``).
    failed : list
        The ids whose lookup failed.

    Notes
    -----
    Only ``Exception`` subclasses are treated as a failed lookup.
    ``KeyboardInterrupt`` and ``SystemExit`` propagate, so interrupting the
    kernel actually stops the run instead of marking one id as failed.
    """
    successful = []
    failed = []

    print('Retrieving coauthor information...')
    for ca in tqdm(coauthors, total=len(coauthors)):
        try:
            try:
                author = _fetch_author(ca)
            except MaxTriesExceededException:
                print('Query failed for coauthor with id {}.'.format(ca),
                      'Generating new proxy.')
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Retrying')
                author = _fetch_author(ca)

            successful.append(_summarize_author(author))

        except Exception as exc:
            # Best effort: record the id and keep going, but say why it failed
            # so the failed list can be diagnosed afterwards.
            print('Query for coauthor with id {} failed.'.format(ca),
                  'Adding coauthor to the failed list.',
                  'Reason: {!r}'.format(exc))
            failed.append(ca)

    print('Number of retrieved authors:', len(successful))
    print('Number of failed authors:', len(failed))

    return successful, failed

In [3]:
# Read the per-university author data written by the previous stage.
with open('../stage6/uni_authors.json') as f:
    uni_authors = json.loads(f.read())

In [4]:
# Gather the coauthor ids of every author at every university. Accumulating
# straight into a set drops ids that are shared between several authors.
coauthors = set()
for university in uni_authors:
    for author in university['authors']:
        coauthors.update(author['coauthors'])

In [5]:
# Look up every unique coauthor id through scholarly. `successful` receives the
# summary dicts, `failed` the ids whose lookup raised even after the proxy retry.
successful, failed = retrieve_coauthors(coauthors)

Retrieving coauthor information...


  0%|          | 0/2514 [00:00<?, ?it/s]

Query failed for coauthor with id DEgZaUoAAAAJ. Generating new proxy.
Retrying
Query for coauthor with id DEgZaUoAAAAJ failed. Adding coauthor to the failed list.
Query failed for coauthor with id vKIlvHIAAAAJ. Generating new proxy.
Retrying
Query failed for coauthor with id LM0e_fcAAAAJ. Generating new proxy.
Retrying
Query failed for coauthor with id v-WRbB4AAAAJ. Generating new proxy.
Retrying
Query failed for coauthor with id PYu6uf0AAAAJ. Generating new proxy.
Query for coauthor with id PYu6uf0AAAAJ failed. Adding coauthor to the failed list.
Query failed for coauthor with id vz1IXCIAAAAJ. Generating new proxy.
Retrying
Query failed for coauthor with id c9Dl7qwAAAAJ. Generating new proxy.
Retrying
Number of retrieved authors: 2512
Number of failed authors: 2


In [6]:
# Bundle both result lists so the failed ids are saved next to the data.
coauthors_json = dict(successful=successful, failed=failed)

In [7]:
# Persist the results of this stage in the current directory.
with open('coauthors.json', 'w') as f:
    f.write(json.dumps(coauthors_json))