# Web scraping, Stage 3

Retrieving the author profiles of the AUTH (University of Thessaloniki) staff with `scholarly`.

Import needed libraries

In [1]:
import numpy as np
import pandas as pd
import json
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException

We read the data and create a new `Full name` column.

In [2]:
# Load the staff spreadsheet and append a "Full name" column made of the
# first and last names joined by a single space.
auth_df = pd.read_excel('AUTHstaff.xlsx').assign(
    **{'Full name': lambda staff: staff['First name'].str.cat(staff['Last name'], sep=' ')}
)

We reuse the function defined in the previous notebooks, extending it to also collect each author's h-index, i10-index and citation count (`citedby`).

In [3]:
def _build_search_term(name, alternative, uni_name):
    """Return the search string to send to `scholarly.search_author`.

    A missing `alternative` falls back to "<name>, <uni_name>"; the literal
    marker '<blank>' means "search by name only"; any other value replaces
    the university name as the qualifier.
    """
    if alternative is None or pd.isnull(alternative):
        return name + ', ' + uni_name
    if alternative == '<blank>':
        return name
    return name + ', ' + alternative


def _fetch_first_author(search_term, sections):
    """Run a fresh search and return its first hit filled with `sections`.

    Raises StopIteration when the search yields no result.
    """
    return scholarly.fill(next(scholarly.search_author(search_term)), sections=sections)


def retrieve_authors(df, uni_name):
    """Look up every staff member in `df` through `scholarly`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Full name', 'Gender' and 'Role'. An optional
        'Alternative search' column overrides the default search qualifier
        per row ('<blank>' searches by name only).
    uni_name : str
        Qualifier appended to the name when no alternative search is given;
        also used (capitalized) in the progress messages.

    Returns
    -------
    tuple(list[dict], list[str])
        The retrieved author records and the full names for which the search
        returned no result.

    Raises
    ------
    MaxTriesExceededException
        If a lookup still fails after switching to a fresh free proxy.
    """
    # The same sections are requested on the first attempt and on the retry,
    # because the record built below reads the indices and the publications.
    sections = ['indices', 'coauthors', 'publications']
    has_alternative = 'Alternative search' in df

    authors = []
    failed = []

    num_authors = len(df)
    for i, name in enumerate(df['Full name']):
        if i % 10 == 0:
            print('Retrieving author no. {} out of {} for University of {}'.format(i + 1,
                                                                                   num_authors,
                                                                                   uni_name.capitalize()))

        # `i` is a position, not an index label, hence `.iloc` everywhere.
        alternative = df['Alternative search'].iloc[i] if has_alternative else None
        search_term = _build_search_term(name, alternative, uni_name)

        try:
            try:
                author = _fetch_first_author(search_term, sections)
            except MaxTriesExceededException:
                print('Query failed. Generating new proxy')
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                # Restart the search: the previous generator has already been
                # advanced (or exhausted), so re-using it would either skip
                # the first hit or report a spurious failure.
                author = _fetch_first_author(search_term, sections)
        except StopIteration:
            # The search produced no hit for this person.
            failed.append(name)
            continue

        authors.append({
            'scholar_id': author['scholar_id'],
            'name': author['name'],
            'affiliation': author['affiliation'],
            'gender': df['Gender'].iloc[i],
            'role': df['Role'].iloc[i],
            'url_picture': author['url_picture'],
            'i10index': author['i10index'],
            'i10index5y': author['i10index5y'],
            'hindex': author['hindex'],
            'hindex5y': author['hindex5y'],
            'citedby': author['citedby'],
            'citedby5y': author['citedby5y'],
            'num_publications': len(author['publications']),
            'coauthors': [coauth['scholar_id'] for coauth in author['coauthors']],
        })

    print('Number of retrieved authors:', len(authors))
    print('Number of failed authors:', len(failed))

    return authors, failed

We run the retrieval and collect both the authors that were found and the names for which the search returned no result.

In [4]:
# Query every AUTH staff member and bundle the outcome (found authors plus
# the names that could not be found) together with the university name.
uni_name = 'thessaloniki'
authors, failed = retrieve_authors(df=auth_df, uni_name=uni_name)
auth_queried_authors = dict(university=uni_name, authors=authors, failed=failed)

Retrieving author no. 1 out of 74 for University of Thessaloniki
Retrieving author no. 11 out of 74 for University of Thessaloniki
Retrieving author no. 21 out of 74 for University of Thessaloniki
Retrieving author no. 31 out of 74 for University of Thessaloniki
Retrieving author no. 41 out of 74 for University of Thessaloniki
Retrieving author no. 51 out of 74 for University of Thessaloniki
Retrieving author no. 61 out of 74 for University of Thessaloniki
Retrieving author no. 71 out of 74 for University of Thessaloniki
Number of retrieved authors: 54
Number of failed authors: 20


We save them into a JSON file.

In [5]:
# Persist the retrieval results so later stages can load them without
# querying again.
with open('auth_authors.json', mode='w') as out_file:
    json.dump(auth_queried_authors, fp=out_file)