# Web scraping, Stage 5

Re-querying the AUTH authors whose earlier Google Scholar lookup failed

Import needed libraries

In [1]:
import numpy as np
import pandas as pd
import json
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException
from tqdm.notebook import tqdm

In [2]:
def _build_search_term(df, pos, name, uni_name):
    """Return the author-search string for the row at position ``pos``.

    If ``df`` has an 'Alternative search' column with a non-null value for this
    row, that value replaces ``uni_name`` as the search qualifier; the special
    value '<blank>' means "search by name only". Otherwise the qualifier is
    ``uni_name``.
    """
    if 'Alternative search' in df:
        # Positional access: ``pos`` comes from enumerate(), not from the index.
        alternative = df['Alternative search'].iloc[pos]
        if not pd.isnull(alternative):
            if alternative == '<blank>':
                return name
            return name + ', ' + alternative
    return name + ', ' + uni_name


def _fetch_author(search_term, sections):
    """Return the first search hit for ``search_term`` filled with ``sections``.

    Returns ``None`` when the search yields no result. If the lookup raises
    ``MaxTriesExceededException``, a free proxy is installed and the whole
    lookup (search + fill) is repeated once; a second failure propagates.
    """
    def first_filled_hit():
        hit = next(scholarly.search_author(search_term), None)
        if hit is None:
            return None
        return scholarly.fill(hit, sections=sections)

    try:
        return first_filled_hit()
    except MaxTriesExceededException:
        print('Query failed. Generating new proxy')
        pg = ProxyGenerator()
        pg.FreeProxies()
        scholarly.use_proxy(pg)
        # Start a fresh search: the previous iterator has already been advanced
        # (or terminated by the exception), so reusing it would skip the first
        # hit. Request the same sections so every field read by the caller
        # is present.
        return first_filled_hit()


def retrieve_authors(df, uni_name):
    """Look up every author in ``df`` and collect their profile data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Full name', 'Gender' and 'Role'. An optional
        'Alternative search' column overrides the search qualifier per row
        (see ``_build_search_term``).
    uni_name : str
        Default search qualifier appended to each name; also shown
        (capitalized) in the progress-bar description.

    Returns
    -------
    (authors, failed) : tuple of list
        ``authors`` is a list of dicts, one per author found, holding the
        scholar id, name, affiliation, gender, role, picture URL, citation
        indices, publication count and coauthor scholar ids. ``failed`` is the
        list of full names whose search returned no result.
    """
    sections = ['indices', 'coauthors', 'publications']
    authors = []
    failed = []

    num_authors = len(df)
    for i, name in tqdm(enumerate(df['Full name']),
                        total=num_authors,
                        desc='Retrieving authors for University of {}'.format(uni_name.capitalize())):

        author = _fetch_author(_build_search_term(df, i, name, uni_name), sections)
        if author is None:
            failed.append(name)
            continue

        authors.append({
            'scholar_id': author['scholar_id'],
            'name': author['name'],
            'affiliation': author['affiliation'],
            # Gender and role come from the staff sheet, not from the profile.
            'gender': df['Gender'].iloc[i],
            'role': df['Role'].iloc[i],
            'url_picture': author['url_picture'],
            'i10index': author['i10index'],
            'i10index5y': author['i10index5y'],
            'hindex': author['hindex'],
            'hindex5y': author['hindex5y'],
            'citedby': author['citedby'],
            'citedby5y': author['citedby5y'],
            'num_publications': len(author['publications']),
            'coauthors': [coauth['scholar_id'] for coauth in author['coauthors']],
        })

    print('Number of retrieved authors:', len(authors))
    print('Number of failed authors:', len(failed))

    return authors, failed

## Requery failed authors

In [3]:
# Read the comma-separated row labels of the authors to re-query.
# Indices from every line are accumulated, and empty tokens (blank lines,
# trailing commas) are skipped so they cannot reach int().
idx_to_search = []
with open('failed_for_search.txt') as f:
    for line in f:
        idx_to_search.extend(int(token) for token in line.split(',') if token.strip())

In [4]:
# Select the staff rows whose labels are in idx_to_search. reset_index() moves
# the original row labels into an 'index' column and renumbers the rows from 0.
df = (
    pd.read_excel('AUTHstaff.xlsx')
    .loc[idx_to_search]
    .reset_index()
)
# 'Full name' is "<First name> <Last name>".
df = df.assign(**{'Full name': df['First name'].str.cat(df['Last name'], sep=' ')})

In [5]:
# Display the staff rows selected for re-querying.
df

Unnamed: 0,index,First name,Last name,Email,Gender,Role,Faculty,Alternative search,Full name
0,10,Nikos,Laskaris,laskaris@csd.auth.gr,M,Associate Professor,CS,auth,Nikos Laskaris
1,13,Petros,Nicopolitidis,petros@csd.auth.gr,M,Associate Professor,CS,,Petros Nicopolitidis
2,17,I.,Pitas,pitas@csd.auth.gr,M,Professor,CS,,I. Pitas
3,20,Ioannis,Stamelos,stamelos@csd.auth.gr,M,Professor,CS,aristotle,Ioannis Stamelos
4,25,Athena,Vakali,avakali@csd.auth.gr,F,Professor,CS,aristotle,Athena Vakali
5,35,Athanasios,Kehagias,kehagiat@auth.gr,M,Associate Professor,ECE,aristotle,Athanasios Kehagias
6,52,Georgios,Sergiadis,sergiadi@auth.gr,M,Professor,ECE,aristotle,Georgios Sergiadis
7,54,Dimitrios,Chrissoulidis,dpchriss@auth.gr,M,Professor,ECE,auth,Dimitrios Chrissoulidis
8,62,Charis,Demoulias,chdimoul@auth.gr,M,Professor,ECE,auth,Charis Demoulias
9,69,Vasilis,Chatziathanasiou,hatziath@auth.gr,M,Associate Professor,ECE,auth,Vasilis Chatziathanasiou


In [6]:
# Default search qualifier appended to each author name.
uni_name = 'thessaloniki'

# Re-run the lookup for the selected rows and bundle the outcome in a single
# dict keyed by 'university', 'authors' and 'failed'.
authors, failed = retrieve_authors(df, uni_name)
requeried_auth_authors = dict(university=uni_name, authors=authors, failed=failed)

Retrieving authors for University of Thessaloniki:   0%|          | 0/10 [00:00<?, ?it/s]

Number of retrieved authors: 10
Number of failed authors: 0


## Merge successful and re-queried authors

In [7]:
# Load the stage-3 results that the re-queried authors will be merged into.
with open('../stage3/auth_authors.json') as stage3_file:
    auth_authors = json.load(stage3_file)

In [8]:
# Scholar IDs already present in the stage-3 results. A set is used rather
# than map(): map() returns a one-shot iterator that the first `in` test
# consumes, so later membership checks would wrongly report "not present".
scholarIDs = {author['scholar_id'] for author in auth_authors['authors']}
for ra in requeried_auth_authors['authors']:
    if ra['scholar_id'] not in scholarIDs:
        auth_authors['authors'].append(ra)
        # Record the new ID so the same profile cannot be appended twice.
        scholarIDs.add(ra['scholar_id'])

## Remove re-queried authors from the failed list

In [9]:
# Stage-3 staff sheet, with a 'Full name' column of "<First name> <Last name>".
df2 = pd.read_excel('../stage3/AUTHstaff.xlsx')
df2 = df2.assign(**{'Full name': df2['First name'].str.cat(df2['Last name'], sep=' ')})

In [10]:
# Look up the stage-3 full names of the re-queried rows via the original row
# labels kept in df['index'], and drop each from the stage-3 'failed' list.
# Names that are not in that list are printed instead.
for name in df2.loc[df['index'], 'Full name']:
    if name in auth_authors['failed']:
        auth_authors['failed'].remove(name)
    else:
        print(uni_name + ', ' + name)

## Save new author data

In [11]:
# Write the merged author data to auth_authors.json in the working directory.
with open('auth_authors.json', 'w') as out_file:
    json.dump(auth_authors, out_file)