# Web scraping, Stage 1

Retrieving the initial data.

Import needed libraries.

In [1]:
import pandas as pd
from scholarly import scholarly, ProxyGenerator
from scholarly._navigator import MaxTriesExceededException
import json

Read the data. Make sure all sheets are selected by setting `sheet_name` to `None`.

In [2]:
# Load every sheet of the workbook at once; with sheet_name=None the result is a
# dictionary holding one DataFrame per sheet.
dfs = pd.read_excel('staff.xlsx', sheet_name=None)

Create a list of the university names for each sheet.

In [3]:
# University identifiers, given in the same order as the sheets of the workbook.
universities = ['oulu', 'bochum', 'porto', 'bordeaux', 'lodz']

Rename the keys of the `dfs` dictionary, which holds each sheet as a DataFrame, to the university names.

In [4]:
# Pair each sheet, in workbook order, with the university name at the same position.
# Zipping against `universities` itself removes the hard-coded sheet count (5) and the
# unused sheet-name variable; pairing stops at the shorter of the two sequences.
dfs = dict(zip(universities, dfs.values()))

Replace the `Male` and `Female` values with `M` and `F` respectively, and add a new column to each DataFrame containing the full name of the candidate authors.

In [5]:
# Normalise every sheet: short gender codes plus a combined 'Full name' column.
for df in dfs.values():
    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on the column selection: that chained in-place
    # form operates on a temporary object under pandas Copy-on-Write and would
    # leave `df` unchanged.
    df['Gender'] = df['Gender'].replace({'Male': 'M', 'Female': 'F'})
    # Sheets that already provide a 'Full name' column keep it untouched.
    if 'Full name' not in df:
        df['Full name'] = df['First name'].str.cat(df['Last name'], sep=' ')

Create a function for retrieving the successfully queried authors and the names of failed authors.

In [6]:
def _fetch_first_author(search_string):
    """Return the first search hit for ``search_string`` with its coauthors filled in.

    If a ``MaxTriesExceededException`` is raised, a new free proxy is installed and
    the lookup is attempted once more. The retry starts a fresh search when no
    result had been obtained yet, and otherwise fills the result that was already
    fetched, so a retry never skips ahead to the second search hit.

    Raises:
        StopIteration: the search returned no author.
        MaxTriesExceededException: the retry failed as well.
    """
    result = None
    try:
        # Try to retrieve the first item from the query
        result = next(scholarly.search_author(search_string))
        return scholarly.fill(result, sections=['coauthors'])
    except MaxTriesExceededException:
        # If the query fails due to a maximum tries exception, generate a new proxy
        print('Query failed. Generating new proxy')
        pg = ProxyGenerator()
        pg.FreeProxies()
        scholarly.use_proxy(pg)
        if result is None:
            # A generator that raised is finished, so the search must be re-issued;
            # calling next() on the old one would only raise StopIteration.
            result = next(scholarly.search_author(search_string))
        # And try again with the same first hit
        return scholarly.fill(result, sections=['coauthors'])


def retrieve_authors(df, uni_name):
    """Query Google Scholar for every person listed in ``df``.

    Args:
        df: DataFrame with the columns 'Full name', 'Gender' and 'Role'. An optional
            'Alternative search' column supplies a per-person search keyword that
            replaces ``uni_name`` whenever it is not null.
        uni_name: University keyword appended to each name in the search string.

    Returns:
        A tuple ``(authors, failed)``. ``authors`` is a list of dictionaries with the
        keys 'scholar_id', 'name', 'affiliation', 'gender', 'role', 'url_picture' and
        'coauthors' (a list of coauthor scholar ids). ``failed`` is the list of names
        whose search returned nothing.
    """
    # Create the authors and failed authors list
    authors = []
    failed = []

    num_authors = len(df)
    # Optional per-author keyword column; None when the sheet does not have it
    alternatives = df['Alternative search'] if 'Alternative search' in df else None

    # For each author
    for i, name in enumerate(df['Full name']):
        # Print the progress every 10 steps
        if i % 10 == 0:
            print('Retrieving author no. {} out of {} for University of {}'.format(i + 1,
                                                                                   num_authors,
                                                                                   uni_name.capitalize()))
        # If an alternative search keyword for the university name is provided instead, use it.
        # `i` is a position, so rows are read with .iloc rather than by index label.
        keyword = uni_name
        if alternatives is not None and not pd.isnull(alternatives.iloc[i]):
            keyword = alternatives.iloc[i]

        try:
            author = _fetch_first_author(name + ', ' + keyword)
        except StopIteration:
            # If the query returns nothing append the name of the author to the failed list
            # WARNING: We should make sure that if the first author in the query is not the correct one
            # the query should return nothing. To do this, tinker with the `Alternative search` field.
            # To verify no authors are returned, simply query Google Scholar from your web browser as such
            # <author name>, <invalid keyword>
            failed.append(name)
            continue

        # Make the author dictionary object and append it to the authors
        authors.append({
            'scholar_id': author['scholar_id'],
            'name': author['name'],
            'affiliation': author['affiliation'],
            'gender': df['Gender'].iloc[i],
            'role': df['Role'].iloc[i],
            'url_picture': author['url_picture'],
            'coauthors': [coauth['scholar_id'] for coauth in author['coauthors']],
        })

    print('Number of retrieved authors:', len(authors))
    print('Number of failed authors:', len(failed))

    # Return the successfully queried authors and the names of the failed ones
    return authors, failed

Generate a proxy to make sure you overcome the maximum number of tries error.

The Google Scholar API can be pinged as many times as we need to generate our data, but it cannot be pinged infinitely.

In [7]:
# Install a proxy before the bulk retrieval starts, so the requests made through
# `scholarly` are sent via that proxy.
pg = ProxyGenerator()
# NOTE(review): FreeProxies presumably reports whether a working proxy was found;
# the result is not checked here -- confirm against the scholarly documentation.
pg.FreeProxies()
# Register the proxy generator with scholarly.
scholarly.use_proxy(pg)

Retrieve the successfully queried and the failed authors for each university.

In [8]:
# Collect one record per university: its name, the authors that were found and
# the names for which the search returned nothing.
uni_authors = []
for uni_name, df in dfs.items():
    authors, failed = retrieve_authors(df, uni_name)
    record = dict(university=uni_name, authors=authors, failed=failed)
    uni_authors.append(record)

Retrieving author no. 1 out of 116 for University of Oulu
Retrieving author no. 11 out of 116 for University of Oulu
Retrieving author no. 21 out of 116 for University of Oulu
Retrieving author no. 31 out of 116 for University of Oulu
Retrieving author no. 41 out of 116 for University of Oulu
Retrieving author no. 51 out of 116 for University of Oulu
Retrieving author no. 61 out of 116 for University of Oulu
Retrieving author no. 71 out of 116 for University of Oulu
Retrieving author no. 81 out of 116 for University of Oulu
Retrieving author no. 91 out of 116 for University of Oulu
Retrieving author no. 101 out of 116 for University of Oulu
Retrieving author no. 111 out of 116 for University of Oulu
Number of retrieved authors: 78
Number of failed authors: 38
Retrieving author no. 1 out of 54 for University of Bochum
Retrieving author no. 11 out of 54 for University of Bochum
Retrieving author no. 21 out of 54 for University of Bochum
Retrieving author no. 31 out of 54 for University o

Dump all retrieved author information into a JSON file.

In [14]:
# Serialise the complete per-university records and write them to disk.
with open('uni_authors.json', 'w') as f:
    f.write(json.dumps(uni_authors))

Also dump the failed author names into another, separate JSON file.

In [20]:
# Keep only the university name and the failed author names of each record.
uni_authors_failed = [
    {field: entry[field] for field in ('university', 'failed')}
    for entry in uni_authors
]

# Write the reduced records to their own JSON file.
with open('uni_authors_failed.json', 'w') as f:
    f.write(json.dumps(uni_authors_failed))