<a href="https://colab.research.google.com/github/NaughtyYoda/Web-Log-Analysis/blob/main/Web_Scraping_Citation_Information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
# dataframe manipulation
import pandas as pd
# linear algebra
import numpy as np

# data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# web crawling
from bs4 import BeautifulSoup
import requests

# Get all professors from Deakin Staff list

In [2]:
# execute HTTP request
url = 'https://www.deakin.edu.au/information-technology/staff-listing'
# a timeout keeps the cell from hanging forever on a stalled connection
req = requests.get(url, timeout=30)
# stop here on an HTTP error instead of parsing an error page downstream
req.raise_for_status()

In [3]:
# parse the HTML using Beautiful Soup
# name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed in the environment
soup = BeautifulSoup(req.text, 'html.parser')

In [4]:
# find all table cells (<td> elements) in the parsed page
# find_all is the current Beautiful Soup 4 spelling; findAll is a legacy alias
prof_html_list = soup.find_all('td')

# first five elements
prof_html_list[:5]

[<td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/lynn-batten" target="_blank">Emeritus Professor Lynn Batten</a></td>,
 <td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/andrzej-goscinski">Emeritus Professor Andrzej Goscinski</a></td>,
 <td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/jemal-abawajy">Professor Jemal Abawajy</a></td>,
 <td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/maia-angelova-turkedjieva">Professor Maia Angelova</a></td>,
 <td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/gleb-beliakov">Professor Gleb Beliakov</a></td>]

In [5]:
# show the first table cell as raw HTML, then only the text it contains
first_td = prof_html_list[0]
print(first_td)
print(first_td.text)

<td headers="table09355r1c1"><a href="https://www.deakin.edu.au/about-deakin/people/lynn-batten" target="_blank">Emeritus Professor Lynn Batten</a></td>
Emeritus Professor Lynn Batten


In [6]:
def parse_name(stringtext):
    """
    Split a staff-listing entry into the person's name and their title.

    The last two whitespace-separated words are taken as the name and
    everything before them as the title.

    Parameters
    ----------
    stringtext : str
        Entry text such as "Emeritus Professor Lynn Batten".

    Returns
    -------
    tuple of (str, str)
        ``(name, title)``, e.g. ``("Lynn Batten", "Emeritus Professor")``.
        The title is an empty string when the entry has two words or fewer.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so stray spaces or newlines in the scraped
    # text do not create empty "words" that shift the name/title boundary
    words = stringtext.split()
    return " ".join(words[-2:]), " ".join(words[:-2])

In [7]:
# store professor information in dataframe
# keep only the table cells that mention a professorial title and split each
# one into (name, title) once, rather than parsing every row twice
prof_records = [parse_name(prof.text) for prof in prof_html_list if "Professor" in prof.text]
prof_df = pd.DataFrame(prof_records, columns=['Name', 'Title'])
# a scalar is broadcast to every row, so no per-row list is needed
prof_df['University'] = 'Deakin University'

# first five rows
prof_df.head()

Unnamed: 0,Name,Title,University
0,Lynn Batten,Emeritus Professor,Deakin Univerity
1,Andrzej Goscinski,Emeritus Professor,Deakin Univerity
2,Jemal Abawajy,Professor,Deakin Univerity
3,Maia Angelova,Professor,Deakin Univerity
4,Gleb Beliakov,Professor,Deakin Univerity


# Crawl professors' citation information from Google Scholar

In [8]:
citation_all = []
citation_since2016 = []
h_index_all = []
h_index_since2016 = []
i10_index_all = []
i10_index_since2016 = []
# result lists, in the order the six statistics cells are read from a profile page
# NOTE(review): the "since2016" names hard-code a year; the year shown on the
# profile page may differ when this is re-run -- confirm before interpreting
metric_lists = [citation_all, citation_since2016,
                h_index_all, h_index_since2016,
                i10_index_all, i10_index_since2016]
N_METRICS = len(metric_lists)
GSC_TIMEOUT = 30  # seconds to wait for each Google Scholar request


def fetch_citation_metrics(name):
    """
    Look up one professor on Google Scholar and return their citation metrics.

    Parameters
    ----------
    name : str
        Professor's name as stored in ``prof_df['Name']``.

    Returns
    -------
    list
        ``N_METRICS`` values in the same order as ``metric_lists``. Every
        value is ``np.nan`` when no profile link is found, when the profile
        has fewer than ``N_METRICS`` statistics cells, or when a request fails.
    """
    missing = [np.nan] * N_METRICS
    try:
        # passing the query via `params` lets requests URL-encode it, so no
        # stray newlines/indentation end up in the URL and names with any
        # number of words are handled
        gsc_page = requests.get(
            "https://scholar.google.com/citations",
            params={'hl': 'en',
                    'view_op': 'search_authors',
                    'mauthors': f"{name} Deakin University"},
            timeout=GSC_TIMEOUT,
        )
        soup1 = BeautifulSoup(gsc_page.text, 'html.parser')

        # search for google scholar profile: link inside the first result card
        result_card = soup1.find('div', {'class': 'gs_ai_t'})
        profile_link = result_card.find('a') if result_card is not None else None
        if profile_link is None or not profile_link.get('href'):
            # no google scholar profile
            return missing

        # open google scholar profile and get citation information
        # presumably the href is site-relative ("/citations?..."); strip the
        # leading slash so the joined URL has no double slash -- confirm
        url_suffix = profile_link['href'].lstrip('/')
        gsc_profile_page = requests.get(f"https://scholar.google.com/{url_suffix}",
                                        timeout=GSC_TIMEOUT)
    except requests.RequestException as err:
        # one failed lookup should not abort the whole crawl; record it as missing
        print(f"Request failed for {name}: {err}")
        return missing

    soup2 = BeautifulSoup(gsc_profile_page.text, 'html.parser')
    citations_info = soup2.find_all('td', {'class': 'gsc_rsb_std'})
    if len(citations_info) < N_METRICS:
        # no (or incomplete) citation information
        return missing
    return [int(cell.text) for cell in citations_info[:N_METRICS]]


for name in prof_df['Name']:
    # append exactly one value per list so all lists stay aligned with prof_df
    for metric_list, value in zip(metric_lists, fetch_citation_metrics(name)):
        metric_list.append(value)

In [9]:
# create data frame
data_dict = {
    'name': prof_df['Name'],
    'title': prof_df['Title'],
    'citation_all': citation_all,
    'citation_since2016': citation_since2016,
    'h_index_all': h_index_all,
    'h_index_since2016': h_index_since2016,
    'i10_index_all': i10_index_all,
    'i10_index_since2016': i10_index_since2016,
}
# every column except the two text columns holds a numeric metric
metric_columns = [col for col in data_dict if col not in ('name', 'title')]

# remove missing values, then fix data type
# (assigning cast columns back through .iloc can keep the old float dtype on
# recent pandas versions, so cast by column name with astype instead)
citation_df = (
    pd.DataFrame(data_dict)
    .dropna(axis=0)
    .astype({col: 'int' for col in metric_columns})
)

In [10]:
# display the citation table built in the previous cell
citation_df

Unnamed: 0,name,title,citation_all,citation_since2016,h_index_all,h_index_since2016,i10_index_all,i10_index_since2016
0,Lynn Batten,Emeritus Professor,2590,1229,23,16,45,21
3,Maia Angelova,Professor,708,423,16,12,22,16
4,Gleb Beliakov,Professor,8267,4722,42,32,135,87
5,Terry Caelli,Professor,8939,1554,54,20,182,36
6,Jinho Choi,Professor,8100,5291,37,30,173,102
7,Chang-Tsun Li,Professor,4464,2456,34,22,106,66
9,Peter Eklund,Professor,4087,891,35,14,75,24
10,Seng Loke,Professor,7271,2978,38,21,129,52
11,Antonio Robles-Kelly,Professor,3741,1615,24,16,62,27
12,Jean-Guy Schneider,Professor,1844,673,24,16,48,21
