# Things to Scrape

1. Professor's Experience/Affiliation (DBLP) & Google Scholar URL --> Background info of researcher
2. Description, Year, Venues of all Publications by each individual (Google Scholar) --> Quantitative Metric & Track Shifts in Research Interest
3. Collaboration counts and citation counts for each co-author (DBLP & Google Scholar) --> Create Coauthor Network Graph
4. Scrape Conference/Journals Ratings (portal conference website)
5. Scrape the research interest for each professor


In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, NavigableString
import requests
from thefuzz import fuzz
import re
import itertools
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import os,json
from tqdm import tqdm
import time

# Extract Google Scholar

In [2]:
'''

    {
        'name': "Li Boyang",
        'interest': ['ML','AI'],
        'articles':[
            {
                'Title': "Story Generation",
                'CO-Authors' : ['Mark Riedl','LOL'],
                'date': str,
                'venue': str,
                'Description': str,
                'Cited by': int,
                'citation_graph':dict
            },
        ],
        'co_authors_profile_url': [
            'https://scholar.google.com/citations?user=Yg_QjxcAAAAJ&hl=en',
        ],
        'all_citation_count': int,
        'recent_citation_count': int
        }


'''


'\n\n    {\n        \'name\': "Li Boyang",\n        \'interest\': [\'ML\',\'AI\'],\n        \'articles\':[\n            {\n                \'Title\': "Story Generation",\n                \'CO-Authors\' : [\'Mark Riedl\',\'LOL\'],\n                \'date\': str,\n                \'venue\': str,\n                \'Description: str,\n                \'Cited by\': int,\n                \'citation_graph\':dict\n            },\n        ],\n        \'co_authors_profile_url\': [\n            \'https://scholar.google.com/citations?user=Yg_QjxcAAAAJ&hl=en\',\n        ]\n        \'all_citation_count\': int\n        \'recent_citation_count: int\n        }\n\n\n'

## Scholarly Name Search

- Add Nanyang Technological University as keyword during the search to narrow down the results
- The disadvantage is that faculty members who have not updated their affiliation will be missed by this search
- Hence we need to search by citation as well

In [3]:
def save_prof_details(details, filename, dir='./prof_raw_data'):
    """Write *details* as JSON to ``<dir>/<filename>``.

    Args:
        details: JSON-serialisable object to store.
        filename: Name of the file to create inside *dir*.
        dir: Target directory; created (with parents) when it does not exist.
    """
    os.makedirs(dir, exist_ok=True)
    # The path is assembled with a literal '/' separator.
    target_path = '/'.join((dir, filename))

    with open(target_path, 'w') as out_file:
        json.dump(details, out_file)

In [4]:
def generate_name_combinations(name):
    """Return all order-preserving word subsets of *name* with >= 2 words.

    The name is split on whitespace; every combination of two or more words
    (keeping their original relative order) is joined with single spaces and
    has leading/trailing commas stripped. The list is returned in reverse
    generation order, so the combination using every word comes first.
    A name with fewer than two words yields an empty list.
    """
    tokens = name.split()
    variants = [
        ' '.join(subset).strip(',')
        for size in range(2, len(tokens) + 1)
        for subset in itertools.combinations(tokens, size)
    ]
    variants.reverse()
    return variants

def create_driver(debug=False):
    """Build a Chrome WebDriver with selenium-stealth applied.

    Args:
        debug: When equal to ``False`` (the default) the browser is started
            with ``--headless``; otherwise a visible window is opened.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_args = ["--window-size=1920,1080", "--start-maximized"]
    if debug == False:
        chrome_args.append("--headless")

    chrome_options = webdriver.ChromeOptions()
    for arg in chrome_args:
        chrome_options.add_argument(arg)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    browser = webdriver.Chrome(options=chrome_options)

    # Browser fingerprint values handed to selenium-stealth.
    stealth_settings = {
        'languages': ["en-US", "en"],
        'vendor': "Google Inc.",
        'platform': "Win32",
        'webgl_vendor': "Intel Inc.",
        'renderer': "Intel Iris OpenGL Engine",
        'fix_hairline': True,
    }
    stealth(browser, **stealth_settings)
    return browser

_SCHOLAR_BASE_URL = 'https://scholar.google.com'
_NTU_AFFILIATION = 'Nanyang Technological University'
_NTU_EMAIL_DOMAIN = 'ntu.edu.sg'
# Minimum thefuzz token_sort_ratio score before a search hit counts as a match.
_MIN_NAME_SIMILARITY = 80
# Seconds to wait after a navigation or click for the page to settle.
_PAGE_LOAD_WAIT = 3
_CO_AUTHOR_DIALOG_WAIT = 4


def _find_best_scholar_profile(name):
    """Search Google Scholar authors for *name* and return the best profile URL.

    Only results whose e-mail line mentions ``ntu.edu.sg`` or whose
    affiliation mentions Nanyang Technological University are considered, and
    only when their fuzzy name similarity reaches ``_MIN_NAME_SIMILARITY``.
    Returns ``None`` when nothing qualifies. On ties the first hit wins.
    """
    from urllib.parse import quote_plus

    # Encode the query so spaces and non-ASCII characters cannot corrupt the URL.
    query_param = quote_plus(name + " " + _NTU_AFFILIATION)
    search_url = (
        f"{_SCHOLAR_BASE_URL}/citations?hl=en&view_op=search_authors"
        f"&mauthors={query_param}&btnG="
    )

    driver = create_driver()
    try:
        driver.get(search_url)
        time.sleep(_PAGE_LOAD_WAIT)  # wait for the page to load

        # Query each list once instead of once per loop iteration.
        name_elements = driver.find_elements(By.CLASS_NAME, 'gs_ai_name')
        dsc_elements = driver.find_elements(By.CLASS_NAME, 'gs_ai_aff')
        email_elements = driver.find_elements(By.CLASS_NAME, 'gs_ai_eml')

        best_url = None
        best_score = 0
        # NOTE(review): assumes the three lists are aligned one entry per
        # search result; zip() avoids an IndexError if they differ in length.
        for name_el, dsc_el, email_el in zip(name_elements, dsc_elements, email_elements):
            is_ntu = (_NTU_EMAIL_DOMAIN in email_el.text
                      or _NTU_AFFILIATION in dsc_el.text)
            if not is_ntu:
                continue
            sim_score = fuzz.token_sort_ratio(name, name_el.text)
            # Strict '>' keeps the earliest result among equal scores.
            if sim_score >= _MIN_NAME_SIMILARITY and sim_score > best_score:
                best_url = name_el.find_element(By.TAG_NAME, 'a').get_attribute('href')
                best_score = sim_score
        return best_url
    finally:
        driver.quit()


def _scrape_interests(driver, author_full_name):
    """Return the interest labels shown on the open profile page (may be empty)."""
    try:
        containers = driver.find_elements(By.ID, 'gsc_prf_int')
        if not containers:
            return []
        return [tag.text for tag in containers[0].find_elements(By.TAG_NAME, 'a')]
    except Exception as e:
        print(f"Error Interest List for {author_full_name}")
        print(e)
        return []


def _scrape_co_authors(driver, author_full_name):
    """Return ``{'name', 'url', 'aff'}`` dicts for the profile's co-authors.

    Uses the "view all" dialog when its button exists, otherwise the sidebar
    list. Best effort: on error, whatever was collected so far is returned.
    """
    co_authors_details = []
    try:
        co_author_tables = driver.find_elements(By.ID, 'gsc_rsb_co')
        # some prof dont have co_authors
        if not co_author_tables:
            return co_authors_details

        view_all_buttons = driver.find_elements(By.ID, 'gsc_coauth_opn')
        if view_all_buttons:
            view_all_buttons[0].click()
            time.sleep(_CO_AUTHOR_DIALOG_WAIT)

            content = driver.find_element(By.ID, 'gsc_codb_content').get_attribute('outerHTML')
            soup = BeautifulSoup(content, 'html.parser')
            for entry in soup.find_all(name='div', attrs={'class': 'gsc_ucoar gs_scl'}):
                desc = entry.find(name='div', attrs={'class': 'gs_ai_t gs_ai_pss'})
                name_div = desc.find(name='h3', attrs={'class': 'gs_ai_name'})
                co_authors_details.append({
                    'name': name_div.text.strip(),
                    # href comes from raw HTML here, so the site prefix is
                    # added (presumably it is site-relative).
                    'url': _SCHOLAR_BASE_URL + name_div.find(name='a').get('href'),
                    'aff': desc.find(name='div', attrs={'class': 'gs_ai_aff'}).text.strip(),
                })

            driver.find_element(By.ID, 'gsc_md_cod-x').click()
        else:
            sidebar_items = (co_author_tables[0]
                             .find_element(By.CLASS_NAME, 'gsc_rsb_a')
                             .find_elements(By.TAG_NAME, 'li'))
            for item in sidebar_items:
                link = item.find_element(By.CLASS_NAME, 'gsc_rsb_a_desc').find_element(By.TAG_NAME, 'a')
                co_authors_details.append({
                    'name': link.text,
                    'url': link.get_attribute('href'),
                    'aff': item.find_element(By.CLASS_NAME, 'gsc_rsb_a_ext').text,
                })
    except Exception as e:
        print(f"Error in Co-Authors Table for {author_full_name}")
        print(e)
    return co_authors_details


def _scrape_citation_table(driver, author_full_name):
    """Return the citation statistics table as ``{'columns': [...], row_label: [ints]}``."""
    citation_table = {}
    try:
        stats_table = driver.find_element(By.ID, 'gsc_rsb_st')

        header_cells = (stats_table
                        .find_element(By.TAG_NAME, 'thead')
                        .find_elements(By.CLASS_NAME, 'gsc_rsb_sth'))
        header_texts = [th.text for th in header_cells]
        citation_table['columns'] = [text for text in header_texts if text != ""]

        rows = stats_table.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
        for tr in rows:
            row_index = tr.find_element(By.CLASS_NAME, 'gsc_rsb_sc1').text
            citation_table[row_index] = [
                int(cell.text) for cell in tr.find_elements(By.CLASS_NAME, 'gsc_rsb_std')
            ]
    except Exception as e:
        print(f"Error in Citation Table for {author_full_name}")
        print(e)
    return citation_table


def _scrape_citation_graph(driver, author_full_name):
    """Return ``{year: citations}`` read from the profile's citation histogram dialog."""
    citation_graph = {}
    try:
        open_buttons = driver.find_elements(By.ID, 'gsc_hist_opn')
        if open_buttons:
            open_buttons[0].click()
            time.sleep(_PAGE_LOAD_WAIT)

            graph = driver.find_element(By.ID, 'gsc_md_hist_c').find_element(By.CLASS_NAME, 'gsc_md_hist_b')
            years = graph.find_elements(By.CLASS_NAME, 'gsc_g_t')
            for bar in graph.find_elements(By.CLASS_NAME, 'gsc_g_a'):
                # The last value in the bar's inline style is used as an
                # index counted from the end of the year labels.
                style = bar.get_attribute('style')
                year_index = int(style.split(':')[-1].strip(';'))
                year = int(years[-year_index].get_attribute('innerText'))
                citation_graph[year] = int(bar.get_attribute('textContent'))

            driver.find_element(By.ID, 'gsc_md_hist-x').click()
    except Exception as e:
        print(f"Error in citation graph for {author_full_name}")
        print(e)
    return citation_graph


def _collect_article_urls(driver):
    """Expand the publication list fully and return every article detail URL."""
    more_btn = driver.find_element(By.ID, 'gsc_bpf_more')
    # Keep clicking "show more" until the button reports itself disabled.
    while more_btn.get_attribute('disabled') is None:
        more_btn.click()
        time.sleep(_PAGE_LOAD_WAIT)

    rows = driver.find_element(By.ID, 'gsc_a_b').find_elements(By.CLASS_NAME, 'gsc_a_tr')
    return [
        row.find_element(By.CLASS_NAME, 'gsc_a_t').find_element(By.TAG_NAME, 'a').get_attribute('href')
        for row in rows
    ]


def _scrape_article(article_url):
    """Open one article detail page in a fresh driver and return its fields as a dict."""
    text_fields = {'publication_date', 'journal', 'book', 'conference', 'description'}

    driver = create_driver()
    try:
        driver.get(article_url)
        time.sleep(_PAGE_LOAD_WAIT)

        article = {
            'title': driver.find_element(By.ID, 'gsc_oci_title').text,
            'url': article_url,
        }
        items = driver.find_element(By.ID, 'gsc_oci_table').find_elements(By.CLASS_NAME, 'gs_scl')
        for item in items:
            key = item.find_element(By.CLASS_NAME, 'gsc_oci_field').text.strip().lower().replace(' ', '_')
            value = item.find_element(By.CLASS_NAME, 'gsc_oci_value')
            if key == 'authors':
                article[key] = value.text.split(', ')
            elif key in text_fields:
                article[key] = value.text
            elif key == 'total_citations':
                # total citation count (last token of the link text)
                article[key] = int(value.find_element(By.TAG_NAME, 'a').text.split(' ')[-1])

                # citation count over the years; years without a bar stay at 0
                per_year = {
                    int(year.get_attribute('innerText')): 0
                    for year in value.find_elements(By.CLASS_NAME, 'gsc_oci_g_t')
                }
                for bar in value.find_elements(By.CLASS_NAME, 'gsc_oci_g_a'):
                    # The year is taken from the last four characters of the bar's link.
                    per_year[int(bar.get_attribute('href')[-4:])] = int(bar.get_attribute('textContent'))
                article['citation_graph'] = per_year
        return article
    finally:
        # Always release the browser, even when parsing fails.
        driver.quit()


def _scrape_profile(author_url, author_full_name):
    """Scrape one Google Scholar profile page plus all of its article pages."""
    prof_result = {'goog_sch_url': author_url, 'name': author_full_name}

    driver = create_driver()
    try:
        driver.get(author_url + "&pagesize=100")
        time.sleep(_PAGE_LOAD_WAIT)  # wait for the page to load

        prof_result['interests'] = _scrape_interests(driver, author_full_name)
        prof_result['co_authors_url'] = _scrape_co_authors(driver, author_full_name)
        prof_result['citation_table'] = _scrape_citation_table(driver, author_full_name)
        prof_result['citation_graph'] = _scrape_citation_graph(driver, author_full_name)

        try:
            article_url_list = _collect_article_urls(driver)
        except Exception as e:
            print(f"Error in Articles for {author_full_name}")
            print(e)
            article_url_list = []
    finally:
        driver.quit()

    articles = []
    for article_url in tqdm(article_url_list, position=0, leave=True):
        # Isolate failures per article so one bad page does not drop the rest.
        try:
            articles.append(_scrape_article(article_url))
        except Exception as e:
            print(f"Error in Articles for {author_full_name} ({article_url})")
            print(e)
    prof_result['articles'] = articles
    return prof_result


def extract_goog_sch_profile(author_full_name):
    """Scrape a professor's Google Scholar profile and save it as JSON.

    For names with more than two words, every sub-name produced by
    ``generate_name_combinations`` is searched in turn until one yields a
    matching NTU profile. The scraped data is written through
    ``save_prof_details`` to ``goog_sch_<lower_cased_name_with_underscores>.json``
    with the keys ``goog_sch_url``, ``name``, ``interests``, ``co_authors_url``,
    ``citation_table``, ``citation_graph`` and ``articles``. When no profile
    is found an empty JSON object is saved.

    Args:
        author_full_name: The professor's full name as a string.

    Returns:
        None. The result is only persisted to disk.
    """
    if len(author_full_name.split()) > 2:
        name_perm_list = generate_name_combinations(author_full_name)
    else:
        name_perm_list = [author_full_name]

    prof_result = {}
    for name in name_perm_list:
        author_url = _find_best_scholar_profile(name)
        if author_url is not None:
            prof_result = _scrape_profile(author_url, author_full_name)
            # No need to search through all name_permutations as best candidate has been found
            break
    else:
        print(f"No Google Scholar profile found for {author_full_name}")

    filename = "goog_sch_" + author_full_name.lower().replace(' ', '_') + '.json'
    save_prof_details(details=prof_result, filename=filename)

    return



In [98]:
# Load the CSV and discard its 'Unnamed: 0' column in one chained expression.
result_df = pd.read_csv('Kee_Kai_Teng.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Run the scraper once for every value in the 'Name' column.
result_df['Name'].apply(extract_goog_sch_profile)

In [7]:
# NOTE(review): presumably professors whose article data still needs to be
# scraped; this list is not referenced elsewhere in the visible code.
missing_article = [
    "Dusit Niyato",
    "Cong Gao",
    "Lin Guosheng",
    "Yu Han",
    "Zhang Hanwang",
    "Luo Jun",
    "Liu Weichen",
    'Luke Ong （翁之昊）',
    "Qian Kemao",
    'Mohamed M. Sabry',
    "Owen Noel Newton Fernando",
    "Joty Shafiq Rayhan",
    "Tan Rui",
    'Pan Xingang',
    "Liu Yang",
    'Wen Yonggang',
    "Li Yi",
    "Zhang Jie",
]

In [11]:
# Names that are passed back through the scraper by the retry loop.
missing_co_author = [
    "Anwitaman Datta",
    "Anupam Chattopadhyay",
    "Kwoh Chee Keong",
    "Chng Eng Siong",
]

In [12]:
# Re-run the full profile scrape for every name in missing_co_author.
for name in missing_co_author:
    extract_goog_sch_profile(name)

100%|██████████| 232/232 [30:35<00:00,  7.91s/it]
100%|██████████| 381/381 [49:36<00:00,  7.81s/it]
100%|██████████| 378/378 [49:20<00:00,  7.83s/it]
100%|██████████| 345/345 [45:11<00:00,  7.86s/it]
