# Main Sources of Data

1. Google Scholar 
    - Description, year and venue of every publication by each individual (Google Scholar)
    - All citation-related details (Google Scholar)
    - Co-author details and research interests (Google Scholar)
2. DR-NTU
    - Biography, websites, grants, email, name and designations (DR-NTU)


In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, NavigableString
import requests
from thefuzz import fuzz
import re
import itertools
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import os,json
from tqdm import tqdm
import time

# Extract Google Scholar

In [2]:
'''

    {
        'name': "Li Boyang",
        'interest': ['ML','AI'],
        'articles':[
            {
                'Title': "Story Generation",
                'CO-Authors' : ['Mark Riedl','LOL'],
                'date': str,
                'venue': str,
                'Description: str,
                'Cited by': int,
                'citation_graph':dict
            },
        ],
        'co_authors_profile_url': [
            'https://scholar.google.com/citations?user=Yg_QjxcAAAAJ&hl=en',
        ]
        'all_citation_count': int
        'recent_citation_count: int
        }


'''


'\n\n    {\n        \'name\': "Li Boyang",\n        \'interest\': [\'ML\',\'AI\'],\n        \'articles\':[\n            {\n                \'Title\': "Story Generation",\n                \'CO-Authors\' : [\'Mark Riedl\',\'LOL\'],\n                \'date\': str,\n                \'venue\': str,\n                \'Description: str,\n                \'Cited by\': int,\n                \'citation_graph\':dict\n            },\n        ],\n        \'co_authors_profile_url\': [\n            \'https://scholar.google.com/citations?user=Yg_QjxcAAAAJ&hl=en\',\n        ]\n        \'all_citation_count\': int\n        \'recent_citation_count: int\n        }\n\n\n'

In [14]:
# selenium stealth driver used for scraping Google Scholar
def create_driver(debug=False):
    """Create a Chrome WebDriver with selenium-stealth applied.

    Args:
        debug: When falsy (the default) Chrome is started with ``--headless``;
            pass ``True`` to get a visible browser window.

    Returns:
        The ``webdriver.Chrome`` instance after ``stealth`` has been applied.
        The driver is returned open; the caller has to ``quit()`` it.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")
    if not debug:
        options.add_argument("--headless")
    # Remove the switch/extension that Chrome adds when it is automated.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return driver

## Extract Google Scholar

### Scholarly Name Search
- Search Google Scholar by name, appending "Nanyang Technological University" to the end of the name
- This does not cover all cases

In [41]:
def save_prof_details(details, filename, dir='./prof_raw_data'):
    """Dump ``details`` as JSON into ``<dir>/<filename>``.

    The target directory is created first when it does not exist yet; an
    existing file with the same name is overwritten.
    """
    os.makedirs(dir, exist_ok=True)
    target_path = f"{dir}/{filename}"

    with open(target_path, 'w') as out_file:
        json.dump(details, out_file)

In [42]:
def generate_name_combinations(name):
    """Return every order-preserving selection of two or more words of ``name``.

    Each selection is joined with single spaces and has commas stripped from
    both ends of the joined string.  The result is ordered from the full name
    down to the two-word selections, i.e. the reverse of the order in which
    ``itertools.combinations`` produces them for growing selection sizes.
    A name with fewer than two words yields an empty list.
    """
    tokens = name.split()
    by_size = (
        itertools.combinations(tokens, size)
        for size in range(2, len(tokens) + 1)
    )
    variants = [
        ' '.join(group).strip(',')
        for group in itertools.chain.from_iterable(by_size)
    ]
    variants.reverse()
    return variants


def _gs_find_ntu_author_url(query_name):
    """Search Google Scholar authors for ``query_name`` and return the best NTU match.

    The query is ``query_name`` followed by " Nanyang Technological University".
    A result is considered only if its e-mail text contains ``ntu.edu.sg`` or
    its affiliation text contains "Nanyang Technological University", and only
    if ``fuzz.token_sort_ratio`` against ``query_name`` is at least 80.  The
    highest-scoring result wins; ties keep the earliest one.

    Returns:
        The profile URL of the best match, or ``None`` when nothing qualifies.
    """
    query_param = query_name + " Nanyang Technological University"
    search_url = f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={query_param}&btnG="

    driver = create_driver()
    try:
        driver.get(search_url)
        time.sleep(3)  # wait for the page to load

        affiliations = driver.find_elements(By.CLASS_NAME, 'gs_ai_aff')
        emails = driver.find_elements(By.CLASS_NAME, 'gs_ai_eml')
        names = driver.find_elements(By.CLASS_NAME, 'gs_ai_name')

        best_url = None
        best_score = 0
        # NOTE(review): the three lists are assumed to be index-aligned (one
        # entry per search result); zip stops at the shortest list instead of
        # raising IndexError when they are not.
        for aff, email, name_el in zip(affiliations, emails, names):
            if 'ntu.edu.sg' in email.text or 'Nanyang Technological University' in aff.text:
                score = fuzz.token_sort_ratio(query_name, name_el.text)
                # Similarity score of at least 80 before we consider it a match
                if score >= 80 and score > best_score:
                    best_url = name_el.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    best_score = score
        return best_url
    finally:
        # Always release the browser, even if the page could not be parsed.
        driver.quit()


def _gs_interests(driver, label):
    """Return the link texts inside ``#gsc_prf_int`` of the open profile page ([] if absent)."""
    try:
        containers = driver.find_elements(By.ID, 'gsc_prf_int')
        if containers:
            return [tag.text for tag in containers[0].find_elements(By.TAG_NAME, 'a')]
    except Exception as e:
        print(f"Error Interest List for {label}")
        print(e)
    return []


def _gs_co_authors(driver, label, base_url):
    """Return ``{'name', 'url', 'aff'}`` dicts for the co-authors of the open profile page.

    When the ``gsc_coauth_opn`` button exists it is clicked and the resulting
    dialog (``gsc_codb_content``) is parsed with BeautifulSoup; its hrefs are
    prefixed with ``base_url``.  Otherwise the ``gsc_rsb_co`` sidebar list is
    read directly.  Entries collected before an error are kept.
    """
    co_authors = []
    try:
        # some prof dont have co_authors
        tables = driver.find_elements(By.ID, 'gsc_rsb_co')
        if not tables:
            return co_authors

        open_btns = driver.find_elements(By.ID, 'gsc_coauth_opn')
        if open_btns:
            open_btns[0].click()
            time.sleep(4)

            content = driver.find_element(By.ID, 'gsc_codb_content').get_attribute('outerHTML')
            soup = BeautifulSoup(content, 'html.parser')
            for card in soup.find_all(name='div', attrs={'class': 'gsc_ucoar gs_scl'}):
                desc = card.find(name='div', attrs={'class': 'gs_ai_t gs_ai_pss'})
                name_div = desc.find(name='h3', attrs={'class': 'gs_ai_name'})
                co_authors.append({
                    'name': name_div.text.strip(),
                    'url': base_url + name_div.find(name='a').get('href'),
                    'aff': desc.find(name='div', attrs={'class': 'gs_ai_aff'}).text.strip(),
                })

            # Close the dialog again so later clicks on the page are not blocked by it.
            driver.find_element(By.ID, 'gsc_md_cod-x').click()
        else:
            items = tables[0].find_element(By.CLASS_NAME, 'gsc_rsb_a').find_elements(By.TAG_NAME, 'li')
            for item in items:
                link = item.find_element(By.CLASS_NAME, 'gsc_rsb_a_desc').find_element(By.TAG_NAME, 'a')
                co_authors.append({
                    'name': link.text,
                    'url': link.get_attribute('href'),
                    'aff': item.find_element(By.CLASS_NAME, 'gsc_rsb_a_ext').text,
                })
    except Exception as e:
        print(f"Error in Co-Authors Table for {label}")
        print(e)
    return co_authors


def _gs_citation_table(driver, label):
    """Return the ``#gsc_rsb_st`` table as ``{'columns': [...], <row label>: [ints]}``.

    Whatever was parsed before an error is returned as-is.
    """
    table = {}
    try:
        stats = driver.find_element(By.ID, 'gsc_rsb_st')
        headers = stats.find_element(By.TAG_NAME, 'thead').find_elements(By.CLASS_NAME, 'gsc_rsb_sth')
        table['columns'] = [th.text for th in headers if th.text != ""]

        for tr in stats.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr'):
            row_index = tr.find_element(By.CLASS_NAME, 'gsc_rsb_sc1').text
            table[row_index] = [int(cell.text) for cell in tr.find_elements(By.CLASS_NAME, 'gsc_rsb_std')]
    except Exception as e:
        print(f"Error in Citation Table for {label}")
        print(e)
    return table


def _gs_citation_graph(driver, label):
    """Return ``{year: citations}`` read from the chart opened by ``#gsc_hist_opn`` ({} if absent)."""
    graph_data = {}
    try:
        open_btns = driver.find_elements(By.ID, 'gsc_hist_opn')
        if open_btns:
            open_btns[0].click()
            time.sleep(3)

            graph = driver.find_element(By.ID, 'gsc_md_hist_c').find_element(By.CLASS_NAME, 'gsc_md_hist_b')
            years = graph.find_elements(By.CLASS_NAME, 'gsc_g_t')
            for bar in graph.find_elements(By.CLASS_NAME, 'gsc_g_a'):
                # The last declaration of the bar's inline style is an integer
                # that is used as the bar's offset from the END of the year
                # labels (years without a bar are simply not in the result).
                style = bar.get_attribute('style')
                year_index = int(style.split(':')[-1].strip(';'))
                year = int(years[-year_index].get_attribute('innerText'))
                graph_data[year] = int(bar.get_attribute('textContent'))

            driver.find_element(By.ID, 'gsc_md_hist-x').click()
    except Exception as e:
        print(f"Error in citation graph for {label}")
        print(e)
    return graph_data


def _gs_article_urls(driver, label):
    """Expand the article list of the open profile page and return every article URL.

    Returns an empty list when anything goes wrong, so no article is scraped
    from a partially read list.
    """
    try:
        more_btn = driver.find_element(By.ID, 'gsc_bpf_more')
        # Keep clicking until the button carries a 'disabled' attribute.
        while more_btn.get_attribute('disabled') is None:
            more_btn.click()
            time.sleep(3)

        rows = driver.find_element(By.ID, 'gsc_a_b').find_elements(By.CLASS_NAME, 'gsc_a_tr')
        return [
            row.find_element(By.CLASS_NAME, 'gsc_a_t').find_element(By.TAG_NAME, 'a').get_attribute('href')
            for row in rows
        ]
    except Exception as e:
        print(f"Error in Articles for {label}")
        print(e)
        return []


def _gs_article_details(driver, article_url):
    """Open ``article_url`` in ``driver`` and return the article's fields as a dict."""
    driver.get(article_url)
    time.sleep(3)

    article = {
        'title': driver.find_element(By.ID, 'gsc_oci_title').text,
        'url': article_url,
    }
    rows = driver.find_element(By.ID, 'gsc_oci_table').find_elements(By.CLASS_NAME, 'gs_scl')
    for row in rows:
        field = row.find_element(By.CLASS_NAME, 'gsc_oci_field')
        value = row.find_element(By.CLASS_NAME, 'gsc_oci_value')
        key = field.text.strip().lower().replace(' ', '_')
        if key == 'authors':
            article[key] = value.text.split(', ')
        elif key in ('publication_date', 'journal', 'book', 'conference', 'description'):
            article[key] = value.text
        elif key == 'total_citations':
            # total citation count: last word of the link text
            article[key] = int(value.find_element(By.TAG_NAME, 'a').text.split(' ')[-1])

            # citation count over the years; years without a bar stay at 0
            years = value.find_elements(By.CLASS_NAME, 'gsc_oci_g_t')
            per_year = {int(year.get_attribute('innerText')): 0 for year in years}
            for bar in value.find_elements(By.CLASS_NAME, 'gsc_oci_g_a'):
                # The year is taken from the last four characters of the bar's link.
                per_year[int(bar.get_attribute('href')[-4:])] = int(bar.get_attribute('textContent'))
            article['citation_graph'] = per_year
    return article


def _gs_articles(article_urls, label):
    """Scrape every URL in ``article_urls`` with a fresh driver each.

    Scraping stops at the first error; articles collected so far are returned.
    """
    articles = []
    try:
        for article_url in tqdm(article_urls, position=0, leave=True):
            driver = create_driver()
            try:
                articles.append(_gs_article_details(driver, article_url))
            finally:
                driver.quit()
    except Exception as e:
        print(f"Error in Articles for {label}")
        print(e)
    return articles


def extract_goog_sch_profile(author_full_name):
    """Scrape the Google Scholar profile of an NTU author and save it as JSON.

    For names of more than two words every word combination from
    ``generate_name_combinations`` is tried in turn, otherwise only the name
    itself.  The first variant that yields an NTU match is scraped for
    interests, co-authors, the citation table, the citation graph and all
    articles.  The result is written with ``save_prof_details`` to
    ``goog_sch_<lower-cased name with underscores>.json``; when no variant
    matches, an empty JSON object is written.

    Every browser started here is closed again, also when scraping fails.

    Args:
        author_full_name: The author's full name as a string.

    Returns:
        None.
    """
    BASE_URL = 'https://scholar.google.com'
    if len(author_full_name.split()) > 2:
        name_perm_list = generate_name_combinations(author_full_name)
    else:
        name_perm_list = [author_full_name]

    prof_result = {}
    for name in name_perm_list:
        author_url = _gs_find_ntu_author_url(name)
        if author_url is None:
            continue

        prof_result['goog_sch_url'] = author_url
        driver = create_driver()
        try:
            driver.get(author_url + "&pagesize=100")
            time.sleep(3)  # wait for the page to load

            # The saved name is always the name that was asked for, not the
            # variant that happened to match.
            prof_result['name'] = author_full_name
            prof_result['interests'] = _gs_interests(driver, author_full_name)
            prof_result['co_authors_url'] = _gs_co_authors(driver, author_full_name, BASE_URL)
            prof_result['citation_table'] = _gs_citation_table(driver, author_full_name)
            prof_result['citation_graph'] = _gs_citation_graph(driver, author_full_name)
            article_urls = _gs_article_urls(driver, author_full_name)
        finally:
            driver.quit()

        prof_result['articles'] = _gs_articles(article_urls, author_full_name)

        # No need to search through all name_permutations as best candidate has been found
        break

    filename = "goog_sch_" + author_full_name.lower().replace(' ', '_') + '.json'
    save_prof_details(details=prof_result, filename=filename)

    return



In [98]:
# Load the SCSE profile table and discard its 'Unnamed: 0' column.
result_df = pd.read_csv('./prof_raw_data/scse_profiles.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Scrape and save one Google Scholar profile per entry of the 'Name' column.
# extract_goog_sch_profile is called purely for its side effect (it returns
# None), so a plain loop is used instead of Series.apply, which would build
# and display a throwaway Series of None values.
for prof_name in result_df['Name']:
    extract_goog_sch_profile(prof_name)

### Bibliometrics

- For the remaining professors who were not found through the name search, we check whether we managed to scrape their Google Scholar URL from the DR-NTU bibliometrics section

In [57]:
# Names handled by the fallback below (one name per line for readable diffs).
missing_author = [
    'Miao Chun Yan',
    'Li Boyang',
    'Li Mo',
    'Chan Syin',
    'Josephine Chong',
    'Lau Chiew Tong',
    'Li Fang',
    'Pan, Sinno Jialin',
    'Tang Xueyan',
    'Tay Kian Boon',
    'Thambipillai Srikanthan',
    'Vun Chan Hua Nicholas',
    'Wee Keong Ng',
    'Zinovi Rabinovich',
]

In [None]:
# Fallback scrape for the names in `missing_author`: the Google Scholar URL is
# read from each professor's DR-NTU JSON file (key urls -> google_scholar)
# instead of being searched by name.  A professor without such a URL still
# gets an output file, containing an empty JSON object.
#
# Prefix for the relative hrefs found in the co-author dialog.
BASE_URL = 'https://scholar.google.com'

for name in missing_author:
    # Stem shared by the DR-NTU input file and the Google Scholar output file.
    filename = name.lower().replace(' ','_')
    filepath = f"./prof_raw_data/dr_ntu_{filename}.json"
    with open(filepath,'r') as f:
        author = json.load(f)
    prof_result = {}
    scholar_url = author['urls'].get('google_scholar')
    if scholar_url:
        article_url_list = []
        driver = create_driver()
        try:
            driver.get(scholar_url+"&pagesize=100")
            time.sleep(3)  # wait for the page to load

            prof_result['goog_sch_url'] = scholar_url
            prof_result['name'] = name

            # extracting interests
            list_of_interests = []
            try:
                int_element = driver.find_elements(By.ID, 'gsc_prf_int')
                if int_element:
                    a_tags = int_element[0].find_elements(By.TAG_NAME, 'a')
                    list_of_interests = [tag.text for tag in a_tags]
            except Exception as e:
                print(f"Error Interest List for {name}")
                print(e)

            prof_result['interests'] = list_of_interests

            # extracting co_authors_url
            co_authors_details = []

            # some prof dont have co_authors
            try:
                co_author_table = driver.find_elements(By.ID,'gsc_rsb_co')
                if co_author_table:
                    co_author_view_btn = driver.find_elements(By.ID,'gsc_coauth_opn')
                    if co_author_view_btn:
                        co_author_view_btn[0].click()
                        time.sleep(4)

                        content = driver.find_element(By.ID,'gsc_codb_content').get_attribute('outerHTML')
                        soup = BeautifulSoup(content,'html.parser')
                        co_authors_list = soup.find_all(name='div', attrs={'class':'gsc_ucoar gs_scl'})

                        for co_author in co_authors_list:
                            desc = co_author.find(name='div', attrs={'class':'gs_ai_t gs_ai_pss'})
                            name_div = desc.find(name='h3', attrs={'class': 'gs_ai_name'})
                            # Use names distinct from the loop variable `name`
                            # so the professor's name stays intact for the
                            # error messages further down.
                            co_name = name_div.text.strip()
                            co_url = name_div.find(name='a').get('href')
                            co_aff = desc.find(name='div', attrs={'class':'gs_ai_aff'}).text.strip()

                            co_authors_details.append({
                                'name': co_name,
                                'url': BASE_URL + co_url,
                                'aff': co_aff
                            })

                        # Close the dialog again before touching the rest of the page.
                        driver.find_element(By.ID,'gsc_md_cod-x').click()
                    else:
                        co_authors_list = co_author_table[0].find_element(By.CLASS_NAME,'gsc_rsb_a').find_elements(By.TAG_NAME,'li')
                        for co_author in co_authors_list:
                            co_link = co_author.find_element(By.CLASS_NAME,'gsc_rsb_a_desc').find_element(By.TAG_NAME,'a')
                            co_name = co_link.text
                            co_url = co_link.get_attribute('href')
                            co_aff = co_author.find_element(By.CLASS_NAME,'gsc_rsb_a_ext').text

                            co_authors_details.append({
                                'name': co_name,
                                'url': co_url,
                                'aff': co_aff
                            })

            except Exception as e:
                print(f"Error in Co-Authors Table for {name}")
                print(e)

            prof_result['co_authors_url'] = co_authors_details

            # extracting citation table info
            prof_result['citation_table'] = {}
            try:
                stats_table = driver.find_element(By.ID, 'gsc_rsb_st')
                ths = stats_table.find_element(By.TAG_NAME,'thead').find_elements(By.CLASS_NAME,'gsc_rsb_sth')
                prof_result['citation_table']['columns'] = [th.text for th in ths if th.text!=""]

                trs = stats_table.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
                for tr in trs:
                    row_index = tr.find_element(By.CLASS_NAME,'gsc_rsb_sc1').text
                    counts = [int(count.text) for count in tr.find_elements(By.CLASS_NAME,'gsc_rsb_std')]
                    prof_result['citation_table'][row_index] = counts

            except Exception as e:
                print(f"Error in Citation Table for {name}")
                print(e)

            # extracting the citations-per-year chart
            citation_graph = {}
            try:
                view_char_btn = driver.find_elements(By.ID, 'gsc_hist_opn')
                if view_char_btn:
                    view_char_btn[0].click()
                    time.sleep(3)

                    graph = driver.find_element(By.ID,'gsc_md_hist_c').find_element(By.CLASS_NAME,'gsc_md_hist_b')
                    years = graph.find_elements(By.CLASS_NAME,'gsc_g_t')
                    citation_counts = graph.find_elements(By.CLASS_NAME,'gsc_g_a')

                    for bar in citation_counts:
                        # The last declaration of the bar's inline style is an
                        # integer used as the offset from the END of `years`.
                        style = bar.get_attribute('style')
                        year_index = int(style.split(':')[-1].strip(';'))
                        citation_graph[int(years[-year_index].get_attribute('innerText'))] = int(bar.get_attribute('textContent'))
                    driver.find_element(By.ID,'gsc_md_hist-x').click()

            except Exception as e:
                print(f"Error in citation graph for {name}")
                print(e)

            prof_result['citation_graph'] = citation_graph

            # collecting the URL of every article
            try:
                btn = driver.find_element(By.ID,'gsc_bpf_more')
                # Keep clicking until the button carries a 'disabled' attribute.
                while btn.get_attribute('disabled') is None:
                    btn.click()
                    time.sleep(3)

                trs = driver.find_element(By.ID, 'gsc_a_b').find_elements(By.CLASS_NAME, 'gsc_a_tr')
                article_url_list = [
                    tr.find_element(By.CLASS_NAME,'gsc_a_t').find_element(By.TAG_NAME,'a').get_attribute('href')
                    for tr in trs
                ]
            except Exception as e:
                print(f"Error in Articles for {name}")
                print(e)
        finally:
            # Release the profile browser even if something above failed.
            driver.quit()

        # extracting articles info, one fresh browser per article
        articles = []
        try:
            for article_url in tqdm(article_url_list,position=0,leave=True):
                driver = create_driver()
                try:
                    driver.get(article_url)
                    time.sleep(3)
                    title = driver.find_element(By.ID, 'gsc_oci_title').text
                    items = driver.find_element(By.ID, 'gsc_oci_table').find_elements(By.CLASS_NAME, 'gs_scl')

                    article = {}
                    article['title'] = title
                    article['url'] = article_url
                    for item in items:
                        key = item.find_element(By.CLASS_NAME, 'gsc_oci_field')
                        value = item.find_element(By.CLASS_NAME, 'gsc_oci_value')
                        key = key.text.strip().lower().replace(' ', '_')
                        if key == 'authors':
                            article[key] = value.text.split(', ')
                        elif key in ('publication_date', 'journal', 'book', 'conference', 'description'):
                            article[key] = value.text
                        elif key == 'total_citations':
                            # total citation count: last word of the link text
                            article[key] = int(value.find_element(By.TAG_NAME, 'a').text.split(' ')[-1])

                            # citation count over the years; years without a bar stay at 0
                            years = value.find_elements(By.CLASS_NAME,'gsc_oci_g_t')
                            citations = value.find_elements(By.CLASS_NAME,'gsc_oci_g_a')
                            value_2 = {int(year.get_attribute('innerText')):0 for year in years}
                            for citation in citations:
                                year = int(citation.get_attribute('href')[-4:])
                                value_2[year] = int(citation.get_attribute('textContent'))
                            article['citation_graph'] = value_2
                    articles.append(article)
                finally:
                    driver.quit()

        except Exception as e:
            print(f"Error in Articles for {name}")
            print(e)
        prof_result['articles'] = articles
    save_prof_details(prof_result,filename=f"goog_sch_{filename}.json")
        


## Extract Additional Co-Author Details

- Created to extract the additional details from Google Scholar that are needed to build the dashboard's co-author features

In [61]:
# Paths of all regular files in the raw-data folder whose name contains 'goog_sch'.
dir = './prof_raw_data/'
goog_sch_file_list = [
    path
    for entry, path in ((item, os.path.join(dir, item)) for item in os.listdir(dir))
    if 'goog_sch' in entry and os.path.isfile(path)
]


In [59]:
# Profile files that are processed again by the loop over `error_file_path` below.
error_file_path = [
    './prof_raw_data/goog_sch_miao_chun_yan.json',
    './prof_raw_data/goog_sch_li_boyang.json',
    './prof_raw_data/goog_sch_li_mo.json',
]

In [47]:
# For every saved Google Scholar profile:
#   * visit each co-author's page, attach that co-author's citation table to
#     the profile (written back to the same profile file), and
#   * record who lists whom as co-author in `co_author_network`
#     ({profile url: [co-author urls], co-author url: [their co-author urls]}),
#     written to <co_author_dir>/<professor stem>.json.
co_author_dir = './co_author_data/'
BASE_URL = 'https://scholar.google.com/'
# The network files are written into co_author_dir; create it up front so the
# json.dump at the end of the loop does not fail on a missing folder.
os.makedirs(co_author_dir, exist_ok=True)

for filepath in goog_sch_file_list:
    # Professor-specific stem, e.g. ".../goog_sch_li_mo.json" -> "li_mo".
    filename = os.path.splitext(os.path.basename(filepath))[0]
    if filename.startswith('goog_sch_'):
        filename = filename[len('goog_sch_'):]
    with open(filepath,'r') as f:
        profile = json.load(f)

    co_author_network = {}
    if 'co_authors_url' in profile:
        author_url = profile['goog_sch_url']
        co_author_network[author_url] = []

        for co_author in tqdm(profile['co_authors_url'],position=0,leave=True):
            co_author_url = co_author['url']
            co_author_network[author_url].append(co_author_url)
            co_author_network[co_author_url] = []

            driver = create_driver()
            try:
                driver.get(co_author_url)
                time.sleep(3)  # wait for the page to load

                # extracting citation table info (stored on the profile's co-author entry)
                co_author['citation_table'] = {}
                try:
                    stats_table = driver.find_element(By.ID, 'gsc_rsb_st')
                    ths = stats_table.find_element(By.TAG_NAME,'thead').find_elements(By.CLASS_NAME,'gsc_rsb_sth')
                    co_author['citation_table']['columns'] = [th.text for th in ths if th.text!=""]

                    trs = stats_table.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
                    for tr in trs:
                        row_index = tr.find_element(By.CLASS_NAME,'gsc_rsb_sc1').text
                        counts = [int(count.text) for count in tr.find_elements(By.CLASS_NAME,'gsc_rsb_std')]
                        co_author['citation_table'][row_index] = counts

                except Exception as e:
                    print(f"Error in Citation Table for main author:{profile['name']}, co_author:{co_author['name']}")
                    print(e)

                # some prof dont have co_authors
                try:
                    co_co_author_table = driver.find_elements(By.ID,'gsc_rsb_co')
                    if co_co_author_table:
                        co_co_author_view_btn = driver.find_elements(By.ID,'gsc_coauth_opn')
                        if co_co_author_view_btn:
                            co_co_author_view_btn[0].click()
                            time.sleep(4)

                            content = driver.find_element(By.ID,'gsc_codb_content').get_attribute('outerHTML')
                            soup = BeautifulSoup(content,'html.parser')
                            co_co_authors_list = soup.find_all(name='div', attrs={'class':'gsc_ucoar gs_scl'})

                            for co_co_author in co_co_authors_list:
                                desc = co_co_author.find(name='div', attrs={'class':'gs_ai_t gs_ai_pss'})
                                name_div = desc.find(name='h3', attrs={'class': 'gs_ai_name'})
                                co_co_author_url = name_div.find(name='a').get('href')

                                # BASE_URL ends with '/' here; strip it so the
                                # result has a single slash, like the co-author
                                # URLs stored in the profile files.
                                co_author_network[co_author_url].append(BASE_URL.rstrip('/') + co_co_author_url)

                            driver.find_element(By.ID,'gsc_md_cod-x').click()
                        else:
                            co_co_authors_list = co_co_author_table[0].find_element(By.CLASS_NAME,'gsc_rsb_a').find_elements(By.TAG_NAME,'li')
                            for co_co_author in co_co_authors_list:
                                co_co_author_url = co_co_author.find_element(By.CLASS_NAME,'gsc_rsb_a_desc').find_element(By.TAG_NAME,'a').get_attribute('href')

                                co_author_network[co_author_url].append(co_co_author_url)

                except Exception as e:
                    print(f"Error in Co-Co-Authors Table for Co-Author {co_author['name']}")
                    print(e)
            finally:
                # Release the browser even if loading or parsing the page failed.
                driver.quit()

        # Persist the profile, now enriched with the co-authors' citation tables.
        with open(filepath,'w') as f:
            json.dump(profile,f)

    with open(f"{co_author_dir}{filename}.json",'w') as f:
        json.dump(co_author_network,f)

            
            

100%|██████████| 59/59 [09:21<00:00,  9.51s/it]
100%|██████████| 22/22 [03:24<00:00,  9.30s/it]
100%|██████████| 13/13 [01:42<00:00,  7.91s/it]
100%|██████████| 32/32 [05:03<00:00,  9.49s/it]
100%|██████████| 43/43 [06:42<00:00,  9.35s/it]
100%|██████████| 166/166 [24:10<00:00,  8.74s/it]
100%|██████████| 12/12 [02:02<00:00, 10.20s/it]
100%|██████████| 10/10 [01:48<00:00, 10.87s/it]
100%|██████████| 51/51 [08:23<00:00,  9.86s/it]
100%|██████████| 16/16 [02:42<00:00, 10.15s/it]
0it [00:00, ?it/s]
100%|██████████| 51/51 [08:13<00:00,  9.67s/it]
100%|██████████| 35/35 [04:59<00:00,  8.54s/it]
100%|██████████| 14/14 [01:55<00:00,  8.25s/it]
100%|██████████| 32/32 [04:09<00:00,  7.78s/it]
0it [00:00, ?it/s]
100%|██████████| 39/39 [05:35<00:00,  8.60s/it]
100%|██████████| 17/17 [02:35<00:00,  9.17s/it]
100%|██████████| 13/13 [02:06<00:00,  9.72s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 16/16 [02:51<00:00, 10.74s/it]
100%|██████████| 1/1 [00:08<00:00,  8.22s/it]
0it [00:00, 

Error in Citation Table for main author:Chen Change Loy, co_author:Yuxin Jiang
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="gsc_rsb_st"]"}
  (Session info: headless chrome=118.0.5993.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001071e6e08 chromedriver + 5025288
1   chromedriver                        0x00000001071ddc23 chromedriver + 4987939
2   chromedriver                        0x0000000106d7fe67 chromedriver + 409191
3   chromedriver                        0x0000000106dcf1b9 chromedriver + 733625
4   chromedriver                        0x0000000106dcf371 chromedriver + 734065
5   chromedriver                        0x0000000106e15194 chromedriver + 1020308
6   chromedriver                        0x0000000106df650d chromedriver + 894221
7   chromedriver                   

 97%|█████████▋| 174/180 [26:21<00:51,  8.56s/it]

Error in Citation Table for main author:Chen Change Loy, co_author:Wayne Wu
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="gsc_rsb_st"]"}
  (Session info: headless chrome=118.0.5993.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104ba7e08 chromedriver + 5025288
1   chromedriver                        0x0000000104b9ec23 chromedriver + 4987939
2   chromedriver                        0x0000000104740e67 chromedriver + 409191
3   chromedriver                        0x00000001047901b9 chromedriver + 733625
4   chromedriver                        0x0000000104790371 chromedriver + 734065
5   chromedriver                        0x00000001047d6194 chromedriver + 1020308
6   chromedriver                        0x00000001047b750d chromedriver + 894221
7   chromedriver                      

100%|██████████| 180/180 [27:21<00:00,  9.12s/it]
0it [00:00, ?it/s]
100%|██████████| 36/36 [05:05<00:00,  8.48s/it]
100%|██████████| 37/37 [05:07<00:00,  8.30s/it]
0it [00:00, ?it/s]
100%|██████████| 45/45 [06:22<00:00,  8.50s/it]
100%|██████████| 18/18 [02:47<00:00,  9.32s/it]
100%|██████████| 48/48 [07:13<00:00,  9.02s/it]
100%|██████████| 4/4 [00:35<00:00,  8.79s/it]
0it [00:00, ?it/s]
100%|██████████| 189/189 [28:05<00:00,  8.92s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 136/136 [20:23<00:00,  9.00s/it]
100%|██████████| 4/4 [00:33<00:00,  8.32s/it]
100%|██████████| 45/45 [07:38<00:00, 10.19s/it]
100%|██████████| 80/80 [12:11<00:00,  9.14s/it]
0it [00:00, ?it/s]
100%|██████████| 13/13 [02:05<00:00,  9.65s/it]
100%|██████████| 188/188 [30:18<00:00,  9.67s/it]
0it [00:00, ?it/s]
100%|██████████| 9/9 [01:09<00:00,  7.75s/it]
100%|██████████| 19/19 [03:21<00:00, 10.58s/it]
100%|██████████| 70/70 [11:23<00:00,  9.76s/it]
100%|██████████| 27/27 [04:14<00:00,  9.42s/it]
0

In [60]:
# Re-scrape the co-author pages of every profile listed in `error_file_path`.
#
# For each co-author of a profile this cell:
#   * stores the citation table found on the co-author's page under
#     co_author['citation_table'] (left as {} when it cannot be parsed), and
#   * collects the co-author's own co-authors into `co_author_network`, a
#     mapping of profile URL -> list of co-author URLs.
# The updated profile is written back to `filepath`; the network is written to
# `co_author_dir`.

# Pause (seconds) after opening the "view all co-authors" dialog, and how many
# times that pause is repeated while the dialog content is still missing.
MODAL_WAIT_SECONDS = 4
MODAL_WAIT_ATTEMPTS = 3

for filepath in error_file_path:
    # NOTE(review): the slice assumes a fixed 25-character prefix and a
    # 5-character suffix (presumably ".json") on every path -- confirm.
    filename = filepath[25:-5]
    with open(filepath, 'r') as f:
        profile = json.load(f)

    co_author_network = {}
    if 'co_authors_url' in profile:
        author_url = profile['goog_sch_url']
        co_author_network[author_url] = []

        for co_author in tqdm(profile['co_authors_url'], position=0, leave=True):
            co_author_url = co_author['url']
            co_author_network[author_url].append(co_author_url)
            co_author_network[co_author_url] = []

            driver = create_driver()
            # try/finally guarantees the browser is shut down even when the
            # page load (or anything else below) raises.
            try:
                driver.get(co_author_url)
                time.sleep(3)  # wait for the page to load

                co_author['citation_table'] = {}
                try:
                    # Build the table locally and attach it only once it has
                    # been parsed completely, so a failure half-way through
                    # never leaves a partially filled table on the profile.
                    stats_table = driver.find_element(By.ID, 'gsc_rsb_st')

                    header_cells = stats_table.find_element(By.TAG_NAME, 'thead').find_elements(By.CLASS_NAME, 'gsc_rsb_sth')
                    header_texts = [th.text for th in header_cells]
                    citation_table = {'columns': [text for text in header_texts if text != ""]}

                    body_rows = stats_table.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
                    for tr in body_rows:
                        row_index = tr.find_element(By.CLASS_NAME, 'gsc_rsb_sc1').text
                        citation_table[row_index] = [
                            int(cell.text) for cell in tr.find_elements(By.CLASS_NAME, 'gsc_rsb_std')
                        ]

                    co_author['citation_table'] = citation_table

                except Exception as e:
                    # Best effort: report the failure and carry on with the
                    # co-co-author extraction for this co-author.
                    print(f"Error in Citation Table for main author:{profile['name']}, co_author:{co_author['name']}")
                    print(e)

                # Some profiles have no co-author section at all.
                try:
                    co_co_author_tables = driver.find_elements(By.ID, 'gsc_rsb_co')
                    if co_co_author_tables:
                        co_co_author_table = co_co_author_tables[0]

                        view_all_buttons = driver.find_elements(By.ID, 'gsc_coauth_opn')
                        if view_all_buttons:
                            # A "view all" button exists: open the dialog and
                            # read the full list from it.
                            view_all_buttons[0].click()

                            # The dialog content is not always present after a
                            # single pause; wait a bounded number of times.
                            for _ in range(MODAL_WAIT_ATTEMPTS):
                                time.sleep(MODAL_WAIT_SECONDS)
                                if driver.find_elements(By.ID, 'gsc_codb_content'):
                                    break

                            # Still raises NoSuchElementException (handled
                            # below) if the content never appeared.
                            content = driver.find_element(By.ID, 'gsc_codb_content').get_attribute('outerHTML')
                            soup = BeautifulSoup(content, 'html.parser')
                            co_co_authors_list = soup.find_all(name='div', attrs={'class': 'gsc_ucoar gs_scl'})

                            for co_co_author in co_co_authors_list:
                                desc = co_co_author.find(name='div', attrs={'class': 'gs_ai_t gs_ai_pss'})
                                name_div = desc.find(name='h3', attrs={'class': 'gs_ai_name'})
                                co_co_author_url = name_div.find(name='a').get('href')

                                # hrefs parsed from the dialog HTML are
                                # prefixed with BASE_URL before being stored.
                                co_author_network[co_author_url].append(BASE_URL + co_co_author_url)

                            driver.find_element(By.ID, 'gsc_md_cod-x').click()
                        else:
                            # No dialog button: read the list shown in the
                            # co-author section itself.
                            co_co_authors_list = co_co_author_table.find_element(By.CLASS_NAME, 'gsc_rsb_a').find_elements(By.TAG_NAME, 'li')
                            for co_co_author in co_co_authors_list:
                                co_co_author_url = co_co_author.find_element(By.CLASS_NAME, 'gsc_rsb_a_desc').find_element(By.TAG_NAME, 'a').get_attribute('href')

                                co_author_network[co_author_url].append(co_co_author_url)

                except Exception as e:
                    print(f"Error in Co-Co-Authors Table for Co-Author {co_author['name']}")
                    print(e)

            finally:
                driver.quit()

        # Persist the profile with the refreshed citation tables.
        with open(filepath, 'w') as f:
            json.dump(profile, f)

    # The network file is written for every profile, even when it is empty.
    with open(f"{co_author_dir}{filename}.json", 'w') as f:
        json.dump(co_author_network, f)

            
            

100%|██████████| 38/38 [05:57<00:00,  9.41s/it]
 75%|███████▌  | 21/28 [03:41<01:19, 11.39s/it]

Error in Co-Co-Authors Table for Co-Author Yangfeng Ji
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="gsc_codb_content"]"}
  (Session info: headless chrome=118.0.5993.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000107477e08 chromedriver + 5025288
1   chromedriver                        0x000000010746ec23 chromedriver + 4987939
2   chromedriver                        0x0000000107010e67 chromedriver + 409191
3   chromedriver                        0x00000001070601b9 chromedriver + 733625
4   chromedriver                        0x0000000107060371 chromedriver + 734065
5   chromedriver                        0x00000001070a6194 chromedriver + 1020308
6   chromedriver                        0x000000010708750d chromedriver + 894221
7   chromedriver                        0x00000001070

100%|██████████| 28/28 [05:00<00:00, 10.73s/it]
100%|██████████| 25/25 [04:02<00:00,  9.69s/it]
