# Importing Libraries used


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from scholarly import scholarly 

# Collecting Names, Emails and DR-NTU URLs from DR-NTU


In [2]:
# DR-NTU search page listing the school's researcher profiles.
# The query asks for up to 100 records (rpp=100) starting at offset 0, so a
# school with more than 100 profiles would need further pages.
url = "https://dr.ntu.edu.sg/simple-search?query=&location=researcherprofiles&filter_field_1=school&filter_type_1=authority&filter_value_1=ou00030&crisID=&relationName=&sort_by=bi_sort_4_sort&order=asc&rpp=100&etal=0&start=0"

# The timeout keeps the cell from hanging forever on a stalled connection and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content with Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

# The profiles are rows of the results table
results_table = soup.find(class_='table table-hover')
if results_table is None:
    raise RuntimeError("Results table not found on the DR-NTU search page")

# Drop the first <tr>, which is the header row
academic_profiles = results_table.find_all('tr')[1:]

# One entry per professor, kept in the same order across the three lists
dr_ntu_urls = []
full_names = []
emails = []

for profile_link in academic_profiles:

    # Cell with headers="t1" holds the name and the link to the profile page
    t1_element = profile_link.find('td', {'headers': 't1'})
    profile_anchor = t1_element.find('a', href=True) if t1_element is not None else None
    if profile_anchor is None:
        # Not a profile row (no name cell / no profile link): skip it entirely
        # so the three lists stay aligned.
        continue

    # Cell with headers="t3" holds the email address
    t3_element = profile_link.find('td', {'headers': 't3'})

    # Strip surrounding whitespace so later exact-match comparisons on the name work
    full_names.append(t1_element.text.strip())
    dr_ntu_urls.append('https://dr.ntu.edu.sg' + profile_anchor['href'])
    emails.append(t3_element.text.strip() if t3_element is not None else None)
    


# Collecting Profs' Personal Website URLs


In [3]:
# Visit each prof's DR-NTU profile page and collect the links listed in its
# personal-site section. website_url gets one entry per prof, in the same order
# as dr_ntu_urls: a list of links, or None when the page has no such section.

website_url = []

for url in dr_ntu_urls:
    # Fail loudly on a stalled connection or an HTTP error page; otherwise an
    # error page would silently be recorded as "no website".
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    websites = soup.find(id='personalsiteDiv')

    if websites is None:
        website_url.append(None)
        continue

    prof_urls = []
    # href=True skips anchors that carry no href attribute at all
    for anchor in websites.find_all('a', href=True):
        href = anchor['href']
        # A '#' link has no real target, so record the DR-NTU page itself
        prof_urls.append(url if href == '#' else href)
    website_url.append(prof_urls)
    
    

# Collecting Prof's DBLP Profile URLs


In [4]:
# Look up every prof on DBLP. dblp_urls gets one entry per name in full_names:
# the DBLP profile URL, or None when no profile could be identified.


## Run this when there are connection errors
# from requests.adapters import HTTPAdapter
# from urllib3.util.retry import Retry
# session = requests.Session()
# retry = Retry(connect=3, backoff_factor=0.5)
# adapter = HTTPAdapter(max_retries=retry)
# session.mount('http://', adapter)
# session.mount('https://', adapter)
# session.get(url)

dblp_urls = []
google_scholar_urls = []

# Affiliation text used to pick the right person out of several search hits
subtext_to_find = "Nanyang Technological University"

for names in full_names:
    # This prof is searched without the "Kelly" suffix of the DR-NTU listing name
    if names == "Ke Yiping, Kelly":
        names = "Ke Yiping"
    url = "https://dblp.org/search/author?q=" + names.replace(" ", "%20")

    response = requests.get(url, timeout=30)
    if not response.ok:
        # Surface the failure instead of treating an error page as "not found",
        # but still append so dblp_urls stays aligned with full_names.
        print(f"DBLP search for {names} failed with HTTP {response.status_code}")
        dblp_urls.append(None)
        continue

    # If the request ended on a URL containing ".html" (presumably DBLP
    # redirected the search straight to the author's page), use that URL.
    dblp_url = response.url
    match = re.search(r'^(.*?\.html)', dblp_url)
    if match:
        dblp_urls.append(match.group(1))
        continue

    # Otherwise scan the result page for an entry whose <small> text mentions NTU
    soup = BeautifulSoup(response.content, 'html.parser')
    person_link = None
    for small in soup.find_all('small'):
        if subtext_to_find not in small.get_text():
            continue
        # The enclosing <li> carries the link to the person's page
        parent_li = small.find_parent('li')
        anchor = parent_li.find('a', href=True) if parent_li is not None else None
        if anchor is not None:
            person_link = anchor['href'] + ".html"
            break
    dblp_urls.append(person_link)
    

# Getting Citations of Profs from Google Scholar


In [5]:
# Search Google Scholar (via Scholarly) for each prof, with the NTU affiliation
# added to the query, and take the first author returned.
# citations gets one entry per name in full_names (None when nothing is found).
# The test_* lists keep what Scholarly returned so wrong matches can be spotted.
test_name = []
test_citation = []
test_affiliation = []

citations = []
for names in full_names:
    # This prof is searched without the "Kelly" suffix of the DR-NTU listing name
    if names == "Ke Yiping, Kelly":
        names = "Ke Yiping"
    names = names + ', Nanyang Technological University'
    search_query = scholarly.search_author(names)

    # First hit only; None when the search returns no author at all
    author = next(search_query, None)
    if author is None:
        citations.append(None)
        # print(f"Professor {names} not found")
        continue

    # .get() avoids a KeyError when a field is absent from the result; reading
    # every field before appending keeps the three test lists the same length.
    cited_by = author.get('citedby')
    found_name = author.get('name')
    found_affiliation = author.get('affiliation')

    test_name.append(found_name)
    test_citation.append(cited_by)
    test_affiliation.append(found_affiliation)

    citations.append(cited_by)

# Checking if Scholarly returned incorrect results

In [6]:
# Table of what Scholarly returned (name, citation count, affiliation) for manual checking
data = dict(zip(
    ['Full Name', 'Citations', 'affiliation'],
    [test_name, test_citation, test_affiliation],
))
test_df = pd.DataFrame(data=data)

## Checking through the CSV file showed that Scholarly returned the wrong profile for Prof Li Fang

In [7]:
# test_df.to_csv('test.csv', index=False)

<img src="Screenshot 2023-09-20 at 4.30.50 PM.png">

# Putting everything nicely into a DataFrame


In [8]:
# Assemble the scraped lists into one table, one row per professor
data = dict(zip(
    ['Full Name', 'Email', 'DR-NTU URL', 'Website URL', 'DBLP URL', 'Citations'],
    [full_names, emails, dr_ntu_urls, website_url, dblp_urls, citations],
))
df = pd.DataFrame(data=data)

In [9]:
## Blank out Prof Li Fang's citation count first
is_li_fang = df['Full Name'].eq('Li Fang')
df.loc[is_li_fang, 'Citations'] = None

In [10]:
# Show Prof Li Fang's row to confirm the edit
df[df['Full Name'].eq('Li Fang')]

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
33,Li Fang,asfli@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01161,,https://dblp.org/pid/55/2162-9.html,


# Checking the fields of the data


In [11]:
# Number of missing values per field, one count per line
for column in ('Website URL', 'DBLP URL', 'Citations'):
    print(df[column].isna().sum())

17
17
17


# Profs with no website listed on DR-NTU

#### Double-checked their profiles on DR-NTU and there wasn't a link provided


In [12]:
# Rows whose Website URL is missing
no_website = df.loc[df['Website URL'].isna()]
no_website

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
8,Chee Wei Tan,cheewei.tan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02029,,https://dblp.org/pid/04/3030.html,5377.0
13,Deepu Rajan,asdrajan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01157,,https://dblp.org/pid/95/3115.html,4711.0
18,Goh Wooi Boon,aswbgoh@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00693,,https://dblp.org/pid/97/6922.html,1435.0
21,Huang Shell Ying,assyhuang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00643,,,1792.0
24,Josephine Chong,josephine.chong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02294,,,
30,Lau Chiew Tong,asctlau@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00670,,https://dblp.org/pid/30/6609.html,
31,Lee Bu Sung,ebslee@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00531,,https://dblp.org/pid/l/BuSungLee.html,14687.0
33,Li Fang,asfli@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01161,,https://dblp.org/pid/55/2162-9.html,
51,Oh Hong Lye,hloh@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00131,,https://dblp.org/pid/224/9431.html,225.0
52,Ong Chin Ann,chinann.ong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02123,,https://dblp.org/pid/59/4829.html,210.0


# Profs with no DBLP Profile


In [13]:
# Rows whose DBLP URL is missing.
# .copy() makes this an independent frame: no_dblp is assigned into later, and
# writing into a filtered selection of df triggers SettingWithCopyWarning.
no_dblp = df[pd.isna(df['DBLP URL'])].copy()
no_dblp

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
21,Huang Shell Ying,assyhuang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00643,,,1792.0
24,Josephine Chong,josephine.chong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02294,,,
28,Lam Kwok Yan,kwokyan.lam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00321,[https://orcid.org/0000-0001-7479-7970],,5544.0
39,Liu Weichen,liu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00280,"[https://personal.ntu.edu.sg/liu/, https://orc...",,2659.0
43,Long Cheng,c.long@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00772,[https://personal.ntu.edu.sg/c.long],,1819.0
44,Lu Shijian,shijian.lu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01128,[https://personal.ntu.edu.sg/shijian.lu/index....,,14285.0
48,Luu Anh Tuan,anhtuan.luu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01296,[https://tuanluu.github.io/],,4242.0
50,Mohamed M. Sabry,msabry@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00314,[http://www.ecs-97.webself.net/],,1498.0
61,Siyuan Liu,syliu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01753,,,682.0
63,Sourav Saha Bhowmick,assourav@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00915,"[http://www3.ntu.edu.sg/home/assourav/, https:...",,


In [14]:
# For every prof still missing a DBLP URL: compare the publication text on their
# DR-NTU page with the titles on each DBLP search candidate's page, and store the
# URL of the first candidate that shares a title in no_dblp['DBLP URL'].
#
# Messages print the plain name rather than the URL-encoded search string.


for index, row in no_dblp.iterrows():
    full_name = row["Full Name"]
    dr_ntu_url = row['DR-NTU URL']

    try:
        # Publications listed on the prof's DR-NTU page
        prof_pubs = dr_ntu_url + '/selectedPublications.html'
        dr_ntu_soup = BeautifulSoup(requests.get(prof_pubs, timeout=30).content, 'html.parser')
        journal_div = dr_ntu_soup.find(id="facultyjournalDiv")
        pubs_div = journal_div.find('div') if journal_div is not None else None
        if pubs_div is None or not pubs_div.text.strip():
            # Nothing to compare against. Skip this prof instead of reusing the
            # publication text left over from the previous loop iteration.
            print("No publications found for " + full_name + " in DR-NTU")
            continue
        pubs_in_drntu = pubs_div.text

        # Candidate author pages from the DBLP name search
        url = "https://dblp.org/search/author?q=" + full_name.replace(" ", "%20")
        response = requests.get(url, timeout=30)
        dblp_search_soup = BeautifulSoup(response.content, 'html.parser')
        links = dblp_search_soup.find(class_='result-list')
        if links is None:
            print('No dblp page found for ' + full_name)
            continue

        matched_url = None
        # Iterate over the anchors themselves; len() of a Tag counts its child
        # nodes, not its links.
        for candidate in links.find_all('a', href=True):
            dblp_page = requests.get(candidate['href'], timeout=30)
            dblp_home_soup = BeautifulSoup(dblp_page.content, 'html.parser')

            for title_tag in dblp_home_soup.find_all(class_='title'):
                # Drop the full stops from the DBLP title before comparing
                pubs_dblp = re.sub(r'\.', '', title_tag.text)
                if not pubs_dblp.strip():
                    # An empty pattern would match any text
                    continue
                # Case-insensitive containment check in either direction
                if re.search(re.escape(pubs_dblp), pubs_in_drntu, re.IGNORECASE) or re.search(re.escape(pubs_in_drntu), pubs_dblp, re.IGNORECASE):
                    matched_url = dblp_page.url
                    break

            if matched_url is not None:
                # Keep the first matching profile; do not let later candidates overwrite it
                break

        if matched_url is not None:
            no_dblp.at[index, 'DBLP URL'] = matched_url

    except requests.RequestException as error:
        # Network problems only; programming errors are no longer swallowed
        print('No dblp page found for ' + full_name + ' (' + str(error) + ')')


    

No publications found for Mohamed%20M.%20Sabry in DR-NTU
No dblp page found for Sourav%20Saha%20Bhowmick
No dblp page found for Tay%20Kian%20Boon
No publications found for Zinovi%20Rabinovich in DR-NTU


## Some of the profs actually have multiple DBLP profiles that are theirs, judging by the publications listed. Only the first matching DBLP profile link that was found is kept.

In [15]:
# Copy the DBLP URLs found so far back into df, then count what is still missing
df.update(no_dblp)
print(df['DBLP URL'].isna().sum())

8


In [16]:
# Rows still missing a DBLP URL.
# .copy() makes this an independent frame: values are assigned into no_dblp
# afterwards, and writing into a filtered selection of df triggers
# SettingWithCopyWarning.
no_dblp = df[pd.isna(df['DBLP URL'])].copy()
no_dblp

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
28,Lam Kwok Yan,kwokyan.lam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00321,[https://orcid.org/0000-0001-7479-7970],,5544.0
43,Long Cheng,c.long@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00772,[https://personal.ntu.edu.sg/c.long],,1819.0
50,Mohamed M. Sabry,msabry@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00314,[http://www.ecs-97.webself.net/],,1498.0
63,Sourav Saha Bhowmick,assourav@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00915,"[http://www3.ntu.edu.sg/home/assourav/, https:...",,
65,Tan Rui,tanrui@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00504,"[https://personal.ntu.edu.sg/tanrui/, https://...",,4273.0
67,Tay Kian Boon,kianboon.tay@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00174,,,
75,Wei Ying,ying.wei@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02297,[https://wei-ying.net/],,2780.0
85,Zinovi Rabinovich,zinovi@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00214,[zinovi.net],,


# Manually double-checking the DBLP URLs of the remaining profs against Google Scholar or ORCID, since only 8 are left


In [17]:
# Hand-checked DBLP profiles, keyed by the row label in no_dblp
manual_dblp_urls = {
    28: 'https://dblp.org/pid/10/1993.html',
    43: 'https://dblp.org/pid/58/10813.html',
    50: 'https://dblp.org/pid/74/5309.html',
    63: 'https://dblp.org/pid/b/SSBhowmick.html',
    65: "https://dblp.org/pid/00/5179.html",
    75: "https://dblp.org/pid/14/4899-1.html",
    85: "https://dblp.org/pid/93/4009.html",
    # Dr Tay Kian Boon has no Google Scholar Profile and no DBLP profile
    67: None,
}

for row_label, dblp_link in manual_dblp_urls.items():
    no_dblp.at[row_label, 'DBLP URL'] = dblp_link

In [18]:
# Copy the hand-checked DBLP URLs back into df, then count what is still missing
df.update(no_dblp)
print(df['DBLP URL'].isna().sum())

1


# Profs whose citations could not be found on Google Scholar


In [19]:
# Rows whose citation count is missing.
# .copy() makes this an independent frame: values are assigned into
# no_citations afterwards, and writing into a filtered selection of df triggers
# SettingWithCopyWarning.
no_citations = df[pd.isna(df['Citations'])].copy()
no_citations

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
7,Chan Syin,asschan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00691,[http://www3.ntu.edu.sg/home/asschan/],https://dblp.org/pid/80/2106.html,
15,Douglas Leslie Maskell,asdouglas@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01059,[http://www3.ntu.edu.sg/home/asdouglas/],https://dblp.org/pid/63/6663.html,
23,Jagath Chandana Rajapakse,asjagath@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00169,"[https://personal.ntu.edu.sg/asjagath, https:/...",https://dblp.org/pid/91/665.html,
24,Josephine Chong,josephine.chong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02294,,https://dblp.org/pid/267/0356.html,
25,Joty Shafiq Rayhan,srjoty@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00319,[https://raihanjoty.github.io/],https://dblp.org/pid/62/2078.html,
30,Lau Chiew Tong,asctlau@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00670,,https://dblp.org/pid/30/6609.html,
33,Li Fang,asfli@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01161,,https://dblp.org/pid/55/2162-9.html,
45,Luke Ong （翁之昊）,luke.ong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02044,[https://personal.ntu.edu.sg/luke.ong/index.html],https://dblp.org/pid/o/CHLukeOng.html,
56,"Pan, Sinno Jialin",sinnopan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00122,[https://personal.ntu.edu.sg/sinnopan/],https://dblp.org/pid/80/5412.html,
58,Quek Hiok Chai,ashcquek@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00707,[http://www.c2i.ntu.edu.sg/People/Research/Que...,https://dblp.org/pid/q/HiokChaiQuek.html,


# Double Checking for Citations on Google Scholar for Profs with NaN Citations


In [20]:
# Hand-checked citation counts, keyed by the row label in no_citations.
# None is assigned where no count was entered.
manual_citations = {
    # Prof Chan Syin has no Google Scholar Profile
    7: None,
    15: 5353,
    # Prof Josephine Chong has no Google Scholar Profile
    24: None,
    # Prof Rayhan has no Google Scholar Profile
    25: None,
    # Prof Lau Chiew Tong has no Google Scholar Profile
    30: None,
    45: 5933,
    56: 37502,
    58: None,
    63: 5868,
    66: None,
    67: None,
    68: None,
    70: None,
    73: None,
    85: 1126,
}

for row_label, citation_count in manual_citations.items():
    no_citations.at[row_label, 'Citations'] = citation_count


In [21]:
# Copy the hand-checked citation counts back into df, then count what is still missing
df.update(no_citations)
print(df['Citations'].isna().sum())

12


# Saving it to a CSV


In [22]:
# Write the final table to disk as UTF-8, without the row index column
output_csv = 'Assignment1.csv'
df.to_csv(output_csv, encoding='utf-8', index=False)

# Take a look at the DataFrame again

In [23]:
# Display the final table (the rendered output elides the middle rows with "...")
df

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
0,A S Madhukumar,asmadhukumar@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00083,"[http://www3.ntu.edu.sg/home/asmadhukumar/, ht...",https://dblp.org/pid/66/549.html,2906.0
1,Alexei Sourin,assourin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00274,"[http://www3.ntu.edu.sg/home/assourin/, https:...",https://dblp.org/pid/15/3108.html,2939.0
2,Anupam Chattopadhyay,anupam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01076,[https://scholar.google.co.in/citations?user=T...,https://dblp.org/pid/99/4535.html,6226.0
3,Anwitaman Datta,anwitaman@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00706,[https://personal.ntu.edu.sg/anwitaman/],https://dblp.org/pid/d/AnwitamanDatta.html,8043.0
4,Arvind Easwaran,arvinde@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00687,"[https://cps-research-group.github.io/, https:...",https://dblp.org/pid/73/1708.html,2816.0
...,...,...,...,...,...,...
81,Zhang Jie,zhangj@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00759,[https://personal.ntu.edu.sg/zhangj/],https://dblp.org/pid/84/6889-2.html,12214.0
82,Zhang Tianwei,tianwei.zhang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00562,[https://personal.ntu.edu.sg/tianwei.zhang/],https://dblp.org/pid/77/7902-4.html,2695.0
83,Zhao Jun,junzhao@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00501,[http://junzhaogroupntu.github.io],https://dblp.org/pid/47/2026-7.html,7030.0
84,Zheng Jianmin,asjmzheng@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00072,[https://personal.ntu.edu.sg/asjmzheng],https://dblp.org/pid/09/5452.html,7211.0


# And also the updated missing values

In [24]:
# Number of missing values per field after the manual fixes, one count per line
for column in ('Website URL', 'DBLP URL', 'Citations'):
    print(df[column].isna().sum())

17
1
12


# DBLP URLs

In [25]:
# Rows that still have no DBLP URL
no_dblp = df.loc[df['DBLP URL'].isna()]
no_dblp

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
67,Tay Kian Boon,kianboon.tay@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00174,,,


# Citations

In [26]:
# Rows that still have no citation count
no_citations = df.loc[df['Citations'].isna()]
no_citations

Unnamed: 0,Full Name,Email,DR-NTU URL,Website URL,DBLP URL,Citations
7,Chan Syin,asschan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00691,[http://www3.ntu.edu.sg/home/asschan/],https://dblp.org/pid/80/2106.html,
23,Jagath Chandana Rajapakse,asjagath@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00169,"[https://personal.ntu.edu.sg/asjagath, https:/...",https://dblp.org/pid/91/665.html,
24,Josephine Chong,josephine.chong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02294,,https://dblp.org/pid/267/0356.html,
25,Joty Shafiq Rayhan,srjoty@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00319,[https://raihanjoty.github.io/],https://dblp.org/pid/62/2078.html,
30,Lau Chiew Tong,asctlau@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00670,,https://dblp.org/pid/30/6609.html,
33,Li Fang,asfli@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01161,,https://dblp.org/pid/55/2162-9.html,
58,Quek Hiok Chai,ashcquek@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00707,[http://www.c2i.ntu.edu.sg/People/Research/Que...,https://dblp.org/pid/q/HiokChaiQuek.html,
66,Tang Xueyan,asxytang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00075,[https://personal.ntu.edu.sg/asxytang/],https://dblp.org/pid/23/2460.html,
67,Tay Kian Boon,kianboon.tay@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00174,,,
68,Thambipillai Srikanthan,astsrikan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00841,,https://dblp.org/pid/23/1694.html,
