In [2]:
import itertools
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString
from thefuzz import fuzz

# Extract DR-NTU

1. DR-NTU
    - Name, Email, Designations, Research Keywords, Patents, Grants, Biography, External Websites, Image

## Extract SCSE Profile

In [2]:
def extract_scse_profiles():
    """Scrape the paginated DR-NTU researcher listing for SCSE.

    Pages are requested ``rpp`` rows at a time until a page has no result
    table or no data rows.

    Returns:
        pandas.DataFrame with the columns ``email``, ``name`` and
        ``dr_ntu_url`` (the absolute URL of each researcher's profile page).

    Raises:
        requests.RequestException: if a listing page cannot be fetched
            (network failure, timeout, or an HTTP error status).
    """
    # rpp query parameter specifies number of rows to display
    # start query parameter specifies which row to start displaying from.
    start = 0
    rpp = 50
    timeout = 30  # seconds; requests waits indefinitely when no timeout is given
    dr_ntu = {"email": [], 'name': [], "dr_ntu_url": []}
    while True:
        SCSE_list_url = f"https://dr.ntu.edu.sg/simple-search?query=&location=researcherprofiles&filter_field_1=school&filter_type_1=authority&filter_value_1=ou00030&crisID=&relationName=&sort_by=bi_sort_4_sort&order=asc&rpp={rpp}&etal=0&start={start}"

        response = requests.get(SCSE_list_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of treating an error page as
        # "no more results" and silently returning a partial listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        table = soup.find('table')
        if table is None:
            break

        # Skip Header Column
        data_rows = table.find_all('tr')[1:]
        if not data_rows:
            # A table holding only its header means we are past the last
            # page; without this guard the loop would never terminate.
            break

        for row in data_rows:
            name_cell = row.find(name='td', headers='t1')
            email_cell = row.find(name='td', headers='t3')
            link = name_cell.find(name='a') if name_cell is not None else None
            href = link.get('href') if link is not None else None
            if not href or email_cell is None:
                # Rows without a profile link or an email cell cannot be
                # processed downstream, so skip them rather than crash.
                continue

            dr_ntu['name'].append(name_cell.text)
            dr_ntu['dr_ntu_url'].append("https://dr.ntu.edu.sg" + href)
            dr_ntu['email'].append(email_cell.text)

        start += rpp
    return pd.DataFrame(dr_ntu)


In [3]:
# Scrape the SCSE researcher listing and persist it so later cells can reload
# it from disk without hitting the website again.
scse_profiles = extract_scse_profiles()
# Save to csv file in prof_raw_data dir
raw_dir = './prof_raw_data/'
filename = 'scse_profiles'
filepath = f"{raw_dir}{filename}.csv"

# to_csv() does not create missing parent folders, so make sure the output
# directory exists; otherwise this cell fails on a fresh checkout.
os.makedirs(raw_dir, exist_ok=True)
# The DataFrame index is written as the first (unnamed) CSV column.
scse_profiles.to_csv(filepath)
    

# Extract Individual's Information from DR-NTU

In [9]:
_REQUEST_TIMEOUT = 30  # seconds; requests.get() waits indefinitely unless a timeout is given


def _text_after(tag):
    """Return the stripped text node directly following ``tag``, or '' if there is none."""
    sibling = tag.next_sibling
    # next_sibling can be None or another Tag; only text nodes support str.strip().
    return sibling.strip() if isinstance(sibling, NavigableString) else ''


def _link_key(label):
    """Turn link text into a dict key: trimmed, lower-cased, spaces replaced by underscores."""
    # Strip BEFORE replacing spaces, otherwise padding becomes leading/trailing underscores.
    return label.strip().lower().replace(' ', '_')


def _download_profile_image(html, file_stem):
    """Save the profile picture to ./profile_image/ and return the path recorded in the profile.

    Returns None when the page has no picture or the image request does not succeed.
    """
    img = html.find(name='img', attrs={'id': 'picture'})
    src = img.get('src') if img is not None else None
    if not src:
        return None

    response = requests.get(f"https://dr.ntu.edu.sg{src}", timeout=_REQUEST_TIMEOUT)
    if not response.ok:
        # Do not store an HTTP error page under a .jpg name.
        return None

    os.makedirs('./profile_image', exist_ok=True)
    with open(f"./profile_image/{file_stem}.jpg", 'wb') as f:
        f.write(response.content)

    # NOTE(review): the recorded path differs from the location written above;
    # presumably it is relative to the consuming project's root - confirm.
    return f"./data_source/profile_image/{file_stem}.jpg"


def _scrape_designations(name_span):
    """Collect the text of the id-less <div> siblings that follow the name card's parent."""
    designations = []
    if name_span is None or name_span.parent is None:
        return designations
    div = name_span.parent.find_next_sibling('div')
    # Stop at the first sibling that carries an id, or when siblings run out.
    while div is not None and div.get('id') is None:
        designations.append(div.text.strip())
        div = div.find_next_sibling('div')
    return designations


def _collect_personal_sites(html, urls):
    """Add the links found in #personalsiteDiv to ``urls`` (mutated in place)."""
    container = html.find(name='div', attrs={'id': 'personalsiteDiv'})
    if container is None:
        return
    for i, a_tag in enumerate(container.find_all(name='a')):
        url = a_tag.get('href')
        if url is None or url == '#':
            continue
        # first one is always personal website: fix the key to 'personal'
        # instead of what is typed as text in html
        key = 'personal' if i == 0 else _link_key(a_tag.text)
        urls[key] = url


def _scrape_grants(html):
    """Return the grant entries listed in #currentgrantsDiv (empty list if absent)."""
    container = html.find(name='div', attrs={'id': 'currentgrantsDiv'})
    if container is None:
        return []

    # first case is encapsulated by li tag
    bullet_list = container.find(name='ul')
    if bullet_list is not None:
        return [li_tag.text.strip() for li_tag in bullet_list.find_all('li')]

    # second case: plain text entries, each taken from the text node after a <br>
    grants = []
    for child in container.children:
        if getattr(child, 'name', None) == 'br':
            grant = _text_after(child)
            if grant:
                grants.append(grant)
    return grants


def _scrape_patents(html):
    """Return a list of {'link', 'title', 'abstract'} dicts from #centralpatentsDiv."""
    container = html.find(name='div', attrs={'id': 'centralpatentsDiv'})
    if container is None:
        return []

    links, titles = [], []
    for a_tag in container.find_all('a'):
        bold = a_tag.find('b')
        links.append(a_tag.get('href'))
        # Fall back to the anchor's own text when the title is not wrapped in <b>.
        titles.append(bold.text if bold is not None else a_tag.text)
    # Keep '' placeholders here so abstracts stay aligned with links/titles in zip().
    abstracts = [_text_after(u_tag) for u_tag in container.find_all('u')]

    return [
        {'link': link, 'title': title, 'abstract': abstract}
        for link, title, abstract in zip(links, titles, abstracts)
    ]


def _collect_bibliometric_links(dr_ntu_url, urls):
    """Add links from the profile's selectedPublications page to ``urls`` (mutated in place)."""
    publication_url = dr_ntu_url + '/selectedPublications.html'
    resp = requests.get(publication_url, timeout=_REQUEST_TIMEOUT)
    html = BeautifulSoup(resp.text, "html.parser")
    container = html.find(name='div', attrs={'id': 'custombiblio'})
    if container is None:
        return
    for field in container.find_all(name='div', attrs={'class': 'dynaField'}):
        a_tag = field.find(name='a')
        label = a_tag.find(name='span') if a_tag is not None else None
        if label is None:
            continue
        url = a_tag.get('href')
        key = _link_key(label.text)
        # Skip empty links and keys that already hold a URL. Keys pre-seeded
        # with None (e.g. 'orcid') are placeholders and may still be filled.
        if url and urls.get(key) is None:
            urls[key] = url


def extract_dr_ntu(row):
    """Scrape one researcher's DR-NTU profile and save it as JSON.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing the keys
            ``name``, ``dr_ntu_url`` and ``email``.

    Returns:
        None. Side effects: writes ``./prof_raw_data/dr_ntu_<name>.json`` and,
        when a picture is available, ``./profile_image/<name>.jpg``, where
        ``<name>`` is the lower-cased name with spaces replaced by underscores.
        Sections missing from the page are stored as None (name card,
        biography, image path) or as empty lists instead of raising.

    Raises:
        requests.RequestException: if the profile page cannot be fetched
            (network failure, timeout, or an HTTP error status).
    """
    name = row['name']
    dr_ntu_url = row['dr_ntu_url']
    ntu_email = row['email']
    file_stem = name.lower().replace(' ', '_')

    profile = {}
    profile['full_name'] = name
    profile['email'] = ntu_email

    response = requests.get(dr_ntu_url, timeout=_REQUEST_TIMEOUT)
    # An error page would otherwise be parsed into an empty-looking profile.
    response.raise_for_status()
    html = BeautifulSoup(response.text, 'lxml')

    # scrape the text on the name card class
    name_span = html.find(name='span', attrs={'class': 'namecard-fullname'})
    profile['name_card'] = name_span.text.strip() if name_span is not None else None

    profile['image_path'] = _download_profile_image(html, file_stem)

    # scraping designations can have multiple designation
    profile['designations'] = _scrape_designations(name_span)

    urls = {'dr_ntu': dr_ntu_url, 'orcid': None, 'personal': None}
    # scrape all personal websites from personalsiteDiv
    _collect_personal_sites(html, urls)

    # scrape biography
    biography_div = html.find(name='div', attrs={'id': 'biographyDiv'})
    profile['biography'] = biography_div.text.strip() if biography_div is not None else None

    # scrape keywords
    keywords = []
    if html.find(name='div', attrs={'id': 'researchkeywords', 'class': 'panel'}) is not None:
        # The keyword spans are looked up page-wide once the keyword panel is known to exist.
        keywords = [
            keyword_span.text.strip()
            for keyword_span in html.find_all(name='span', attrs={'class': 'rkeyword'})
        ]
    profile['keywords'] = keywords

    # scrape research grants
    profile['grants'] = _scrape_grants(html)

    # scrape patents
    profile['patents'] = _scrape_patents(html)

    # scrape bibliometric links from the selected-publications page
    _collect_bibliometric_links(dr_ntu_url, urls)
    profile['urls'] = urls

    # open() does not create missing folders, so ensure the output directory exists.
    os.makedirs('./prof_raw_data', exist_ok=True)
    filepath = f"./prof_raw_data/dr_ntu_{file_stem}.json"
    with open(filepath, 'w') as f:
        json.dump(profile, f)

    return

In [10]:
# The CSV's first column is the DataFrame index written by to_csv(); restore it
# as the index instead of loading it as a spurious "Unnamed: 0" data column.
scse_profile_df = pd.read_csv('./prof_raw_data/scse_profiles.csv', index_col=0)

In [11]:
# Scrape every listed profile, one row at a time. extract_dr_ntu writes its
# files as a side effect and returns None, hence the all-None Series below.
scse_profile_df.apply(extract_dr_ntu, axis="columns")

0     None
1     None
2     None
3     None
4     None
      ... 
81    None
82    None
83    None
84    None
85    None
Length: 86, dtype: object

## Extract About Us

- Introduction Text from NTU Website

In [10]:
def _split_on_br(p_tag):
    """Return the non-empty, stripped text segments of ``p_tag``, split at its <br> tags."""
    segments, current = [], []
    for node in p_tag.descendants:
        # Exact type check: skips comments and other special string subclasses.
        if type(node) is NavigableString:
            current.append(str(node))
        elif getattr(node, 'name', None) == 'br':
            segments.append(''.join(current))
            current = []
    segments.append(''.join(current))
    return [segment.strip() for segment in segments if segment.strip()]


# Fetch the SCSE "About Us" page and store its paragraph text as the school introduction.
url = "https://www.ntu.edu.sg/scse/about-us"
page = requests.get(url, timeout=30)  # requests waits indefinitely without a timeout
page.raise_for_status()
response = page.text
# Name the parser explicitly (as the other cells do) so the parse tree does not
# depend on whichever parser happens to be installed.
soup = BeautifulSoup(response, 'lxml')

# NOTE(review): paragraphs are collected from the whole page, not only from the
# element with class 'rte'; narrow the search if only that container is wanted.
p_tags = soup.find_all('p')
text = []
for p in p_tags:
    # get_text() drops all tags, so a literal '<br>' never appears in its
    # output; split on the actual <br> elements instead.
    text.extend(_split_on_br(p))

# open() does not create missing folders, so ensure the output directory exists.
os.makedirs('./prof_raw_data', exist_ok=True)
with open('./prof_raw_data/scse_intro.json','w') as f:
    scse_intro = {'intro':text}
    json.dump(scse_intro,f)
