In [9]:
from bs4 import BeautifulSoup
import csv
import requests, re
import pandas as pd

# Get Tables

Get the HTML object of IGP score tables

In [10]:
igp_link = "https://docs.google.com/spreadsheets/d/1MPEDZpw26TjN7dTsQzsbnXHZa47og0qSrdHrlT7nLKc/pubhtml#"
html = requests.get(f"{igp_link}").text
soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all("table")
index = 0
# for table in tables:
#     with open(str(index) + ".csv", "w") as f:
#         wr = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
#         wr.writerows([[td.text for td in row.find_all("td")] for row in table.find_all("tr")])
#     index = index + 1


# Extract IGP Data

Extracting the raw IGP tables from the spreadsheet link

In [11]:
nus_igp_df = pd.read_html(str(tables[1]))
# Object returned is a list of dataframes with ONE element. Need to convert 
nus_igp_df = nus_igp_df[0]

In [12]:
nus_igp_df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,1,NUS Courses | Admitted Batch >,2023/2024,2022/2023,2021/2022,2020/2021,2019/2020,2018/2019,2017/2018,2016/2017,2015/2016,2014/2015,2013/2014,2012/2013,2011/2012,2010/2011,2009/2010,2008/2009
1,2,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law,Faculty of Law
2,3,Law*,AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85),AAA/A (85)
3,4,Law*,3.75,3.82,3.87,3.81,3.86,3.77,3.80,3.84,3.83,3.79,#,#,#,#,#,#
4,5,Law*,233,244,234,219,223,250,227,237,253,242,244,262,239,240,255,231


In [13]:
## NTU Table
ntu_igp_df = pd.read_html(str(tables[2]))
# Object returned is a list of dataframes with ONE element. Need to convert 
ntu_igp_df = ntu_igp_df[0]

In [14]:
# SMU Table
smu_igp_df = pd.read_html(str(tables[3]))
# Object returned is a list of dataframes with ONE element. Need to convert 
smu_igp_df = smu_igp_df[0]

In [15]:
# # Dump all to /data/ folder. Will do adjustment in google sheets
# nus_igp_df.to_csv("../data/raw_scraped/nus_igp_raw.csv", index=False)
# ntu_igp_df.to_csv("../data/raw_scraped/ntu_igp_raw.csv", index=False)
# smu_igp_df.to_csv("../data/raw_scraped/smu_igp_raw.csv", index=False)

# Pivoting

Have uploaded the scraped tables above to google sheets and done the manual tidying up + removal of selected courses (e.g. those lacking sufficient data). 

Moving on to pivoting the data into a [**tidy format**](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). 

Ultimately each row should be indexed by the Course-AY, and each row should have the fields:
- Faculty
- RP (10th Percentile Rank Points)
- GPA (10th Percentile GPA)
- Places 

In [28]:
# Read in processed files
ntu_processed = pd.read_excel("../data/scraped_processed/ntu_igp_raw.xlsx", 
sheet_name=0)
nus_processed = pd.read_excel("../data/scraped_processed/nus_igp_raw.xlsx", sheet_name=0)
smu_processed = pd.read_excel("../data/scraped_processed/smu_igp_raw.xlsx", sheet_name=0)

# Drop the Course_old field. This field recorded the original course names as they appeared in the original spreadsheet. used for tracking changes
ntu_processed.drop('Course_old', axis=1, inplace=True)
smu_processed.drop('Course_old', axis=1, inplace=True)
nus_processed.drop('Course_old', axis=1, inplace=True)

KeyError: "['Course_old'] not found in axis"

In [25]:
# Toy Example
tmp = smu_processed.loc[smu_processed.Course=='Accountancy']
# Pivot Wide to long for yearly info
tmp = pd.melt(tmp
        , id_vars=['Course', 'Type']
        , value_vars=tmp.columns.difference(['Course', 'Type'])
        , value_name='value'
        , var_name='AY'
        )
# Pivot Long to wide into tidy format
tmp = tmp.pivot(index = ['Course', 'AY'], columns='Type', values='value')
tmp.columns.name=None # Replace columnset name
# tmp=tmp.reset_index()

In [19]:
def pivot_processed(df, id_vars, pivot_index):
    pivoted = df.copy()
    pivoted = pd.melt(pivoted
            , id_vars=id_vars
            , value_vars=pivoted.columns.difference(id_vars)
            , value_name='value'
            , var_name='AY'
            )
    pivoted = pivoted.pivot(index = pivot_index, columns='Type', values='value')
    pivoted.columns.name=None # Replace columnset name
    return pivoted

In [20]:
# Process for SMU
smu_pivoted = pivot_processed(smu_processed, id_vars = ['Course', 'Type'], pivot_index=['Course','AY'])
smu_pivoted=smu_pivoted.reset_index()
# Process for NTU. Will have an additional faculty column
ntu_pivoted = pivot_processed(ntu_processed, id_vars = ['Course', 'Type', 'Faculty'], pivot_index=['Faculty', 'Course', 'AY'])
ntu_pivoted=ntu_pivoted.reset_index(drop=False)
# Process for NUS. Will have an additional faculty column
nus_pivoted = pivot_processed(nus_processed, id_vars = ['Course', 'Type', 'Faculty'], pivot_index=['Faculty', 'Course', 'AY'])
nus_pivoted=nus_pivoted.reset_index(drop=False)

In [22]:
nus_pivoted.Course.unique()

array(['Architecture*', 'Biomedical Engineering (until AY2021/22)',
       'Chemical Engineering (until AY2021/22)',
       'Civil Engineering (until AY2021/22)',
       'Electrical Engineering (until AY2021/22)',
       'Engineering (Common) (Not offered as a choice from AY2020/21)',
       'Engineering (from AY2022/23)',
       'Engineering Science (until AY2021/22)',
       'Environmental Engineering (until AY2021/22)',
       'Industrial & Systems Engineering (until AY2021/22)',
       'Industrial Design*', 'Landscape Architecture*',
       'Materials Science & Engineering (until AY2021/22)',
       'Mechanical Engineering (until AY2021/22)',
       'Project & Facilities Management/ Infrastructure & Project Management (until AY2021/22, under CDE Engineering from AY2022/23)',
       'Computer Engineering (under CDE from AY2022/23)',
       'Arts & Social Sciences (until AY2020/21)',
       'Data Science & Analytics (until AY2020/21)',
       'Data Science & Economics (XDP)',
       

In [None]:
smu_igp_df

Unnamed: 0,Faculty,Course,AY,GPA,Places,RP
0,College of Design & Engineering,Architecture*,2008/2009,#,171,BCC/C (70)
1,College of Design & Engineering,Architecture*,2009/2010,3.4,159,BBB/C (75)
2,College of Design & Engineering,Architecture*,2010/2011,3.42,148,BBB/B (76.25)
3,College of Design & Engineering,Architecture*,2011/2012,3.58,162,BBB/C (75)
4,College of Design & Engineering,Architecture*,2012/2013,3.83,135,ABB/B (78.75)
...,...,...,...,...,...,...
651,School of Medicine,Nursing*,2019/2020,3.33,311,CCD/B (66.25)
652,School of Medicine,Nursing*,2020/2021,3.25,293,CCD/B (66.25)
653,School of Medicine,Nursing*,2021/2022,3.41,338,CCC/C (67.5)
654,School of Medicine,Nursing*,2022/2023,3.42,314,CCD/B (66.25)


In [1]:
import pandas as pd

In [4]:
tdf = pd.read_csv("../tmp.txt", header=None)

In [8]:
print(tdf[0].tolist())

['Bachelor of Arts', 'Bachelor of Arts (Hons)', 'Bachelor of Social Sciences', 'Business Administration', 'Business Administration (Hons)', 'Accountancy (Hons)', 'Communications and Media', 'Electronic Commerce', 'Information Systems', 'Bachelor of Dental Surgery', 'Industrial Design', 'Project and Facilities Management', 'Real Estate', 'Biomedical Engineering', 'Chemical Engineering', 'Electrical Engineering', 'Engineering Science', 'Industrial and Systems Engineering', 'Materials Science and Engineering', 'Bachelor of Laws', 'Bachelor of Medicine and Bachelor of Surgery', 'Bachelor of Science (Nursing)', 'Bachelor of Science (Nursing) (Hons)', 'Bachelor of Applied Science (Hons)', 'Bachelor of Science', 'Bachelor of Science (Hons)', 'Pharmacy', 'Architecture', 'Bachelor of Music', 'Environmental Studies', 'Business Analytics', 'Bachelor of Arts with Honours', 'Bachelor of Science with Honours', 'Computational Biology', 'Data Science and Analytics', 'Information Security']
