In [2]:
import csv
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Get Tables

Get the HTML object of IGP score tables

In [2]:
# Published ("pubhtml") view of the IGP spreadsheet; the score tables are pulled
# out of this page as <table> elements and indexed by position in later cells.
igp_link = "https://docs.google.com/spreadsheets/d/1MPEDZpw26TjN7dTsQzsbnXHZa47og0qSrdHrlT7nLKc/pubhtml#"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed and fail confusingly further down.
response = requests.get(igp_link, timeout=30)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all("table")


# Extract IGP Data

Extracting the raw IGP tables from the spreadsheet link

In [3]:
# NUS table.
# pd.read_html returns a list of DataFrames; a single <table> element yields one
# entry, so take element 0 directly. The HTML is wrapped in StringIO because
# passing literal HTML strings to read_html is deprecated in recent pandas.
nus_igp_df = pd.read_html(StringIO(str(tables[1])))[0]

In [1]:
# Preview the first few rows of the scraped NUS table as a sanity check.
nus_igp_df.head()

NameError: name 'tables' is not defined

In [6]:
# NTU table.
# pd.read_html returns a list of DataFrames; a single <table> element yields one
# entry, so take element 0 directly. The HTML is wrapped in StringIO because
# passing literal HTML strings to read_html is deprecated in recent pandas.
ntu_igp_df = pd.read_html(StringIO(str(tables[2])))[0]

In [8]:
# SMU table.
# pd.read_html returns a list of DataFrames; a single <table> element yields one
# entry, so take element 0 directly. The HTML is wrapped in StringIO because
# passing literal HTML strings to read_html is deprecated in recent pandas.
smu_igp_df = pd.read_html(StringIO(str(tables[3])))[0]

In [9]:
# Persist each raw scraped table under ../data/raw_scraped/ (row index omitted).
# Further adjustment happens by hand in Google Sheets.
raw_exports = {
    "../data/raw_scraped/nus_igp_raw.csv": nus_igp_df,
    "../data/raw_scraped/ntu_igp_raw.csv": ntu_igp_df,
    "../data/raw_scraped/smu_igp_raw.csv": smu_igp_df,
}
for csv_path, raw_table in raw_exports.items():
    raw_table.to_csv(csv_path, index=False)

# Pivoting

The scraped tables above were uploaded to Google Sheets, where they were tidied up manually and selected courses (e.g. those lacking sufficient data) were removed.

Moving on to pivoting the data into a [**tidy format**](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). 

Ultimately, each row should be identified by a Course–AY (academic year) pair and should carry the following fields:
- Faculty
- RP (10th Percentile Rank Points)
- GPA (10th Percentile GPA)
- Places 

In [3]:
# Load the manually processed workbooks (first sheet of each), in NTU, NUS, SMU order.
ntu_processed, nus_processed, smu_processed = (
    pd.read_excel(workbook, sheet_name=0)
    for workbook in (
        "../data/scraped_processed/ntu_igp_raw.xlsx",
        "../data/scraped_processed/nus_igp_raw.xlsx",
        "../data/scraped_processed/smu_igp_raw.xlsx",
    )
)

In [4]:
# Preview the processed SMU sheet: one row per (Course, Type) with one column per AY.
smu_processed.head()

Unnamed: 0,Course,Type,2023/2024,2022/2023,2021/2022,2020/2021,2019/2020,2018/2019,2017/2018,2016/2017,2015/2016,2014/2015,2013/2014,2012/2013,2011/2012,2010/2011,2009/2010,2008/2009
0,Accountancy,RP,BBC/C (72.5),BBC/C (72.5),BBC/C (72.5),BBC/C (72.5),ABB/C (77.5),AAB/B (81.25),AAB/B (81.25),AAB/B (81.25),AAB/C (80),AAB/B (81.25),AAB/C (80),AAB/B (81.25),AAB/A (82.5),AAB/B (81.25),AAB/B (81.25),ABB/A (80)
1,Accountancy,GPA,3.63,3.63,3.54,3.53,3.68,3.7,3.7,3.72,3.71,3.72,3.72,3.73,3.78,3.76,3.77,3.6
2,Accountancy,Places,272,317,348,333,349,301,292,292,299,308,282,303,275,258,288,270
3,Business Management,RP,BBB/C (75),BBB/B (76.25),BBB/B (76.25),BBB/C (75),ABB/C (77.5),ABB/C (77.5),ABB/C (77.5),ABB/C (77.5),BBB/B (76.25),BBB/B (76.25),BBB/B (76.25),BBB/B (76.25),ABB/B (78.75),ABB/C (77.5),ABB/C (77.5),ABB/B (78.75)
4,Business Management,GPA,3.7,3.7,3.68,3.66,3.68,3.7,3.7,3.72,3.7,3.71,3.72,3.72,3.71,3.62,3.52,3.49


In [5]:
# Toy example: reshape a single course (Accountancy) before generalising.
key_cols = ['Course', 'Type']
tmp = smu_processed.loc[smu_processed.Course == 'Accountancy']
tmp = (
    # Wide -> long: every non-key column becomes an (AY, value) row.
    tmp.melt(
        id_vars=key_cols,
        value_vars=tmp.columns.difference(key_cols),
        var_name='AY',
        value_name='value',
    )
    # Long -> tidy: one row per (Course, AY), one column per Type.
    .pivot(index=['Course', 'AY'], columns='Type', values='value')
)
tmp.columns.name = None  # drop the leftover 'Type' label on the column axis
# The (Course, AY) index is kept here; tmp.reset_index() would flatten it.

In [6]:
def pivot_processed(df, id_vars, pivot_index, type_col='Type', var_name='AY'):
    """Reshape a wide per-year table into a tidy frame keyed by `pivot_index`.

    Every column of `df` that is not listed in `id_vars` is melted into a
    (`var_name`, value) pair, and the result is pivoted so that each distinct
    entry of `type_col` becomes its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table. It is not modified.
    id_vars : list of str
        Identifier columns to keep while melting; should include `type_col`.
    pivot_index : list of str
        Columns forming the index of the result; may include `var_name`.
    type_col : str, default 'Type'
        Column whose distinct values become the output columns.
    var_name : str, default 'AY'
        Name given to the melted column holding the former column headers.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by `pivot_index` with one column per `type_col` value
        and no name on the column axis.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when a (`pivot_index`, `type_col`)
        combination occurs more than once.
    """
    # pd.melt returns a new frame, so no defensive copy of `df` is needed.
    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=df.columns.difference(id_vars),
        var_name=var_name,
        value_name='value',
    )
    pivoted = long_df.pivot(index=pivot_index, columns=type_col, values='value')
    pivoted.columns.name = None  # drop the leftover `type_col` label on the column axis
    return pivoted

In [7]:
# SMU: rows are keyed by Course and AY only.
smu_pivoted = pivot_processed(
    smu_processed,
    id_vars=['Course', 'Type'],
    pivot_index=['Course', 'AY'],
).reset_index()

# NTU: carries an extra Faculty column, which is kept as part of the key.
ntu_pivoted = pivot_processed(
    ntu_processed,
    id_vars=['Course', 'Type', 'Faculty'],
    pivot_index=['Faculty', 'Course', 'AY'],
).reset_index()

# NUS: same layout as NTU (extra Faculty column).
nus_pivoted = pivot_processed(
    nus_processed,
    id_vars=['Course', 'Type', 'Faculty'],
    pivot_index=['Faculty', 'Course', 'AY'],
).reset_index()

In [8]:
# Display the tidy NUS frame: one row per (Faculty, Course, AY) with GPA, Places and RP columns.
nus_pivoted

Unnamed: 0,Faculty,Course,AY,GPA,Places,RP
0,College of Design & Engineering,Architecture*,2008/2009,#,171,BCC/C (70)
1,College of Design & Engineering,Architecture*,2009/2010,3.4,159,BBB/C (75)
2,College of Design & Engineering,Architecture*,2010/2011,3.42,148,BBB/B (76.25)
3,College of Design & Engineering,Architecture*,2011/2012,3.58,162,BBB/C (75)
4,College of Design & Engineering,Architecture*,2012/2013,3.83,135,ABB/B (78.75)
...,...,...,...,...,...,...
651,School of Medicine,Nursing*,2019/2020,3.33,311,CCD/B (66.25)
652,School of Medicine,Nursing*,2020/2021,3.25,293,CCD/B (66.25)
653,School of Medicine,Nursing*,2021/2022,3.41,338,CCC/C (67.5)
654,School of Medicine,Nursing*,2022/2023,3.42,314,CCD/B (66.25)


In [None]:
# NOTE(review): this cell has no execution count, and the saved output below is
# identical to the `nus_pivoted` display (it has Faculty/AY columns), so it does
# not appear to come from `smu_igp_df` — re-run the cell to refresh the output.
smu_igp_df

Unnamed: 0,Faculty,Course,AY,GPA,Places,RP
0,College of Design & Engineering,Architecture*,2008/2009,#,171,BCC/C (70)
1,College of Design & Engineering,Architecture*,2009/2010,3.4,159,BBB/C (75)
2,College of Design & Engineering,Architecture*,2010/2011,3.42,148,BBB/B (76.25)
3,College of Design & Engineering,Architecture*,2011/2012,3.58,162,BBB/C (75)
4,College of Design & Engineering,Architecture*,2012/2013,3.83,135,ABB/B (78.75)
...,...,...,...,...,...,...
651,School of Medicine,Nursing*,2019/2020,3.33,311,CCD/B (66.25)
652,School of Medicine,Nursing*,2020/2021,3.25,293,CCD/B (66.25)
653,School of Medicine,Nursing*,2021/2022,3.41,338,CCC/C (67.5)
654,School of Medicine,Nursing*,2022/2023,3.42,314,CCD/B (66.25)
