In [10]:
from bs4 import BeautifulSoup
import csv
import requests, re
import pandas as pd

# Get Tables

Fetch the published IGP spreadsheet and parse it into HTML table objects, one per score table.

In [11]:
# igp_link = "https://docs.google.com/spreadsheets/d/1MPEDZpw26TjN7dTsQzsbnXHZa47og0qSrdHrlT7nLKc/pubhtml#"
# html = requests.get(f"{igp_link}").text
# soup = BeautifulSoup(html, "html.parser")
# tables = soup.find_all("table")
# index = 0
# # for table in tables:
# #     with open(str(index) + ".csv", "w") as f:
# #         wr = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
# #         wr.writerows([[td.text for td in row.find_all("td")] for row in table.find_all("tr")])
# #     index = index + 1


# Extract IGP Data

Extracting the raw IGP tables from the spreadsheet link

In [12]:
# nus_igp_df = pd.read_html(str(tables[1]))
# # pd.read_html returns a list of DataFrames; here it holds exactly ONE element, so unwrap it
# nus_igp_df = nus_igp_df[0]

In [13]:
# nus_igp_df.head()

In [14]:
# ## NTU Table
# ntu_igp_df = pd.read_html(str(tables[2]))
# # pd.read_html returns a list of DataFrames; here it holds exactly ONE element, so unwrap it
# ntu_igp_df = ntu_igp_df[0]

In [15]:
# # SMU Table
# smu_igp_df = pd.read_html(str(tables[3]))
# # pd.read_html returns a list of DataFrames; here it holds exactly ONE element, so unwrap it
# smu_igp_df = smu_igp_df[0]

In [16]:
# # Dump all to /data/ folder. Will do adjustment in google sheets
# nus_igp_df.to_csv("../data/raw_scraped/nus_igp_raw.csv", index=False)
# ntu_igp_df.to_csv("../data/raw_scraped/ntu_igp_raw.csv", index=False)
# smu_igp_df.to_csv("../data/raw_scraped/smu_igp_raw.csv", index=False)

# Pivoting

The scraped tables above were uploaded to Google Sheets, where they were tidied up manually and selected courses (e.g. those lacking sufficient data) were removed.

Moving on to pivoting the data into a [**tidy format**](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). 

Ultimately, each row should be identified by a Course–AY (academic year) pair and should carry the following fields:
- Faculty
- RP (10th Percentile Rank Points)
- GPA (10th Percentile GPA)
- Places 

In [18]:
# Load the manually tidied workbooks; only the first sheet of each file is read
ntu_processed = pd.read_excel(
    "../../data/scraped_processed/ntu_igp_raw.xlsx",
    sheet_name=0,
)
nus_processed = pd.read_excel(
    "../../data/scraped_processed/nus_igp_raw.xlsx",
    sheet_name=0,
)
smu_processed = pd.read_excel(
    "../../data/scraped_processed/smu_igp_raw.xlsx",
    sheet_name=0,
)

# Course_old records each course name as it appeared in the original
# spreadsheet (kept to track renames); it is not needed from here on
ntu_processed = ntu_processed.drop(columns='Course_old')
smu_processed = smu_processed.drop(columns='Course_old')
nus_processed = nus_processed.drop(columns='Course_old')

In [19]:
# Toy example: reshape a single SMU course (Accountancy) into tidy format.
# smu_processed carries a 'Faculty' column (it is used as an id column when the
# full SMU table is pivoted), so it must be listed as an id column here too;
# otherwise it lands in value_vars and is melted as if it were an academic year.
toy_id_vars = ['Course', 'Type', 'Faculty']
tmp = smu_processed.loc[smu_processed.Course=='Accountancy']
# Pivot wide to long: every non-id column becomes an (AY, value) pair
tmp = pd.melt(tmp
        , id_vars=toy_id_vars
        , value_vars=tmp.columns.difference(toy_id_vars)
        , value_name='value'
        , var_name='AY'
        )
# Pivot long to wide into tidy format: one column per distinct 'Type'
tmp = tmp.pivot(index = ['Course', 'AY'], columns='Type', values='value')
tmp.columns.name=None # Drop the leftover 'Type' label on the column axis
# tmp=tmp.reset_index()

In [20]:
def pivot_processed(df, id_vars, pivot_index, type_col='Type', var_name='AY', value_name='value'):
    """Reshape a wide per-year table into tidy form.

    Every column of ``df`` that is not listed in ``id_vars`` is melted into a
    (``var_name``, ``value_name``) pair, and the result is then pivoted so that
    each distinct entry of ``type_col`` becomes its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table; it is not modified.
    id_vars : list of str
        Identifier columns to keep while melting. Must include ``type_col``.
    pivot_index : list of str
        Columns (from ``id_vars`` and/or ``var_name``) that index the result.
    type_col : str, default 'Type'
        Column whose distinct values become the output columns.
    var_name : str, default 'AY'
        Name given to the column holding the melted column labels.
    value_name : str, default 'value'
        Name of the intermediate column holding the melted cell values.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``pivot_index`` with one column per ``type_col`` value
        and an unnamed column axis.

    Notes
    -----
    ``DataFrame.pivot`` raises ``ValueError`` when a combination of
    ``pivot_index`` and ``type_col`` occurs more than once.
    """
    id_vars = list(id_vars)
    # Everything that is not an identifier is treated as a per-year column
    year_cols = list(df.columns.difference(id_vars))
    # melt returns a new frame, so the caller's df is left untouched
    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=year_cols,
        var_name=var_name,
        value_name=value_name,
    )
    tidy = long_df.pivot(index=pivot_index, columns=type_col, values=value_name)
    tidy.columns.name = None  # drop the leftover type_col label on the column axis
    return tidy

In [21]:
# SMU: tidy to one row per Faculty/Course/AY, then move the index levels back into columns
smu_pivoted = pivot_processed(
    smu_processed,
    id_vars=['Course', 'Type', 'Faculty'],
    pivot_index=['Faculty', 'Course', 'AY'],
).reset_index()
# NTU: same id columns and index as SMU
ntu_pivoted = pivot_processed(
    ntu_processed,
    id_vars=['Course', 'Type', 'Faculty'],
    pivot_index=['Faculty', 'Course', 'AY'],
).reset_index(drop=False)
# NUS: same id columns and index as SMU
nus_pivoted = pivot_processed(
    nus_processed,
    id_vars=['Course', 'Type', 'Faculty'],
    pivot_index=['Faculty', 'Course', 'AY'],
).reset_index(drop=False)

In [22]:
# Preview the first five tidy NUS rows
nus_pivoted.head(n=5)

Unnamed: 0,Faculty,Course,AY,GPA,Places,RP
0,College of Design & Engineering,Architecture,2008/2009,#,171,BCC/C (70)
1,College of Design & Engineering,Architecture,2009/2010,3.4,159,BBB/C (75)
2,College of Design & Engineering,Architecture,2010/2011,3.42,148,BBB/B (76.25)
3,College of Design & Engineering,Architecture,2011/2012,3.58,162,BBB/C (75)
4,College of Design & Engineering,Architecture,2012/2013,3.83,135,ABB/B (78.75)


In [23]:
# Combine all the records and export
nus_pivoted['University']='NUS'
ntu_pivoted['University']='NTU'
smu_pivoted['University']='SMU'

master_pivoted = pd.concat([nus_pivoted, ntu_pivoted, smu_pivoted], ignore_index=True)
master_pivoted['year'] = master_pivoted['AY'].str.extract(r"^(\d{4})").astype(int)

In [26]:
# Put the identifying columns first, then the metrics, with the raw AY label last
col_order = [
    'University',
    'Faculty',
    'Course',
    'year',
    'GPA',
    'Places',
    'RP',
    'AY',
]
master_pivoted = master_pivoted.loc[:, col_order]

The table below is the pivoted, tidied-up version of the IGP data, with one row per university, course and academic year.

In [27]:
# Preview the first ten rows of the combined tidy table
master_pivoted.head(n=10)

Unnamed: 0,University,Faculty,Course,year,GPA,Places,RP,AY
0,NUS,College of Design & Engineering,Architecture,2008,#,171,BCC/C (70),2008/2009
1,NUS,College of Design & Engineering,Architecture,2009,3.4,159,BBB/C (75),2009/2010
2,NUS,College of Design & Engineering,Architecture,2010,3.42,148,BBB/B (76.25),2010/2011
3,NUS,College of Design & Engineering,Architecture,2011,3.58,162,BBB/C (75),2011/2012
4,NUS,College of Design & Engineering,Architecture,2012,3.83,135,ABB/B (78.75),2012/2013
5,NUS,College of Design & Engineering,Architecture,2013,3.69,146,AAB/C (80),2013/2014
6,NUS,College of Design & Engineering,Architecture,2014,3.76,151,AAB/C (80),2014/2015
7,NUS,College of Design & Engineering,Architecture,2015,3.72,149,ABB/B (78.75),2015/2016
8,NUS,College of Design & Engineering,Architecture,2016,3.77,149,AAB/C (80),2016/2017
9,NUS,College of Design & Engineering,Architecture,2017,3.78,148,ABB/B (78.75),2017/2018


In [11]:
# master_pivoted.to_excel("../data/igp_tidy.xlsx", index=False)