In [267]:
import re
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Config
# Render every Plotly figure with the dark template.
pio.templates.default = "plotly_dark"

# Raise the display limits for large frames and route pandas' ``.plot``
# accessor through Plotly.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50
pd.options.plotting.backend = "plotly"

# TODO: Backtesting

# Data

In [268]:

# Load the raw export, then drop the two columns at positions 3 and 4.
data = pd.read_csv("data.csv")
data_filtered = data.drop(data.columns[3:5], axis="columns")

# Course codes; the order matters because row labels are built from it later on.
courses = [
    "ENGLISH100", "ARB100", "MATH101", "CHEM101", "STAT101", "TECH101",
    "ENTREPRENEUR101", "FAJAB101", "NAHAJ101", "ENGLISH110", "SALAM107", "PHYS104",
    "MATH106", "CSC111", "MATH151", "SALAM108", "CSC113", "CSC220",
    "MATH244", "CSC212", "CSC215", "MATH281", "CSC304", "CSC380",
    "CSC227", "CSC311", "CSC339", "CSC343", "CSC361", "CSC329",
    "CSC340", "CSC453", "CSC496", "PHYS210", "PHYS103", "CHEM103",
    "CSC443", "CSC462", "CSC476", "CSC478", "CSC484", "CSC489",
]

# The criteria each course is rated on, and the rating scale (1 to 5 inclusive).
criteria = [
    "Application",
    "Relevance",
    "Insight",
    "Understanding",
    "Ease",
]
ratings = [1, 2, 3, 4, 5]

In [460]:
# Alphabetic course-code prefix -> subject category. Built once here instead of on
# every element that np.vectorize feeds through course_to_category.
PREFIX_TO_CATEGORY = {
    "ENGLISH": "Humanities",
    "NAHAJ": "Humanities",
    "ARB": "Humanities",
    "ENTREPRENEUR": "Humanities",
    "FAJAB": "Humanities",
    "TECH": "Humanities",
    "PHYS": "Physics",
    "CHEM": "Chemistry",
    "SALAM": "Islamics",
    "MATH": "Mathematics",
    "STAT": "Mathematics",
    "CSC": "Computer science",
}


@np.vectorize
def course_to_category(course):
    """
    Map a course code to its subject category.

    course: course code such as "CSC111"; its first run of ASCII letters is the prefix
        looked up in PREFIX_TO_CATEGORY
    returns the category name. Because of np.vectorize the call returns a numpy array
        (0-d for a single code, so use .item(0) to get the plain string).
    raises KeyError when the prefix has no category, IndexError when the code has no letters
    """
    prefix = re.findall("[a-zA-Z]+", course)[0]
    try:
        return PREFIX_TO_CATEGORY[prefix]
    except KeyError:
        # Same exception type as a plain dict lookup, but naming the offending course.
        raise KeyError(
            f"no category for prefix {prefix!r} of course {course!r}; add it to PREFIX_TO_CATEGORY"
        ) from None

# One category label per entry of `courses`, in the same order.
categories = pd.Series(course_to_category(courses))
assert not categories.isna().any(), "prefix_to_category must cover each course prefix"

# Wrangling

In [270]:
def df_by_regex(regex, frame=None):
    """
    Select the columns whose label matches a regular expression.

    regex: pattern passed to DataFrame.filter(regex=...), which keeps a column when the
        pattern is found anywhere in its label
    frame: DataFrame to select from. When omitted, the notebook-level `data_filtered`
        is used (looked up at call time), so existing calls keep working.
    returns a DataFrame with only the matching columns
    """
    if frame is None:
        frame = data_filtered
    return frame.filter(regex=regex, axis=1)

def get_subject_df(subject):
    """
    Select the columns for one subject via df_by_regex.

    subject: text that must appear in the column label before a square-bracketed part.
        It is inserted into the pattern as-is, so regex metacharacters in it are
        interpreted; an empty string matches every label containing a bracketed part.
    returns whatever df_by_regex returns for that pattern (columns of the global frame)
    """
    # Raw f-string: the backslashes belong to the regex (literal brackets), and a
    # non-raw string would trigger invalid-escape-sequence warnings.
    return df_by_regex(rf"{subject}.*\[.*\].*")

In [271]:
# Row labels for the transposed answers: one (course, criterion) tuple per row.
# The tuples are generated criterion by criterion, then stably sorted by the course's
# position in `courses`, which groups them per course with the criteria kept in
# their `criteria` order.
index = [(course, criterion) for criterion in criteria for course in courses]
sorted_index = list(index)
sorted_index.sort(key=lambda pair: courses.index(pair[0]))

# Transpose so the selected columns become rows, then label those rows with the tuples.
df = get_subject_df("").transpose().set_index(pd.Index(sorted_index))

## Ranking

### Which course is [max, min, controversial] based on [criteria, weights]?

In [590]:
def scores_to_bayesian_mean(scores, category_to_application_mean, category_weight_portion=1):
    """
    Shrink the mean of a course's scores toward the mean of the course's category.

    scores: Series of scores whose name is a tuple starting with the course code, i.e.
        (Course, _). Missing values are dropped before averaging.
    category_to_application_mean: mapping (dict or Series) from category name to the mean
        score of that category
    category_weight_portion: divisor applied to the number of scores to obtain the weight of
        the category mean. 1 weighs the category mean as much as all scores together, 2 half
        as much; the weight never drops below 1.
    returns the weighted average of the scores (weight 1 each) and the category mean; with no
        scores at all this is the category mean itself
    """
    observed = scores.dropna()
    category = course_to_category(observed.name[0]).item(0)
    category_mean = category_to_application_mean[category]
    category_weight = max(len(observed) // category_weight_portion, 1)

    # Values and weights must line up: the scores come first, the category mean last.
    # np.average divides by the sum of the weights, so no separate normalisation is needed.
    values = np.append(np.asarray(observed, dtype=float), category_mean)
    weights = np.append(np.ones(len(observed)), category_weight)
    return np.average(values, weights=weights)


In [591]:
# Application rows of `df`, transposed so that every course is one column.
application_keys = [(course, "Application") for course in courses]
application_scores = df.loc[application_keys].T

# Plain mean Application score per course (NaNs are skipped by mean()).
course_to_application_mean = application_scores.mean().droplevel(1)

# Swap the course labels for their categories and average per category.
temp = course_to_application_mean.reset_index().assign(index=categories)
category_to_application_mean = temp.groupby("index")[0].mean()

# Per-course estimate computed by scores_to_bayesian_mean from the category means.
bayesian_estimate_course_to_application_mean = application_scores.apply(
    lambda column: scores_to_bayesian_mean(column, category_to_application_mean)
).droplevel(1)

# Scatter of the Bayesian-estimated Application mean per course, with colour and
# marker symbol taken from `categories`.
fig = px.scatter(
    bayesian_estimate_course_to_application_mean,
    color=categories,
    symbol=categories,
).update_layout(
    title="Course Vs. Bayesian estimated mean Applications score",
    xaxis_title="Course",
    yaxis_title="Application score (1-5)",
    legend_title="",
)
fig

In [589]:
# Scatter of the plain mean Application score per course, with colour and marker
# symbol taken from `categories`.
fig = px.scatter(
    course_to_application_mean,
    color=categories,
    symbol=categories,
).update_layout(
    title="Course Vs. mean Applications score",
    xaxis_title="Course",
    yaxis_title="Application score (1-5)",
    legend_title="",
)
fig

## Meta-stats

In [None]:
# Count the filled-in (non-NaN) rankings per course. Each course occupies
# len(criteria) consecutive rows of `df`; only the first of those rows is counted,
# so the stride comes from `criteria` rather than a hard-coded 5.
first_criterion_rows = df.iloc[::len(criteria)]

# count(axis=1) gives the number of non-NaN cells per row; dropping the criterion
# level leaves the course code as the only label.
course_to_num_ranking_series = first_criterion_rows.count(axis=1).droplevel(1)
course_to_num_ranking = course_to_num_ranking_series.to_dict()

In [None]:
# Number of counted rankings per course; colour and symbol come from `categories`,
# and hovering a point shows the course code.
px.scatter(
    data_frame=course_to_num_ranking_series,
    color=categories,
    symbol=categories,
    hover_name=course_to_num_ranking_series.index,
)

In [None]:
# For every criterion: count how often each rating value occurs in each matching
# column, then add the per-column counts together. pd.Series.value_counts is used
# because the top-level pd.value_counts helper is deprecated.
rating_counts_per_criteria = [
    df_by_regex(f".*{criterion}.*").apply(pd.Series.value_counts).sum(axis=1)
    for criterion in criteria
]

# One column per criterion, one row per rating value.
rating_counts_df = pd.DataFrame(rating_counts_per_criteria).T
rating_counts_df.columns = criteria

In [None]:
# Show the table of rating counts: one row per rating value, one column per criterion.
rating_counts_df

Unnamed: 0,Application,Relevance,Insight,Understanding,Ease
1.0,40.0,35.0,51.0,36.0,28.0
2.0,30.0,31.0,30.0,26.0,28.0
3.0,40.0,42.0,40.0,38.0,45.0
4.0,26.0,36.0,39.0,44.0,45.0
5.0,86.0,78.0,62.0,78.0,76.0


In [None]:
# For every criterion, flatten all matching columns into one long array of raw
# ratings (np.concatenate joins the rows of the 2-D value array end to end).
rating_counts_raw_criteria = [
    np.concatenate(df_by_regex(f".*{criterion}.*").values)
    for criterion in criteria
]

# One column of raw ratings per criterion.
rating_counts_raw_df = pd.DataFrame(rating_counts_raw_criteria).T
rating_counts_raw_df.columns = criteria

In [None]:
# Violin plot of the raw ratings held in `rating_counts_raw_df` (one column per criterion).
px.violin(data_frame=rating_counts_raw_df)

In [None]:
# Histogram over every column whose label contains "Application".
px.histogram(data_frame=df_by_regex(".*Application.*"))

In [None]:
# Histogram of all ratings from the columns whose label contains "Relevance",
# flattened into a single array.
px.histogram(
    data_frame=np.concatenate(df_by_regex(".*Relevance.*").values),
)

In [None]:
# `df` holds the transposed answers with (course, criterion) tuples as row labels,
# so the MATH101 Application answers are a row selected with .loc; a string column
# lookup on `df` raises KeyError.
px.histogram(df.loc[("MATH101", "Application")])