![img](images/plan.jpg)

In [318]:
import re
import itertools
from functools import reduce, partial

import numpy as np
from scipy import stats
import scipy
import pandas as pd
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio

# Config: dark template for every plotly figure, larger DataFrame display limits,
# and plotly as the plotting backend of pandas.
pio.templates.default = "plotly_dark"

for _option, _value in (
    ("display.max_rows", 500),
    ("display.max_columns", 50),
    ("plotting.backend", "plotly"),
):
    pd.set_option(_option, _value)

# TODO: Backtesting

# Data

In [91]:

# Read the CSV and discard the two columns at positions 3 and 4
# (what those columns hold is not visible from this notebook).
data = pd.read_csv("data.csv")
data_filtered = data.drop(data.columns[3:5], axis="columns")

# Course codes; this order is relied on further down to label the reshaped frame.
courses = [
    "ENGLISH100", "ARB100", "MATH101", "CHEM101", "STAT101", "TECH101",
    "ENTREPRENEUR101", "FAJAB101", "NAHAJ101", "ENGLISH110",
    "SALAM107", "PHYS104", "MATH106", "CSC111", "MATH151", "SALAM108",
    "CSC113", "CSC220", "MATH244",
    "CSC212", "CSC215", "MATH281", "CSC304", "CSC380", "CSC227",
    "CSC311", "CSC339", "CSC343", "CSC361",
    "CSC329", "CSC340", "CSC453", "CSC496", "PHYS210", "PHYS103", "CHEM103",
    "CSC443", "CSC462", "CSC476", "CSC478", "CSC484", "CSC489",
]
# Criteria each course is rated on.
criteria = [
    "Application",
    "Relevance",
    "Insight",
    "Understanding",
    "Ease",
]
# The integers 1 through 5.
ratings = [1, 2, 3, 4, 5]

In [92]:
@np.vectorize
def course_to_category(course):
    """Map a course code such as "CSC111" to the name of its category.

    The category is looked up from the first run of ASCII letters in the code.
    Because of `np.vectorize`, the function accepts a single code or an
    array-like of codes and returns a NumPy array (0-d for a single code).
    Raises KeyError for a prefix that has no category.
    """
    category_to_prefixes = {
        "Humanities": ("ENGLISH", "NAHAJ", "ARB", "ENTREPRENEUR", "FAJAB", "TECH"),
        "Physics": ("PHYS",),
        "Chemistry": ("CHEM",),
        "Islamics": ("SALAM",),
        "Mathematics": ("MATH", "STAT"),
        "Computer science": ("CSC",),
    }
    prefix_to_category = {
        prefix: category
        for category, prefixes in category_to_prefixes.items()
        for prefix in prefixes
    }
    letter_runs = re.findall("[a-zA-Z]+", course)
    return prefix_to_category[letter_runs[0]]

# Category of every course, in the same order as `courses`.
categories = pd.Series(course_to_category(courses))
# Guard against missing values. Note that an unknown prefix already raises a
# KeyError inside course_to_category, before this check is reached.
assert not categories.isna().any(), "prefix_to_category must cover each course prefix"

# Wrangling

In [93]:
def df_by_regex(regex):
    """Return the columns of `data_filtered` whose label contains a match for `regex`.

    Caution: this reads the module-level `data_filtered` frame rather than
    taking the frame as an argument.
    """
    matching_columns = data_filtered.filter(regex=regex, axis="columns")
    return matching_columns

def get_subject_df(subject):
    """Return the columns whose label contains `subject` followed by a bracketed part.

    `subject` is interpolated into the regular expression as-is (an empty string
    matches every bracketed column). The selection is delegated to `df_by_regex`,
    so this also depends on the module-level `data_filtered` frame.
    """
    # Raw f-string: `\[` and `\]` are regex escapes; in a non-raw literal they are
    # invalid string escape sequences that Python warns about.
    return df_by_regex(rf"{subject}.*\[.*\].*")

In [94]:
# Every (course, criterion) pair, generated criterion by criterion ...
index = [(course, criterion) for criterion, course in itertools.product(criteria, courses)]
# ... then regrouped course by course. The sort is stable, so the criteria keep
# their original order within each course.
sorted_index = sorted(index, key=lambda pair: courses.index(pair[0]))
# Transpose so every bracketed column becomes a row, and label the rows positionally.
# NOTE(review): this assumes the CSV columns are ordered course by course with the
# criteria in the order of `criteria` - confirm against data.csv.
df = get_subject_df("").T.set_index(pd.MultiIndex.from_tuples(sorted_index))

# Utility

In [418]:
def remove_double_naming_in_legend(fig):
    """Shorten every trace name of `fig` to the part before its first comma.

    The traces are modified in place; nothing is returned.
    """
    for trace in fig.select_traces():
        leading_part, _, _ = trace.name.partition(",")
        trace.name = leading_part

def get_course_to_num_ranking_series(df, rows_per_course=5):
    """Count the filled-in (non-NaN) ratings of every course.

    df: frame whose row labels are (course, criterion) tuples, with the rows of
        one course stored next to each other
    rows_per_course: number of consecutive rows belonging to one course; only the
        first row of each group is counted (defaults to 5)
    returns: Series mapping course -> number of non-NaN values in that first row
    raises: ValueError if rows_per_course is smaller than 1
    """
    if rows_per_course < 1:
        raise ValueError("rows_per_course must be at least 1")

    # First row of every course; `count` ignores NaN, like len(row.dropna()).
    first_row_counts = df.iloc[::rows_per_course].count(axis=1)
    return pd.Series({label[0]: int(count) for label, count in first_row_counts.items()})

## Ranking

### Course ranking by criteria

In [263]:
def get_scores_df(df, criteria, categories):
    """Build a criteria x courses table of Bayesian mean scores.

    df: frame indexed by (course, criterion)
    criteria: criteria to score; they become the row labels of the result
    categories: category of each course, passed through to `bayesian_mean`

    Note: the set of courses comes from the module-level `courses` list.
    """
    score_rows = []
    for criterion in criteria:
        course_keys = [(subject, criterion) for subject in courses]
        score_rows.append(bayesian_mean(df.loc[course_keys].T, categories))
    return pd.DataFrame(score_rows, index=criteria)

def bayesian_mean(course_to_criterion, categories):
    """Bayesian mean per course for a single criterion.

    course_to_criterion: frame with one column per (course, criterion) pair
    categories: category of each course, aligned by position with the columns
    returns: Series indexed by course
    """
    per_course_mean = course_to_criterion.mean().droplevel(1)

    # Replace the course labels with their category, then average per category.
    mean_table = per_course_mean.reset_index()
    mean_table["index"] = categories
    category_means = mean_table.groupby("index").mean()
    category_to_criterion_mean = category_means[0]

    estimate = partial(scores_to_bayesian_mean, category_to_criterion_mean=category_to_criterion_mean)
    return course_to_criterion.apply(estimate).droplevel(1)

def scores_to_bayesian_mean(scores, category_to_criterion_mean, category_weight_portion=1):
    """
    Shrink the mean of a course's scores towards the mean of its category.

    scores: Series of scores with Name formatted like this (Course, _); NaN entries are ignored
    category_to_criterion_mean: mean score per category, indexable by category name
    category_weight_portion: the portion of scores category average is weighted at. e.g. 1 means
        the category mean weighs as much as all scores together, 2 means half as much. The
        category mean always gets a weight of at least 1.
    returns bayesian estimation of mean of scores, given its category
    """
    scores = scores.dropna()
    category = course_to_category(scores.name[0]).item(0)
    category_mean = category_to_criterion_mean[category]
    category_weight = max(len(scores) // category_weight_portion, 1)

    # Values and weights must be in the same order: the category mean is the last
    # value, so its weight is the last weight (every individual score weighs 1).
    values = np.append(scores.to_numpy(dtype=float), category_mean)
    weights = np.append(np.ones(len(scores)), category_weight)

    # np.average divides by the sum of the weights, so no normalisation is needed.
    return np.average(values, weights=weights)

In [440]:
# Statistics shown per criterion. Each function receives the frame of one criterion
# (one column per (course, criterion) pair) and returns one value per course.
functions = [
    lambda x: bayesian_mean(x, categories),
    lambda x: pd.Series(np.nanmedian(x, axis=0), index=courses),
    lambda x: pd.Series(stats.mode(x, nan_policy="omit")[0].data[0], index=courses),
    lambda x: x.std().droplevel(1),
]
function_names = ["Bayesian estimated mean", "Median", "Mode", "Standard deviation"]
function_to_name = dict(zip(functions, function_names))

# Grid layout: every statistic gets its own band of rows, with the criteria laid
# out left to right, top to bottom inside the band.
cols = 2
rows_per_function = -(-len(criteria) // cols)  # ceiling division: 5 criteria over 2 columns -> 3 rows
rows = len(functions) * rows_per_function

# (function, criterion, zero-based row, zero-based column) of every subplot, computed
# once so that the titles and the traces cannot drift apart.
placements = [
    (function, criterion, f_i * rows_per_function + c_i // cols, c_i % cols)
    for f_i, function in enumerate(functions)
    for c_i, criterion in enumerate(criteria)
]

subtitles = np.empty((rows, cols), dtype="object")
for function, criterion, row, col in placements:
    subtitles[row, col] = f"{function_to_name.get(function)} {criterion} score for each course"

# Grid cells without a subplot get an empty title; a plain list of strings is passed.
fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[title or "" for title in subtitles.reshape(-1)],
)

for function, criterion, row, col in placements:
    course_to_criterion = function(df.loc[[(subject, criterion) for subject in courses]].T)

    scatter = px.scatter(course_to_criterion, color=categories, symbol=categories)
    for trace in scatter.select_traces():
        # Only the very first subplot contributes legend entries
        trace.showlegend = row == 0 and col == 0
        # plotly subplot coordinates are 1-based
        fig.add_trace(trace, row=row + 1, col=col + 1)

    fig.update_yaxes(
        title=f"{function_to_name.get(function)} {criterion}",
        row=row + 1, col=col + 1
    )

# Figure-wide clean-up, done once after all traces have been added.
remove_double_naming_in_legend(fig)
fig.update_layout(height=5000, width=2400)
fig

### Scores & weighted scores

In [265]:
# Criteria x courses table of Bayesian mean scores. The cells below read it as
# `scores_df`; `score_df` is kept as an alias for the name this cell used before.
scores_df = get_scores_df(df, criteria, categories)
score_df = scores_df

In [273]:
# Number of filled-in ratings per course, computed by the shared helper instead of
# a second copy of its loop.
course_to_num_ranking_series = get_course_to_num_ranking_series(df)
# Plain-dict view, kept because this cell used to define it.
course_to_num_ranking = course_to_num_ranking_series.to_dict()

In [447]:
# Overall ranking: sum of the criterion scores per course, ascending, divided by 25
# (presumably 5 criteria x a maximum rating of 5 - TODO confirm).
general_ranking = scores_df.sum().sort_values() / 25
sorted_categories = course_to_category(general_ranking.index)

fig = px.scatter(
    general_ranking,
    color=sorted_categories,
    symbol=sorted_categories,
    title="Course ranking (General)",
)
fig.update_xaxes(title="Course")
fig.update_yaxes(title="Ranking")

remove_double_naming_in_legend(fig)

# Background bars: number of ratings per course divided by the L2 norm of all counts.
# NOTE(review): the trace is labelled as a percentage, but values scaled by the L2
# norm do not sum to 1 - confirm whether dividing by the sum was intended.
scaled_num_rankings = course_to_num_ranking_series / np.linalg.norm(course_to_num_ranking_series)
fig.add_trace(
    go.Bar(
        x=course_to_num_ranking_series.index,
        y=scaled_num_rankings,
        marker=dict(color="rgba(100, 100, 100, 0.3)"),
        name="Percentage of data points",
    ),
)
fig.update_layout(height=1000, width=2000)

In [499]:
# Weight of each criterion per perspective; a criterion that is not listed counts as 0.
weights_real_world = dict(Application=1.5, Relevance=1, Understanding=0.2, Ease=0.5)
weights_academic = dict(Insight=1.5, Understanding=1.5)


def _weight_criterion_rows(frame, weights):
    """Multiply every criterion row of `frame` by its weight (0 when the criterion is absent)."""
    return frame.apply(lambda row: weights.get(row.name, 0) * row, axis=1)


scores_df_real_world = _weight_criterion_rows(scores_df, weights_real_world)
scores_df_academic = _weight_criterion_rows(scores_df, weights_academic)

# Total weighted score per course, scaled by the largest total.
real_world_totals = scores_df_real_world.sum()
academic_totals = scores_df_academic.sum()
standardized_scores_df_real_world = real_world_totals / max(real_world_totals)
standardized_scores_df_academic = academic_totals / max(academic_totals)

In [500]:
# One row per course, one column per perspective.
weighted_scores = pd.DataFrame(
    {
        "Real-world score": standardized_scores_df_real_world,
        "Academic score": standardized_scores_df_academic,
    }
)

In [533]:
# Scatter of real-world score against academic score, one labelled point per course.
course_labels = weighted_scores.index
# Despite the name, these categories simply follow the row order of `weighted_scores`.
sorted_categories = course_to_category(course_labels)

fig = px.scatter(
    weighted_scores,
    x="Real-world score",
    y="Academic score",
    color=sorted_categories,
    symbol=sorted_categories,
    title="Course ranking (Real-world Vs. Academic)",
    hover_name=course_labels,
    text=course_labels,
)

remove_double_naming_in_legend(fig)

fig.update_traces(textposition="top center")
fig.update_layout(height=1000, width=1600)
fig

## Meta-stats

In [431]:
# For every criterion: count each rating value per matching column, then add the
# counts up across those columns.
# `pd.Series.value_counts` replaces the deprecated top-level `pd.value_counts`.
rating_counts_per_criteria = [
    df_by_regex(f".*{criterion}.*").apply(pd.Series.value_counts).sum(axis=1)
    for criterion in criteria
]
# One row per rating value, one column per criterion.
rating_counts_df = pd.DataFrame(rating_counts_per_criteria).T
rating_counts_df.columns = criteria

In [432]:
# Display the counts: one row per rating value, one column per criterion.
rating_counts_df

Unnamed: 0,Application,Relevance,Insight,Understanding,Ease
1.0,40.0,35.0,51.0,36.0,28.0
2.0,30.0,31.0,30.0,26.0,28.0
3.0,40.0,42.0,40.0,38.0,45.0
4.0,26.0,36.0,39.0,44.0,45.0
5.0,86.0,78.0,62.0,78.0,76.0


In [433]:
# For every criterion, join the rows of all matching columns into one flat array of
# raw values (NaN included).
rating_counts_raw_criteria = [
    np.concatenate(df_by_regex(f".*{criterion}.*").to_numpy())
    for criterion in criteria
]
# One column per criterion.
rating_counts_raw_df = pd.DataFrame(rating_counts_raw_criteria).T.set_axis(criteria, axis=1)

In [434]:
# Violin plot of the raw rating values, one violin per criterion column.
px.violin(data_frame=rating_counts_raw_df)