In [154]:
import re
import operator
import itertools
from functools import reduce, partial

import numpy as np
from scipy import stats
import scipy
import pandas as pd
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio

# Config
# Dark theme for every plotly figure created below.
pio.templates.default = "plotly_dark"

# pandas display limits, plus plotly as the backend for DataFrame/Series `.plot`.
for _option_name, _option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 50),
    ("plotting.backend", "plotly"),
):
    pd.set_option(_option_name, _option_value)

# TODO: Backtesting

# Data

In [155]:
# Raw survey export, plus one view per value of the "College branch" column.
data = pd.read_csv("data.csv")
_by_branch = data.groupby("College branch")
data_male = _by_branch.get_group("Males")
data_female = _by_branch.get_group("Females")


def _without_columns_3_and_4(frame):
    """Return `frame` without the two columns at positions 3 and 4."""
    return frame.drop(columns=frame.columns[3:5])


# NOTE(review): the dropped columns are selected by position only; confirm
# against data.csv which fields they are.
data_filtered = _without_columns_3_and_4(data)
data_male_filtered = _without_columns_3_and_4(data_male)
data_female_filtered = _without_columns_3_and_4(data_female)

# Course codes. The order matters: later cells pair this list positionally
# with the survey columns.
courses = [
    "ENGLISH100", "ARB100", "MATH101", "CHEM101", "STAT101", "TECH101",
    "ENTREPRENEUR101", "FAJAB101", "NAHAJ101", "ENGLISH110",
    "SALAM107", "PHYS104", "MATH106", "CSC111", "MATH151", "SALAM108",
    "CSC113", "CSC220", "MATH244",
    "CSC212", "CSC215", "MATH281", "CSC304", "CSC380", "CSC227", "CSC311",
    "CSC339", "CSC343", "CSC361",
    "CSC329", "CSC340", "CSC453", "CSC496", "PHYS210", "PHYS103", "CHEM103",
    "CSC443", "CSC462", "CSC476", "CSC478", "CSC484", "CSC489",
]
# Criteria each course is rated on.
criteria = ["Application", "Relevance", "Insight", "Understanding", "Ease"]
# Possible rating values.
ratings = [1, 2, 3, 4, 5]

# Free-text answers that carry no information and are excluded from the comments.
filtered_comments = ["I do not take this course. "]
# Default `category_weight_portion` for scores_to_bayesian_mean.
bayes_weight = 2

# Utility

In [156]:
@np.vectorize
def course_to_category(course):
    """
    Map a course code to its category name.

    The category is chosen by the first run of ASCII letters found in `course`.
    Because of `np.vectorize`, the function accepts a scalar or an array-like of
    codes and returns a numpy array (0-d for a scalar input).

    Raises IndexError when `course` contains no letters and KeyError when the
    letter prefix has no category.
    """
    humanities_prefixes = ("ENGLISH", "NAHAJ", "ARB", "ENTREPRENEUR", "FAJAB", "TECH")
    category_by_prefix = dict.fromkeys(humanities_prefixes, "Humanities")
    category_by_prefix.update({
        "PHYS": "Physics",
        "CHEM": "Chemistry",
        "SALAM": "Islamics",
        "MATH": "Mathematics",
        "STAT": "Mathematics",
        "CSC": "Computer science",
    })
    letter_runs = re.findall("[a-zA-Z]+", course)
    return category_by_prefix[letter_runs[0]]

def df_by_regex(df, regex):
    """Return the sub-frame of `df` holding only the columns whose label matches `regex`."""
    matching_columns = df.filter(axis="columns", regex=regex)
    return matching_columns

def get_subject_df(df, subject):
    """
    Return the columns of `df` whose label contains `subject` followed, later in
    the label, by a bracketed part (`[...]`).

    `subject` is inserted into the pattern verbatim, so it is interpreted as a
    regular expression; an empty string selects every column with a bracketed part.
    """
    # Raw f-string: `\[` / `\]` are regex escapes, not string escapes. The
    # pattern is unchanged, but the invalid-escape warning is gone.
    return df_by_regex(df, rf"{subject}.*\[.*\].*")

def remove_double_naming_in_legend(fig):
    """
    Shorten every trace name of `fig` to the text before its first comma.

    The figure is modified in place. Traces without a name (None or empty) are
    left untouched instead of raising AttributeError.
    """
    for trace in fig.select_traces():
        if trace.name:
            trace.name = trace.name.split(",")[0]

def get_course_to_num_ranking_series(df, rows_per_course=5):
    """
    Count the filled-in (non-NaN) rankings of each course.

    df: frame whose row labels are tuples starting with the course, laid out as
        consecutive groups of `rows_per_course` rows per course
    rows_per_course: number of consecutive rows belonging to one course
        (defaults to 5, the stride the notebook has always used)
    returns Series mapping course -> number of non-NaN entries in the FIRST row
        of that course's group (the other rows of the group are not inspected)
    """
    return pd.Series({
        df.index[position][0]: df.iloc[position].count()
        for position in range(0, len(df), rows_per_course)
    })

def get_scores_df(df, criteria, categories):
    """
    Build a criteria x courses table of Bayesian-estimated mean scores.

    df: ratings frame whose rows are labelled (course, criterion)
    criteria: criteria to score; they become the row labels of the result
    categories: category of each course, forwarded to bayesian_mean
    Note: the courses are taken from the module-level `courses` list, not from `df`.
    """
    per_criterion_scores = []
    for criterion in criteria:
        criterion_rows = df.loc[[(subject, criterion) for subject in courses]]
        per_criterion_scores.append(bayesian_mean(criterion_rows.T, categories))
    return pd.DataFrame(per_criterion_scores, index=criteria)

def bayesian_mean(course_to_criterion, categories):
    """
    Bayesian-estimated mean of every column of `course_to_criterion`.

    course_to_criterion: frame with one column per (course, criterion) pair
    categories: category of each course, positionally aligned with the columns
    returns Series indexed by course (the second column level is dropped)
    """
    per_course_mean = course_to_criterion.mean().droplevel(1)

    # Swap the course labels for their categories, then average the course
    # means inside each category (column 0 holds the means after reset_index).
    mean_with_category = per_course_mean.reset_index()
    mean_with_category["index"] = categories
    category_to_criterion_mean = mean_with_category.groupby("index").mean()[0]

    def estimate(course_scores):
        return scores_to_bayesian_mean(course_scores, category_to_criterion_mean)

    return course_to_criterion.apply(estimate).droplevel(1)

def scores_to_bayesian_mean(scores, category_to_criterion_mean, category_weight_portion=bayes_weight):
    """
    Estimate the mean of `scores`, pulled toward the mean of the course's category.

    scores: Series of scores with Name formatted like this (Course, _); NaN entries are ignored
    category_to_criterion_mean: mapping from category name to the mean score of that category
    category_weight_portion: the portion of scores category average is weighted at. e.g. 1 means its just as weighty as scores, 2 means half as weighty
    returns bayesian estimation of mean of scores, given its category
    """
    scores = scores.dropna()
    category = course_to_category(scores.name[0]).item(0)
    category_mean = category_to_criterion_mean[category]

    # The category mean counts as len(scores) // category_weight_portion
    # observations (at least one); every real score counts once.
    category_weight = max(len(scores) // category_weight_portion, 1)

    # Values and weights are built in the SAME order (scores first, category
    # mean last). Previously the category weight came first while the category
    # mean was appended last, so the weight landed on the first score.
    values = list(scores) + [category_mean]
    weights = [1] * len(scores) + [category_weight]

    # np.average divides by the sum of the weights, so no separate
    # normalisation step is needed. Plain lists also avoid Series.append,
    # which no longer exists in pandas 2.x.
    return np.average(values, weights=weights)


# Wrangling

In [157]:
# Category of every course, in the same order as `courses`.
categories = pd.Series(course_to_category(courses))
assert not categories.isna().any(), "prefix_to_category must cover each course prefix"

# Dataframes
# Criterion-major list of (course, criterion) labels.
index = [(course, criterion) for criterion in criteria for course in courses]
# Course-major ordering: every course followed by each of its criteria in turn.
sorted_index = list(itertools.product(courses, criteria))


def _ratings_by_course_and_criterion(frame):
    """Transpose the bracketed survey columns of `frame` and label the rows with `sorted_index`."""
    # NOTE(review): the labels are assigned purely by position, which assumes
    # the survey columns come course by course in `courses` order with one
    # column per criterion. Confirm against data.csv.
    return get_subject_df(frame, "").T.set_index(pd.Index(sorted_index))


df = _ratings_by_course_and_criterion(data_filtered)
df_male = _ratings_by_course_and_criterion(data_male_filtered)
df_female = _ratings_by_course_and_criterion(data_female_filtered)

# Comments
# Map each course to its free-text comments, skipping missing answers and the
# boilerplate answers listed in `filtered_comments`.
course_to_comments = {}
other_details_df = df_by_regex(data_filtered, "Other details you want to add.*")
# NOTE(review): courses and comment columns are paired by position; this
# assumes one "Other details" column per course, in `courses` order.
for course, other_details_col in zip(courses, other_details_df):
    answers = other_details_df[other_details_col]
    comments = answers[answers.notna() & ~answers.isin(filtered_comments)]
    # Keep only courses that actually received a comment. `Series.all()` is not
    # an emptiness test: it is True for an empty Series.
    if not comments.empty:
        course_to_comments[course] = comments

## Ranking

### Course ranking by criteria

In [158]:
# Summary statistics shown per course and criterion. Each function receives a
# frame with one column per (course, criterion) pair and returns a Series
# indexed by course.
def _bayesian_estimated_mean(x):
    """Category-aware Bayesian estimate of each column's mean."""
    return bayesian_mean(x, categories)


def _median(x):
    """NaN-ignoring median of each column."""
    return pd.Series(np.nanmedian(x, axis=0), index=courses)


def _mode(x):
    """
    Most frequent value of each column (the smallest one on ties), ignoring NaN.

    Uses DataFrame.mode instead of `stats.mode(...)[0].data[0]`. That indexing
    depends on the SciPy version: where the reduced axis is dropped (documented
    for SciPy 1.11+), it yields a single scalar that is then broadcast to every
    course.
    """
    # Row 0 of DataFrame.mode holds the smallest mode of each column; reindex
    # keeps that row present (as NaN) even when no column has any value.
    return x.mode().reindex([0]).iloc[0].droplevel(1)


def _standard_deviation(x):
    """Sample standard deviation of each column."""
    return x.std().droplevel(1)


functions = [_bayesian_estimated_mean, _median, _mode, _standard_deviation]
function_to_name = dict(zip(functions, ["Bayesian estimated mean", "Median", "Mode", "Standard deviation"]))
# Subplot grid: 3 rows x 2 columns per statistic. The grid cells are filled
# row by row with one title per criterion; unused cells stay None.
rows = len(functions) * 3
cols = 2
subtitles = np.empty((rows, cols), dtype="object")
for f_i, function in enumerate(functions):
    function_name = function_to_name.get(function)
    for c_i, criterion in enumerate(criteria):
        row_in_group, col = divmod(c_i, 2)
        row = f_i * 3 + row_in_group
        subtitles[row, col] = f"{function_name} {criterion} score for each course"


# One scatter subplot per (statistic, criterion): courses on x, statistic on y,
# coloured and shaped by course category.
fig_ranking = make_subplots(rows=len(functions) * 3, cols=2,
        subplot_titles=subtitles.reshape(-1))

for f_i, function in enumerate(functions):
    function_name = function_to_name.get(function)
    for c_i, criterion in enumerate(criteria):
        # make_subplots positions are 1-based; two criteria share a grid row.
        row = f_i * 3 + c_i // 2 + 1
        col = c_i % 2 + 1
        course_to_criterion = function(df.loc[[(subject, criterion) for subject in courses]].T)

        scatter = px.scatter(course_to_criterion, color=categories, symbol=categories)
        for trace in scatter.select_traces():
            # Only the very first subplot contributes legend entries.
            trace.showlegend = f_i == 0 and c_i == 0
            fig_ranking.add_trace(trace, row=row, col=col)
        fig_ranking.update_yaxes(
            title=f"{function_name} {criterion}",
            row=row, col=col
        )

# Legend cleanup and sizing do not depend on the loop variables, so they run
# once after all traces are added instead of once per statistic.
remove_double_naming_in_legend(fig_ranking)
fig_ranking.update_layout(height=5000, width=2400)
fig_ranking.write_image("images/rankings_of_criteria.png")
fig_ranking

### Scores 

In [159]:
# Score table with one row per criterion, built by get_scores_df from the
# full ratings frame `df`.
scores_df = get_scores_df(df, criteria, categories)

In [160]:
# Number of filled-in rankings per course. This reuses the helper defined in
# the Utility section instead of repeating its counting loop inline.
course_to_num_ranking_series = get_course_to_num_ranking_series(df)
# Plain-dict view kept for any code that still reads `course_to_num_ranking`.
course_to_num_ranking = course_to_num_ranking_series.to_dict()

In [161]:
# Overall score per course: sum over all criteria, sorted ascending, divided by 25.
# NOTE(review): 25 presumably equals 5 criteria x a maximum rating of 5. Confirm.
overall_ranking = scores_df.sum().sort_values() / 25
sorted_categories = course_to_category(overall_ranking.index)
fig_scores = px.scatter(overall_ranking, color=sorted_categories, symbol=sorted_categories, title="Course ranking (General)")
fig_scores.update_xaxes(title="Course")
fig_scores.update_yaxes(title="Ranking")

remove_double_naming_in_legend(fig_scores)

# Grey background bars showing how many rankings each course received.
# NOTE(review): the counts are divided by their L2 norm, so the bars are
# relative sizes rather than true percentages despite the trace name.
num_ranking_bar = go.Bar(
    x=course_to_num_ranking_series.index,
    y=course_to_num_ranking_series / np.linalg.norm(course_to_num_ranking_series),
    marker={"color": "rgba(100, 100, 100, 0.3)"},
    name="Percentage of data points",
)
fig_scores.add_trace(num_ranking_bar)
fig_scores.update_layout(height=1000, width=2000)
fig_scores.write_image("images/Course ranking (General).png")
fig_scores

### Weighted scores

In [162]:
# Criterion weights of the two composite scores. A criterion missing from a
# dict contributes nothing to that score.
weights_real_world = {"Application": 1.5, "Relevance": 1, "Understanding": 0.2, "Ease": 0.5}
weights_academic = {"Insight": 1.5, "Understanding": 1.5}


def _weight_criteria(scores, criterion_weights):
    """Multiply every criterion row of `scores` by its weight (0 when the criterion has none)."""
    return scores.apply(lambda row: criterion_weights.get(row.name, 0) * row, axis=1)


scores_df_real_world = _weight_criteria(scores_df, weights_real_world)
scores_df_academic = _weight_criteria(scores_df, weights_academic)

# Per-course totals, scaled so that the best course scores 1.
_real_world_totals = scores_df_real_world.sum()
_academic_totals = scores_df_academic.sum()
standardized_scores_df_real_world = _real_world_totals / max(_real_world_totals)
standardized_scores_df_academic = _academic_totals / max(_academic_totals)

In [163]:
# One row per course, one column per composite score.
weighted_scores = pd.DataFrame({
    "Real-world score": standardized_scores_df_real_world,
    "Academic score": standardized_scores_df_academic,
})

In [164]:
# Scatter of the two composite scores, one labelled point per course,
# coloured and shaped by course category.
sorted_categories = course_to_category(weighted_scores.index)
fig_weighted_scores = px.scatter(weighted_scores, x="Real-world score", y="Academic score",color=sorted_categories, symbol=sorted_categories, title="Course ranking (Real-world Vs. Academic)",
                hover_name=weighted_scores.index, text=weighted_scores.index)

remove_double_naming_in_legend(fig_weighted_scores)

fig_weighted_scores.update_traces(textposition="top center")
fig_weighted_scores.update_layout(height=1000, width=1600)
# The formulas shown on the figure mirror weights_academic and
# weights_real_world; keep them in sync when the weights change.
fig_weighted_scores.add_annotation(
    x=.4, y=.98,
    text="Academic_score = (Insight * 1.5 + Understanding * 1.5) / max_score",
    showarrow=False
)
fig_weighted_scores.add_annotation(
    x=.462, y=1,
    text="Real_world_score = (Application * 1.5 + Relevance + Understanding * 0.2 + Ease * 0.5) / max_score",
    showarrow=False
)
fig_weighted_scores.write_image("images/Course ranking (Real-world Vs. Academic).png")
fig_weighted_scores

# Comments

In [165]:
# Text-only figure: the course name at x=1 next to its first comment, and every
# comment on its own line at x=7.
fig = go.Figure()
count = 1  # y position of the next comment line
for course, comments in course_to_comments.items():
    if not len(comments):
        continue
    fig.add_annotation(x=1, y=count, text=course, showarrow=False)
    for comment in comments:
        fig.add_annotation(x=7, y=count, text=comment, showarrow=False)
        count += 1
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.update_layout(height=1200, width=1800, yaxis_range=[0, count], xaxis_range=[0, 10])
fig.write_image("images/comments.png")
fig