In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install git+https://github.com/gjf2a/SPARC_lib
!pip install openpyxl

from SPARC_lib import *

from IPython.display import display, Markdown
from dataclasses import dataclass
from typing import *
import math

In [None]:
# Locations of the two Excel workbooks under the Kaggle input directory.
COURSE_HISTORY_PATH = "/kaggle/input/course-history/Course_History_Since_2014.xlsx"
DATA_DICTIONARY_PATH = '/kaggle/input/data-dictionary/Data_Dictionary.xlsx'

# Every column is read as text (dtype=str); later cells convert the ACRK score
# and entrance year with int() where a number is needed.
courses = pd.read_excel(COURSE_HISTORY_PATH, dtype=str)
data_dictionary = pd.read_excel(DATA_DICTIONARY_PATH, dtype=str)

# SPARC_lib helper; later cells look the result up by a student's id_num.
student2courses = load_course_table(courses)

In [None]:
class StudentRecord:
    """One student's identifying fields, ACRK score, and first-semester courses.

    Parameters:
        id_num: student identifier, stored as given.
        cohort: cohort label, stored as given.
        entrance_year: entrance year, stored as given.
        acrk: ACRK score; converted with ``int()``, so a numeric string is accepted.
        sem_1_courses: list of the student's first-semester courses; a new
            empty list is created when omitted, so instances never share one.
    """

    def __init__(self, id_num, cohort, entrance_year, acrk, sem_1_courses=None):
        self.id_num = id_num
        self.cohort = cohort
        self.entrance_year = entrance_year
        self.acrk = int(acrk)
        self.sem_1_courses = [] if sem_1_courses is None else sem_1_courses

    def __repr__(self):
        # List every constructor argument in signature order so the printed
        # form matches StudentRecord(...) and no field is silently dropped.
        return (
            f"StudentRecord({self.id_num!r}, {self.cohort!r}, {self.entrance_year!r}, "
            f"{self.acrk!r}, {self.sem_1_courses!r})"
        )
    

In [None]:
# Build a StudentRecord for every data-dictionary row that has a string ACRK
# score and whose id_num also appears in the course table.
ids2acrk = {}
for _index, row in data_dictionary.iterrows():
    score = row['ACRK Score']
    # Skip rows whose score is not a str (presumably blank cells read as NaN — confirm).
    if type(score) != str:
        continue
    student_id = row['id_num']
    if student_id not in student2courses:
        continue
    ids2acrk[student_id] = StudentRecord(student_id, row['cohort'], int(row['entrance_yr']), score)

In [None]:
# Attach to each student the courses taken in term '1S' of their entrance year.
# NOTE(review): record.entrance_year is an int; this only matches if course.year
# is also numeric — confirm against SPARC_lib's course objects.
for id_num, record in ids2acrk.items():
    first_semester = []
    for course in student2courses[id_num]:
        if course.term == '1S' and course.year == record.entrance_year:
            first_semester.append(course)
    record.sem_1_courses = first_semester

* For each course
  * Give the DFW ratio
  * For each ACRK level
    * Give the DFW ratio for that level
* See if there are courses with clear differentiation
  * In many cases, the total number of students taking the course in their first semester is just too low.
  * Maybe I should extend to the entire first year.

In [None]:
from functools import total_ordering

def course_key(discipline, number):
    """Return the dictionary key for a course: discipline and number joined by one space."""
    key = f'{discipline} {number}'
    return key

@total_ordering
class CourseStats:
    """Ratios accumulated for one course: an overall ratio plus one per ACRK level.

    Instances order by ``overall`` first, then by the per-level ratios in
    ascending level order, then by discipline, then by number.
    ``functools.total_ordering`` derives the remaining comparison operators
    from ``__lt__`` and ``__eq__``.

    Parameters:
        discipline: discipline part of the course identifier.
        number: number part of the course identifier.
        overall: ratio object covering all students; a fresh ``Ratio()`` when omitted.
        acrk_ratios: mapping from ACRK level to ratio object; when omitted, a
            fresh ``Ratio()`` is created for every level in ``ACRK_LEVELS``.
    """

    # ACRK levels tracked per course. Shared by __init__ and __lt__ so the two
    # cannot drift apart.
    ACRK_LEVELS = range(1, 7)

    def __init__(self, discipline, number, overall=None, acrk_ratios=None):
        self.discipline = discipline
        self.number = number
        self.overall = Ratio() if overall is None else overall
        self.acrk_ratios = {i: Ratio() for i in self.ACRK_LEVELS} if acrk_ratios is None else acrk_ratios

    def __repr__(self):
        return f"CourseStats({self.discipline}, {self.number}, {self.overall}, {self.acrk_ratios})"

    def __lt__(self, other):
        # Let Python try the reflected operation (or raise TypeError) instead
        # of failing with AttributeError on an unrelated type.
        if not isinstance(other, CourseStats):
            return NotImplemented
        if self.overall < other.overall:
            return True
        if self.overall > other.overall:
            return False
        # Overall ratios tie: break the tie level by level, lowest level first.
        for acrk in self.ACRK_LEVELS:
            mine, theirs = self.acrk_ratios[acrk], other.acrk_ratios[acrk]
            if mine < theirs:
                return True
            if mine > theirs:
                return False
        # All ratios tie: fall back to the course identifier.
        if self.discipline < other.discipline:
            return True
        if self.discipline > other.discipline:
            return False
        return self.number < other.number

    def __eq__(self, other):
        if not isinstance(other, CourseStats):
            return NotImplemented
        return self.discipline == other.discipline and self.number == other.number\
            and self.overall == other.overall and self.acrk_ratios == other.acrk_ratios
        
# Grades counted as a D/F/W outcome. A set of whole grades replaces the
# substring test `grade in 'DFW'`, which is also true for the empty string
# (and for 'DF', 'FW', 'DFW') and so could miscount a blank grade as a DFW.
# NOTE(review): plus/minus variants such as 'D+' are not counted, exactly as
# before — confirm that is intended.
DFW_GRADES = {'D', 'F', 'W'}

# Maps "<discipline> <number>" to the CourseStats accumulated over every
# student's first-semester courses.
# NOTE: this rebinds `courses`, which earlier held the course-history DataFrame.
courses = {}
for record in ids2acrk.values():
    for course in record.sem_1_courses:
        key = course_key(course.discipline, course.number)
        if key not in courses:
            courses[key] = CourseStats(course.discipline, course.number)
        dfw = course.grade in DFW_GRADES
        course_stats = courses[key]
        # Presumably Ratio.count(in_denominator, in_numerator) — confirm against SPARC_lib.
        course_stats.overall.count(True, dfw)
        # Raises KeyError if record.acrk is not a level present in acrk_ratios.
        course_stats.acrk_ratios[record.acrk].count(True, dfw)
        

In [None]:
# Keep only courses whose overall denominator exceeds this threshold.
# NOTE(review): the comparison is strict, so a course with exactly
# min_student_records records is excluded — confirm that is intended.
min_student_records = 30

# (CourseStats, key) pairs, so the sort order is driven by CourseStats ordering.
sorted_courses = [
    (course, key)
    for (key, course) in courses.items()
    if course.overall.denominator > min_student_records
]
sorted_courses.sort()

In [None]:
# Order the courses from largest to smallest under CourseStats ordering.
# Sorting with reverse=True (instead of calling .reverse()) makes this cell
# idempotent: re-running it leaves the order unchanged rather than flipping it
# back to ascending. The result matches .reverse() on the ascending list
# because every (CourseStats, key) pair has a distinct key, so no two compare equal.
sorted_courses.sort(reverse=True)

In [None]:
# One report block per course: the course key with its overall percentage,
# then one line per ACRK level, then a blank separator line.
for course_stats, course_name in sorted_courses:
    overall_percent = course_stats.overall.percent()
    print(course_name, overall_percent)
    per_level = course_stats.acrk_ratios
    for acrk, acrk_ratio in per_level.items():
        print(f"ACRK {acrk}: {acrk_ratio.percent()}")
    print()
    