# Imports

In [None]:
from canvasapi import Canvas

from os.path import join

from autocanvas.config import INPUT_DIR, OUTPUT_DIR

from autocanvas.core.conversions import (
    series_from_api_object, 
    df_from_api_list)

from autocanvas.core.course_info import (
    get_PHY_course, 
    get_assignment_group_from_name, 
    get_teaching_personel,
    get_students_from_sections,)

from autocanvas.core.assignments import (
    get_assignment,
    get_assignment_submissions,
    get_graded_submissions,
    get_submitted_submissions,
    get_assignment_groups,
    get_assignment_collection,
    get_submissions_in_collection,)

import re
from datetime import datetime
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt

from getpass import getpass

# Base URL of the Canvas instance used for every API call in this notebook.
API_URL = "https://ufl.instructure.com/"
try:
    from autocanvas.config import get_API_key
    API_KEY = get_API_key()
except FileNotFoundError as e:
    # Loading the stored key failed: report why and ask for the token
    # interactively. getpass() hides the typed token, whereas input() would
    # echo it into the cell output that is saved with the notebook.
    print(e)
    API_KEY = getpass("Asking for API token")

# Client object through which all later cells talk to Canvas.
canvas = Canvas(API_URL, API_KEY)

In [None]:
# Set the default font size used by all matplotlib figures created below.
plt.rcParams["font.size"] = 18

In [None]:
# Look up the course object for the chosen course code and semester,
# then print its name as a quick sanity check.
course = get_PHY_course(
    canvas,
    semester="Spring 2021",
    course_code="PHY2054",
)
print(course.name)

In [None]:
# CSV handed to get_students_from_sections as `section_ta_csv`.
file_name = "section_ta_phy2054_spring2021.csv"
file_path = join(INPUT_DIR, file_name)

# One frame per requested group, in the order given in `groups`.
df_TAs, df_teachers = get_teaching_personel(
    course,
    groups=["ta", "teacher"],
    add_first_name=True,
)

df_students, df_sections = get_students_from_sections(
    course,
    section_ta_csv=file_path,
)
print("Completed importing students")

# Get Grand Averages per TA 

The calculation below follows page 11 of https://www.nist.gov/system/files/documents/2017/05/09/combine-1.pdf

In [None]:
# Only quizzes whose identifier number is at most this value are analysed.
max_quiz_number = 9
df_regular_quizzes = get_assignment_collection(
    course,
    assignment_group_name="recitation quizzes",
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence, which Python warns about and will eventually reject.
    name_pattern=r"^Quiz \d+",
    exclude_numbers=[0, 10, 13],
    add_identifier_numbers=True,
)
# Keep the regular quizzes up to and including `max_quiz_number`.
collection = df_regular_quizzes[
    df_regular_quizzes["identifier_number"] <= max_quiz_number
]
collection

In [None]:
# Gather the submissions for every regular quiz selected in `collection`.
all_submissions = get_submissions_in_collection(
    df_TAs=df_TAs,
    df_students=df_students,
    assignment_collection=collection,
)

In [None]:
df_makeup_quizzes = get_assignment_collection(
    course,
    assignment_group_name="recitation quizzes",
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence, which Python warns about and will eventually reject.
    name_pattern=r"^Makeup (Q|q)uiz \d+",
    exclude_numbers=[0, 10, 13],
    add_identifier_numbers=True,
)
# Keep the makeup quizzes up to and including `max_quiz_number`.
makeup_collection = df_makeup_quizzes[
    df_makeup_quizzes["identifier_number"] <= max_quiz_number
]
all_makeup_submissions = get_submissions_in_collection(
    assignment_collection=makeup_collection,
    df_students=df_students,
    df_TAs=df_TAs,
)

In [None]:
# Build `df_full`: one row per student, one "<quiz name> grade" column per
# regular quiz, merged on the identifying columns below.
common_columns = ["user_id", "name", "section_ta_first_name"]
use_makeups = True
df_full = pd.DataFrame()
for row_index, assignment in all_submissions.iterrows():
    prefix = assignment["name"]
    print(prefix)
    quiz_number = assignment["identifier_number"]

    df_subs = assignment["submissions"]
    df_graded = get_graded_submissions(df_subs)
    df_graded_combined = df_graded
    if use_makeups:
        makeup_rows = all_makeup_submissions[
            all_makeup_submissions["identifier_number"] == quiz_number
        ]
        if makeup_rows.empty:
            # Without this guard `.iloc[0]` raises IndexError for a quiz
            # that has no makeup assignment; use the regular grades only.
            print("  no makeup quiz found for quiz {}; "
                  "using regular submissions only".format(quiz_number))
        else:
            df_makeup_subs = makeup_rows["submissions"].iloc[0]
            df_makeup_graded = get_graded_submissions(df_makeup_subs)
            df_graded_combined = pd.concat((df_graded, df_makeup_graded))

    # Makeup rows are appended after the regular ones, so keep="last" lets
    # a student's makeup grade replace their regular grade.
    df_graded_combined = df_graded_combined.drop_duplicates(
                                            subset=["user_id"],
                                            keep="last")

    df_graded_combined = (df_graded_combined[common_columns + ["grade"]]
                          .rename(columns={"grade": prefix + " grade"}))

    if not df_full.empty:
        # validate="1:1" makes pandas raise if a student appears more than
        # once on either side of the merge.
        df_full = pd.merge(left=df_full,
                           right=df_graded_combined,
                           how="outer",
                           on=common_columns,
                           validate="1:1")
    else:
        df_full = df_graded_combined

df_full

In [None]:
# Sorted list of the distinct TA first names found in `df_full`.
ta_order = df_full["section_ta_first_name"].drop_duplicates().sort_values().to_list()
ta_order

In [None]:
# Select the per-quiz grade columns and cast them to float.
# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence, which Python warns about and will eventually reject.
quiz_columns = df_full.filter(regex=r'Quiz (\d)+ grade').columns
df_full[quiz_columns] = df_full[quiz_columns].astype("float")
df_full

In [None]:
# Mean grade of every individual (non-cumulative) quiz, grouped by TA.
# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence, which Python warns about and will eventually reject.
quiz_noncum_avg_columns = df_full.filter(regex=r'Quiz (\d)+ grade').columns
ta_quiz_avgs = (df_full.groupby("section_ta_first_name")
                        [quiz_noncum_avg_columns]
                         .agg("mean")
               )
print(ta_quiz_avgs.round(2).to_markdown())

In [None]:
# Add one "quiz_average_upto_<N>" column per quiz: each student's mean grade
# over all quizzes numbered <= N (missing grades are skipped).

# Read the quiz number from the "Quiz <N> grade" part of the column name
# only, so other digits in an assignment name cannot leak into the number.
quiz_number_by_column = {
    column: int(re.search(r"Quiz (\d+) grade", column).group(1))
    for column in quiz_columns
}
# Sort by quiz number so that "upto N" really covers quizzes 1..N even if
# the columns of df_full are not already in quiz order.
ordered_quiz_columns = sorted(quiz_columns, key=quiz_number_by_column.get)

for position, quiz_col in enumerate(ordered_quiz_columns, start=1):
    quiz_number = quiz_number_by_column[quiz_col]
    col_avg_name = "quiz_average_upto_{}".format(quiz_number)
    df_full[col_avg_name] = (df_full[ordered_quiz_columns[:position]]
                             .mean(axis=1, skipna=True))
df_full

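The standard error of the mean, estimated from the students' average quiz scores, is:

$$\hat{u} = \sqrt{\frac{\sum_{i=1}^k (\bar{x}_i-\bar{\bar{x}})^2}{k(k-1)}}$$

where $\bar{x}_i$ is the average quiz score of student $i$, $\bar{\bar{x}}$ is the mean of those student averages, and $k$ is the number of students.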

In [None]:
def get_ci_mean(data, confidence_level=95):
    """
    Two-sided Student-t confidence interval for the mean of a sample.

    data: pd.Series, contains the measurements. Missing values (NaN)
        are dropped first, so they do not inflate the sample size.
    confidence level: float or int, percentage from 
        0 to 100. It gives the confidence that the true 
        population mean lies within the confidence interval
    
    Returns the tuple (lcb, ucb, standard_error_of_mean): the lower and
    upper bound of the confidence interval and the standard error of the
    mean. All three are NaN when fewer than two measurements remain,
    because the sample standard deviation is undefined in that case.
    """
    # Imported locally so the function works no matter which other cells
    # have already been run.
    import numpy as np
    from scipy.stats import t

    # mean() and std() skip NaN but len() does not; drop NaN up front so
    # the sample size used below matches the values actually averaged.
    data = data.dropna()
    n_sample = len(data)
    if n_sample < 2:
        return np.nan, np.nan, np.nan
    degrees_of_freedom = n_sample - 1
    
    point_estimate_of_mean = data.mean()
    sample_std_estimate = data.std()
    standard_error_of_mean = (sample_std_estimate /
                              np.sqrt(n_sample))
    
    cl = confidence_level / 100
    
    # scipy.stats.t.ppf is the inverse CDF, i.e. one-sided. For a two-sided
    # interval the excluded probability (1 - cl) is split between both
    # tails, so the quantile to look up is cl + (1 - cl) / 2.
    cl_one_sided = cl + (1-cl)/2
    t_star = t.ppf(cl_one_sided, df=degrees_of_freedom)
    
    lcb = point_estimate_of_mean - t_star * standard_error_of_mean
    ucb = point_estimate_of_mean + t_star * standard_error_of_mean
    
    return lcb, ucb, standard_error_of_mean

In [None]:
import numpy as np
def std_err_of_mean_of_means(data):
    """Return the standard error of the mean of `data` (a series).

    NaN entries are removed before the value is computed by get_ci_mean.
    """
    not_nan = ~np.isnan(data)
    _, _, std_err = get_ci_mean(data[not_nan], confidence_level=95)
    return std_err


def get_lower_bound_95(data):
    """Return the lower bound of the 95% confidence interval of the mean.

    `data` is a series; NaN entries are removed before get_ci_mean is called.
    """
    not_nan = ~np.isnan(data)
    lower_bound, _, _ = get_ci_mean(data[not_nan], confidence_level=95)
    return lower_bound


def get_upper_bound_95(data):
    """Return the upper bound of the 95% confidence interval of the mean.

    `data` is a series; NaN entries are removed before get_ci_mean is called.
    """
    not_nan = ~np.isnan(data)
    _, upper_bound, _ = get_ci_mean(data[not_nan], confidence_level=95)
    return upper_bound
    

In [None]:
# Per-TA statistics of the students' cumulative quiz averages.
# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence, which Python warns about and will eventually reject.
quiz_avg_columns = df_full.filter(regex=r'quiz_average_upto_(\d)+').columns

# Group once and reuse the grouped object for all four aggregations.
grouped_by_ta = df_full.groupby("section_ta_first_name")[quiz_avg_columns]

# Mean of the student averages per TA.
means = grouped_by_ta.agg("mean").add_suffix("_mean")
# Standard error of that mean (the "_std" suffix is what later cells filter on).
stds = grouped_by_ta.agg(std_err_of_mean_of_means).add_suffix("_std")
# Lower and upper bounds of the 95% confidence interval of the mean.
lcbs = grouped_by_ta.agg(get_lower_bound_95).add_suffix("_lcb")
ucbs = grouped_by_ta.agg(get_upper_bound_95).add_suffix("_ucb")

In [None]:
# Verifying what seaborn calculates in the following plot:
# put all per-TA statistics side by side, then print the "_mean" and the
# "_std" columns as two markdown tables rounded to two decimals.
summary_cumul_ta = pd.concat([means, stds, lcbs, ucbs], axis=1)
for column_regex in (".*_mean$", ".*_std$"):
    print(summary_cumul_ta.filter(regex=column_regex).round(2).to_markdown())

In [None]:
# Reshape to long format: one row per (user_id, quiz number), with the
# cumulative average in the single column "quiz_average_upto_".
df_long = pd.wide_to_long(
    df_full,
    stubnames="quiz_average_upto_",
    i="user_id",
    j="Quiz number",
)

In [None]:
# Turn the last index level into an ordinary column so it can be used as a
# plot variable, and work on an independent copy.
df_plot = df_long.reset_index(level=-1)
df_plot = df_plot.copy()
df_plot

In [None]:
# Point plot of the cumulative quiz average against quiz number, one hue per
# TA, with 95% confidence intervals and no lines joining the points.
bars = sns.catplot(
    data=df_plot,
    x="Quiz number",
    y="quiz_average_upto_",
    hue="section_ta_first_name",
    hue_order=ta_order,
    kind="point",
    join=False,
    dodge=0.3,
    ci=95,
    height=10,
    aspect=1.2,
)
plt.ylabel("Cumulative Average Grade")
plt.title("Progression of Quiz Averages per TA (Cumulative)")
# Remove the legend title.
bars.legend.set_title('')
plt.gcf().subplots_adjust(bottom=0.15, left=0.15, top=0.9)

# Save the figure with an opaque white background.
plot_path = join(OUTPUT_DIR, "grade_progression_TA.png")
plt.savefig(plot_path, facecolor='w', transparent=False)