In [1]:
import pandas as pd
import numpy as np
import collections
import math as maths

def loaddata(filename, efforts=None, grades=None):
    """Read effort/grade samples from a CSV file.

    The CSV must contain the columns ``effort_hours`` and ``grade_attained``.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        efforts: Optional list that the ``effort_hours`` values are appended
            to.  A fresh list is created when omitted.
        grades: Optional list that the ``grade_attained`` values are appended
            to.  A fresh list is created when omitted.

    Returns:
        The ``(efforts, grades)`` lists, so callers that rely on the defaults
        can still get at the loaded data.

    Raises:
        KeyError: If either expected column is missing.
    """
    # Use None sentinels: a literal [] default would be shared by every call
    # and keep growing from one load to the next.
    if efforts is None:
        efforts = []
    if grades is None:
        grades = []

    df = pd.read_csv(filename)
    # Look up both columns before mutating anything so a missing column does
    # not leave the caller's lists half-filled.
    grade_column = df['grade_attained']
    effort_column = df['effort_hours']

    grades.extend(grade_column)
    efforts.extend(effort_column)
    return efforts, grades

def to_numeric_grades(grades):
    """Translate letter grades into numeric scores.

    'A' maps to 80.0 and each following letter down to 'F' is 10.0 lower
    (so 'F' is 30.0).  Any value that is not one of these six upper-case
    letters is skipped, so the result may be shorter than the input.
    """
    score_for = {
        'A': 80.0,
        'B': 70.0,
        'C': 60.0,
        'D': 50.0,
        'E': 40.0,
        'F': 30.0,
    }
    return [score_for[letter] for letter in grades if letter in score_for]

def to_alpha_grades(ngrades):
    """Translate numeric scores into letter grades.

    Bands are 10 points wide with inclusive lower bounds: 80 and above is
    'A', [70, 80) is 'B', down to [40, 50) for 'E'; anything below 40 is 'F'.
    A score that compares false against every bound (e.g. NaN) produces no
    letter at all.
    """
    # Ordered from the highest band down; the first floor reached wins.
    bands = ((80.0, 'A'), (70.0, 'B'), (60.0, 'C'), (50.0, 'D'), (40.0, 'E'))

    letters = []
    for score in ngrades:
        for floor, letter in bands:
            if score >= floor:
                letters.append(letter)
                break
        else:
            # No band matched: explicit check so NaN is not turned into 'F'.
            if score < 40.0:
                letters.append('F')
    return letters

def cost(m, c, xs, ys):
    """Return half the mean squared error of the line ``y = m*x + c``.

    The residual ``(m*x + c) - y`` is computed for each ``(x, y)`` pair,
    squared and summed, then divided by ``2 * len(xs)``.
    """
    residuals = [((m * x) + c) - y for x, y in zip(xs, ys)]
    squared_error_total = np.sum(np.power(residuals, 2))
    return squared_error_total / (2 * len(xs))

def differentiate_numerically(f, coeffs, xs, ys, i, h):
    """Estimate a partial derivative of ``f`` by central differences.

    Args:
        f: Callable invoked as ``f(coeff0, coeff1, xs, ys)``.
        coeffs: The two coefficients at which to differentiate.  Any
            sequence or array is accepted; it is never modified.
        xs: Passed through to ``f`` unchanged.
        ys: Passed through to ``f`` unchanged.
        i: Index (0 or 1) of the coefficient to differentiate against.
        h: Half-width of the finite-difference step.

    Returns:
        ``(f(coeffs + h*e_i) - f(coeffs - h*e_i)) / (2*h)``.
    """
    # Work on a float copy: with an integer array the +/- h perturbation
    # would be truncated away and the estimate would collapse to zero.
    base = np.array(coeffs, dtype=float)

    forward = base.copy()
    forward[i] += h
    backward = base.copy()
    backward[i] -= h

    f_forward = f(forward[0], forward[1], xs, ys)
    f_backward = f(backward[0], backward[1], xs, ys)
    return (f_forward - f_backward) / (2 * h)

def differentiate_analytically_dy_dm(coeffs, xs, ys):
    """Closed-form slope gradient of the half sum of squared errors.

    Computes ``m*sum(x^2) + c*sum(x) - sum(x*y)`` with ``m = coeffs[0]`` and
    ``c = coeffs[1]``.  Note that, unlike ``cost``, the value is not divided
    by the number of samples.
    """
    slope, intercept = coeffs[0], coeffs[1]
    slope_term = slope * np.sum(np.power(xs, 2))
    intercept_term = intercept * np.sum(xs)
    # np.dot(xs, ys) would be the usual spelling of this sum of products.
    cross_term = np.sum(np.multiply(xs, ys))
    return slope_term + intercept_term - cross_term

def differentiate_analytically_dy_dc(coeffs, xs, ys):
    """Closed-form intercept gradient of the half sum of squared errors.

    Computes ``m*sum(x) - sum(y) + n*c`` with ``m = coeffs[0]``,
    ``c = coeffs[1]`` and ``n = len(xs)``.  Like the slope gradient, the
    value is not divided by the number of samples.
    """
    slope, intercept = coeffs[0], coeffs[1]
    x_total = np.sum(xs)
    y_total = np.sum(ys)
    return slope * x_total - y_total + len(xs) * intercept

def magnitude(xs):
    """Return the sum of the absolute values of ``xs`` (0 when empty)."""
    return sum(np.abs(value) for value in xs)

def newton(m, c, xs, ys, method):
    """Fit the line ``y = m*x + c`` by fixed-step gradient descent.

    Despite the name this is plain gradient descent rather than Newton's
    method: every step subtracts ``learning_rate * gradient`` from ``m`` and
    ``c``.

    Args:
        m: Starting slope.
        c: Starting intercept.
        xs: Input values.
        ys: Target values, paired with ``xs``.
        method: ``'numeric'`` estimates the gradient of ``cost`` by central
            differences; any other value uses the closed-form
            ``differentiate_analytically_dy_dm`` / ``_dy_dc`` helpers.

    Returns:
        The tuple ``(m, c)`` as soon as one of these holds: both parameters
        moved by less than ``1e-6`` in a step, the budget of 99,999 updates is
        used up, or either parameter became NaN (the descent has diverged
        and further steps cannot recover).
    """
    h = 0.000001              # finite-difference half-step for 'numeric'
    learning_rate = 0.0001
    tolerance = 0.000001
    remaining = 100000        # countdown; the last tick performs no update

    while remaining > 1:
        if method == 'numeric':
            # np.asarray(..., dtype=float) replaces np.asfarray, which was
            # removed in NumPy 2.0.
            coeffs = np.asarray([m, c], dtype=float)
            dy_dm = differentiate_numerically(cost, coeffs, xs, ys, 0, h)
            dy_dc = differentiate_numerically(cost, coeffs, xs, ys, 1, h)
        else:
            coeffs = [m, c]
            dy_dm = differentiate_analytically_dy_dm(coeffs, xs, ys)
            dy_dc = differentiate_analytically_dy_dc(coeffs, xs, ys)

        next_m = m - (dy_dm * learning_rate)
        next_c = c - (dy_dc * learning_rate)
        converged = (np.abs(next_m - m) < tolerance
                     and np.abs(next_c - c) < tolerance)
        m, c = next_m, next_c

        if converged:
            break
        if maths.isnan(m) or maths.isnan(c):
            # Diverged: stop now instead of burning the remaining budget.
            break

        remaining -= 1

    return m, c

def scale(xs):
    """Standardise ``xs``: subtract its mean and divide by its standard deviation.

    Uses ``np.mean`` and ``np.std`` (population standard deviation).
    """
    centre = np.mean(xs)
    spread = np.std(xs)
    centred = xs - centre
    return centred / spread

def rsquared(ys, modelys):
    """Return the goodness of fit as ``ssr(ys, modelys) / sst(ys)``.

    ``ys`` are the observed values and ``modelys`` the values predicted by
    the model for the same samples.
    """
    explained = ssr(ys, modelys)
    total = sst(ys)
    return explained / total

def ssr(ys, modelys):
    """Sum of squared deviations of the model values from the mean of ``ys``."""
    observed_mean = np.mean(ys)
    deviations = modelys - observed_mean
    return np.sum(deviations ** 2)

def sst(ys):
    """Total sum of squares: squared deviations of ``ys`` from its own mean."""
    observed_mean = np.mean(ys)
    deviations = ys - observed_mean
    return np.sum(deviations ** 2)

if __name__ == '__main__':
    # NOTE(review): plt is not used in this cell; the import is kept in case
    # later notebook cells rely on it -- confirm and drop if not.
    import matplotlib.pyplot as plt

    real_efforts = []
    real_grades = []
    loaddata('efforts.csv', real_efforts, real_grades)
    # Grades are used exactly as read from the CSV.  If the file holds letter
    # grades, convert them first with to_numeric_grades(real_grades).

    # Order the samples by effort.  Sorting the (effort, grade) pairs
    # directly, rather than via a dict keyed by effort, keeps samples that
    # share the same number of hours instead of silently dropping them.
    pairs = sorted(zip(real_efforts, real_grades), key=lambda pair: pair[0])
    raw_efforts = np.asarray([effort for effort, _ in pairs], dtype=float)
    raw_grades = np.asarray([grade for _, grade in pairs], dtype=float)

    # Remember the scaling constants so predictions made in standardised
    # units can be converted back to hours / percent.
    effort_mean, effort_sd = np.mean(raw_efforts), np.std(raw_efforts)
    grade_mean, grade_sd = np.mean(raw_grades), np.std(raw_grades)
    nefforts = scale(raw_efforts)
    ngrades = scale(raw_grades)

    # (label, gradient method, starting m, starting c) for each fit.
    runs = (
        ('Numeric - ', 'numeric', 1, 1),
        ('Analytic - ', 'analytic', 5, 2),
    )
    for label, method, start_m, start_c in runs:
        # Descend from the starting point until the gradient is near zero.
        m, c = newton(start_m, start_c, nefforts, ngrades, method)
        print(label, 'm=', m, ' , c=', c)

        # Goodness of fit compares the model against the observed grades.
        modelys = m * nefforts + c
        print(label, 'Goodness of fit ', rsquared(ngrades, modelys))

        # The model lives in standardised units: standardise the 80% target,
        # invert the line, then convert the effort back to hours.
        target_grade = (80 - grade_mean) / grade_sd
        hours_for_a = ((target_grade - c) / m) * effort_sd + effort_mean
        print(label, 'To get an A (80%) you need ', hours_for_a, ' hours of effort')

    # TODO: the fit does not work unless the data is normalised first; check
    # how other implementations handle this.

FileNotFoundError: File b'efforts.csv' does not exist