In [226]:
import sqlite3
import pandas as pd
import os
import copy
import traceback
import random
import regex as re
import statistics
import numpy as np
pd.options.mode.chained_assignment = None

In [227]:
# Absolute location of the intermediate SQLite database, assembled from
# individual path components.
process_path = os.path.join(
    os.sep, "home", "jupyter", "Team-Prophecy",
    "Data", "02_processed", "intermediate.db",
)
print(process_path)

/home/jupyter/Team-Prophecy/Data/02_processed/intermediate.db


In [228]:
# Absolute location of the Tableau output directory, assembled from
# individual path components.
output_path = os.path.join(
    os.sep, "home", "jupyter", "Team-Prophecy",
    "Data", "03_output_for_tableau",
)
print(output_path)

/home/jupyter/Team-Prophecy/Data/03_output_for_tableau


In [229]:
# Open a SQLite connection to the database file at process_path.
# NOTE(review): nothing visible in this part of the notebook closes the connection - confirm it is closed later.
process_connection = sqlite3.connect(process_path)

In [230]:
# Column labels applied positionally to the rows returned by each SELECT *.
program_results_columns = [
    "semester", "program", "course_code", "sect_id",
    "number_of_students", "number_of_waitlisted_students", "random_students",
    "percentage_international", "percentage_international_waitlist", "total_size",
]
results_columns = [
    "rec_id", "semester", "program", "min_waitlisted_students",
    "proportional_size", "prior_proportional_size", "total_pop",
]

# Fetch every row of each table, then wrap the rows in a labelled DataFrame.
program_results_rows = process_connection.execute("SELECT * FROM program_results_table").fetchall()
program_results_table = pd.DataFrame(program_results_rows, columns=program_results_columns)

results_rows = process_connection.execute("SELECT * FROM results_table").fetchall()
results_table = pd.DataFrame(results_rows, columns=results_columns)

In [231]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from sklearn.metrics import mean_squared_error
import pydot
from sklearn.metrics import accuracy_score
import math

In [232]:
# Keep a subset of the loaded columns; random_students and
# percentage_international_waitlist are dropped here.
kept_columns = [
    "semester", "program", "course_code", "sect_id",
    "number_of_students", "number_of_waitlisted_students",
    "percentage_international", "total_size",
]
program_results_table = program_results_table.loc[:, kept_columns]

In [233]:
# Restrict the table to rows whose program is EC-MS-CS or EC-MS-DAEN.
selected_programs = ["EC-MS-CS", "EC-MS-DAEN"]
in_selected_programs = program_results_table["program"].isin(selected_programs)
program_results_table = program_results_table[in_selected_programs]

In [234]:
# Replace each listed count column with its absolute value, so any negative
# entries become positive.
# NOTE(review): presumably negative counts are data-entry artefacts - confirm against the upstream processing step.
for frame, count_column in (
    (program_results_table, "number_of_students"),
    (program_results_table, "number_of_waitlisted_students"),
    (results_table, "proportional_size"),
):
    frame[count_column] = frame[count_column].abs()

In [235]:
# Display the filtered section-level table as this cell's output.
program_results_table

Unnamed: 0,semester,program,course_code,sect_id,number_of_students,number_of_waitlisted_students,percentage_international,total_size
20,201770,EC-MS-CS,CS531,001,35,71,0.000000,36
21,201770,EC-MS-CS,CS531,003,35,43,0.000000,36
22,201770,EC-MS-CS,CS531,002,35,23,0.000000,36
23,201770,EC-MS-CS,CS531,DL2,35,7,0.000000,36
24,201770,EC-MS-CS,CS531,DL1,35,1,0.000000,36
...,...,...,...,...,...,...,...,...
5963,202310,EC-MS-DAEN,SYST573,001,3,0,0.886957,36
5964,202310,EC-MS-DAEN,SYST573,003,5,0,0.886957,36
5965,202310,EC-MS-DAEN,SYST573,002,10,0,0.886957,36
5966,202310,EC-MS-DAEN,SYST573,DL2,6,0,0.886957,36


In [236]:
# Display the program-level results table as this cell's output.
results_table

Unnamed: 0,rec_id,semester,program,min_waitlisted_students,proportional_size,prior_proportional_size,total_pop
0,1,201770,EC-MS-CEIE,161,507,292,4001
1,2,201770,EC-MS-CPE,84,425,292,4001
2,3,201770,EC-MS-CS,212,1233,292,4001
3,4,201770,EC-MS-DAEN,279,630,292,4001
4,5,201770,EC-MS-ELEN,43,219,292,4001
...,...,...,...,...,...,...,...
844,845,202310,EC-MS-ISYS,0,490,2841,4001
845,846,202310,EC-MS-OPRS,126,141,2841,4001
846,847,202310,EC-MS-SWE,0,307,2841,4001
847,848,202310,EC-MS-SYST,0,21,2841,4001


In [237]:
# Distinct semester values present in the filtered table, as a plain Python list.
semester_data = program_results_table["semester"].unique().tolist()

In [238]:
# For each categorical column, collect its distinct values (in order of first
# appearance) into a list; the position in that list later serves as the
# integer code for the value.
program_list, course_list, sect_list = (
    pd.Series(program_results_table[column_name].unique(), dtype="category").tolist()
    for column_name in ("program", "course_code", "sect_id")
)

In [239]:
# Map every distinct value to its position in the corresponding list.
program_dict = {program: code for code, program in enumerate(program_list)}
course_dict = {course: code for code, course in enumerate(course_list)}
sect_dict = {section: code for code, section in enumerate(sect_list)}

In [240]:
# Replace the values in each categorical column with their integer codes.
# The lookup raises KeyError for any value missing from its dictionary, so
# this cell cannot be re-run once the columns already hold codes.
for encoded_column, code_lookup in (
    ("program", program_dict),
    ("course_code", course_dict),
    ("sect_id", sect_dict),
):
    program_results_table[encoded_column] = program_results_table[encoded_column].transform(code_lookup.__getitem__)

In [241]:
# Disabled alternative: binarize the waitlist count instead of scaling it.
# (If re-enabled it must call np.where, since numpy is imported as np.)
#program_results_table["number_of_waitlisted_students"] = numpy.where(program_results_table["number_of_waitlisted_students"] > 0, 1, 0)

# Min-max scale both count columns onto a 0-100 range, rounded to whole numbers.
for count_column in ("number_of_waitlisted_students", "number_of_students"):
    raw_counts = program_results_table[count_column]
    lowest = raw_counts.min()
    highest = raw_counts.max()
    program_results_table[count_column] = round(100 * (raw_counts - lowest) / (highest - lowest))

This checks whether we can predict, from the feature (x) values, whether a section has any waitlisted students

NOTE: Accuracy is even worse for linear regression when there is less data

In [255]:
# Features: encoded program and course, scaled enrolment, international share.
x = program_results_table[["program", "course_code", "number_of_students", "percentage_international"]]
# Binary target: 1 where the (scaled) waitlist count is above zero, else 0.
y = (program_results_table["number_of_waitlisted_students"] > 0).to_numpy().astype(int)

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=5)
slr = LogisticRegression().fit(xtrain, ytrain)
# Print the score on the held-out rows.
print(slr.score(xtest, ytest))

0.9493006993006993


In [254]:
# Single feature: the scaled waitlist count.
x = program_results_table[["number_of_waitlisted_students"]]
# Binary target: 1 where the encoded program code is above zero, else 0.
y = (program_results_table["program"] > 0).to_numpy().astype(int)

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=5)
slr = LogisticRegression().fit(xtrain, ytrain)
# Print the score on the held-out rows.
print(slr.score(xtest, ytest))

0.5611888111888111


In [253]:
# Program-level regression: predict proportional_size from the semester code
# and the minimum number of waitlisted students.
x = results_table[["semester", "min_waitlisted_students"]]
y = results_table["proportional_size"]

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=5)
slr = LinearRegression().fit(xtrain, ytrain)
# Print the score on the held-out rows.
print(slr.score(xtest, ytest))

0.6250250192434477


In [245]:
# Section-level regression: predict the scaled enrolment from the semester
# code and the encoded course.
x = program_results_table[["semester", "course_code"]]
y = program_results_table["number_of_students"]

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=5)
slr = LinearRegression().fit(xtrain, ytrain)
# Print the score on the held-out rows.
print(slr.score(xtest, ytest))

0.17313623066727624


In [246]:
# Features: encoded program and course.
x = program_results_table[["program", "course_code"]]
# Binary target: 1 where the (scaled) enrolment is above zero, else 0.
y = (program_results_table["number_of_students"] > 0).to_numpy().astype(int)

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=5)
slr = LogisticRegression().fit(xtrain, ytrain)
# Print the score on the held-out rows.
print(slr.score(xtest, ytest))

0.9912587412587412


Here, I tackle class categorization using Random Forest Regression

In [247]:
# Regress the scaled number_of_students on the semester code and the encoded
# program and course, using a small random forest.
x = program_results_table.loc[:,["semester","program","course_code"]]
y = program_results_table["number_of_students"]

# Hold out 30% of the rows for evaluation, using a fixed seed.
xtrain, xtest,ytrain, ytest = train_test_split(x,y,test_size=0.3, random_state=5)

# Seed the forest as well as the split: without random_state the bootstrap
# samples differ on every run, so the printed error would not be reproducible.
rfr = RandomForestRegressor(max_depth=4,n_estimators=25,random_state=5)
rfr.fit(xtrain,ytrain)

pred_vals = rfr.predict(xtest)
# mean_squared_error takes (y_true, y_pred); the value is symmetric, but the
# conventional order keeps the call correct if the metric is ever swapped.
print(mean_squared_error(ytest,pred_vals))

114.46355246110399


In [248]:
# Export one tree of the fitted forest (index 3) to a Graphviz DOT file, then
# render that file to a PNG with pydot.
# feature_names is passed as a plain list rather than a pandas Index, which is
# the form accepted by every scikit-learn release.
export_graphviz(rfr.estimators_[3], out_file='TreeReg.dot', feature_names=list(xtrain.columns))
(grph, ) = pydot.graph_from_dot_file('TreeReg.dot')
grph.write_png('TreeReg.png')

Here, I re-examine Logistic Regression by identifying the different class categorizations