In [None]:
##Import useful libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import statistics
import scipy
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

##Import scrubbed retention data from Rich Grant
#index_col=False tells pandas not to use the first CSV column as the row index
df_retention = pd.read_csv('Retention1923.csv', index_col = False)
df_retention

##I do not have permission to include the scrubbed data due to FERPA concerns

In [None]:
#"FFF" = first-year freshmen.  Cohorts 19FFF, 20FFF, 21FFF, 22FFF, and 23FFF are provided.
#Build the training/test dataframe from 21FFF and 22FFF only; 23FFF is the cohort to be predicted.
#19FFF and 20FFF were COVID years.  In previous work on 19FFF - 22FFF, training on the full data set,
#on other two-year combinations (for example:  19&20, 19&21,...), or on all four years produced
#substantially weaker models than training on 21&22 only.
df_retention_21 = df_retention.loc[df_retention['Cohort'].eq('21FFF')]
df_retention_22 = df_retention.loc[df_retention['Cohort'].eq('22FFF')]
both_frames = [df_retention_21, df_retention_22]
#stack the two cohorts row-wise: all 21FFF rows first, then all 22FFF rows (original index labels kept)
df_retention_21_22 = pd.concat(both_frames, axis=0)
df_retention_21_22


In [None]:
#list the column names of the combined 21FFF/22FFF dataframe
df_retention_21_22.columns

In [None]:
#remove the cohort label, birthdate, and project ID columns; none of them is used as a model input
df_retention_21_22.drop(['Cohort', 'Birthdate', 'Project ID'], axis='columns', inplace=True)
df_retention_21_22

In [None]:
#replace missing 'Campus Orgs' values with 0 (first step toward turning the column into a boolean)
df_retention_21_22['Campus Orgs'] = df_retention_21_22['Campus Orgs'].fillna(0)
df_retention_21_22

In [None]:
#change campus clubs from names to true/false
#presumably every club name is a non-empty string (-> True) and the 0 filled in for missing
#values becomes False -- the value counts in the next cell are the check
df_retention_21_22['Campus Orgs'] = df_retention_21_22['Campus Orgs'].astype('bool')
df_retention_21_22

In [None]:
#view data to verify previous transformation worked as intended:
#count each distinct 'Campus Orgs' value (NaN included) with pandas' row-display limit lifted
with pd.option_context('display.max_rows', None,):
    print(df_retention_21_22['Campus Orgs'].value_counts(dropna=False))

In [None]:
#view data to verify previous transformation worked as intended:
#print every row of the 'Campus Orgs' column (row-display limit lifted for this block only)
with pd.option_context('display.max_rows', None,):
    print(df_retention_21_22['Campus Orgs'])

In [None]:
#replace 'Retained' (Yes/No) with the target 'not_retained' (Yes -> False, No -> True), so the
#positive class is the student who is NOT retained
#map + assignment is used instead of the chained df['Retained'].replace(..., inplace=True):
#that form modifies an intermediate Series, which never reaches the dataframe once pandas
#Copy-on-Write is active, and it relies on deprecated silent downcasting of the result
retained_to_not_retained = {'Yes': False, 'No': True}
not_retained = df_retention_21_22['Retained'].map(retained_to_not_retained)
#fail early on blanks or unexpected labels rather than handing a mixed-type target to the model
if not_retained.isna().any():
    raise ValueError("'Retained' contains values other than 'Yes'/'No'")
#appended as the last column on purpose: the train/test split cell takes the final column as y
df_retention_21_22['not_retained'] = not_retained.astype(bool)
df_retention_21_22.drop(['Retained'], axis=1, inplace=True)
df_retention_21_22

In [None]:
#replace blank values for first term GPA with zeros (most likely incompletes across all classes)
#NOTE(review): after this step a missing GPA cannot be told apart from a genuine 0.0 GPA
df_retention_21_22['1st Term GPA'] = df_retention_21_22['1st Term GPA'].fillna(0)
df_retention_21_22

In [None]:
####Deleted cells with efforts to get native classification working with HistGradientBoostingClassifier
#Go back to an earlier version of this notebook to see those efforts

In [None]:
#Ordinal-encode the nominal columns; most of them are yes/no style categories.  In retrospect,
#one-hot encoding would have been the better choice for 'Ethnic Race Fed' and 'Res Status'.
category_columns = [
    'Ethnic Race Fed',
    'GENDER',
    'Pell',
    '1st Gen (No Bach)',
    'Res Status',
    'Athlete',
    'Greek',
    'Campus Orgs',
    'RC Job',
    'Academic Suspension (1=Yes, 0=No)',
]

#a category that was not present when the encoder was fitted is encoded as NaN at transform time
ord_enc = OrdinalEncoder(unknown_value=np.nan, handle_unknown='use_encoded_value')
df_retention_21_22_ord = ord_enc.fit_transform(df_retention_21_22.loc[:, category_columns])
df_retention_21_22_ord

In [None]:
#verify numpy array is expected size: one row per student, one column per entry in category_columns
df_retention_21_22_ord.shape

In [None]:
#pull GPA data out of dataframe to add to ordinally encoded data
#(display only -- the next cell selects the same column again before binning it)
GPA = df_retention_21_22['1st Term GPA']
GPA

In [None]:
#discretize first term GPA into 16 equal-width ordinal bins
KBins = KBinsDiscretizer(n_bins=16, encode='ordinal', strategy='uniform', random_state=42)

#the discretizer expects a 2-D (n_samples, n_features) array, so reshape GPA into a single column
GPA = df_retention_21_22['1st Term GPA'].values.reshape(-1, 1)

#fit the bin edges on the 21FFF/22FFF GPAs and convert each GPA to its bin index
GPA_KBins = KBins.fit_transform(GPA)


In [None]:
#type(GPA_KBins)

In [None]:
#verify my object is a numpy array (needed for the np.concatenate call in the next cell)
type(df_retention_21_22_ord)

In [None]:
#combine GPA data with ordinally encoded data
#column 0 is the binned GPA; category_columns[i] therefore ends up in column i + 1
df_retention_21_22_ord_wGPA = np.concatenate((GPA_KBins, df_retention_21_22_ord), axis=1)
df_retention_21_22_ord_wGPA

In [None]:
#sanity check: the combined array should have one more column (binned GPA) than the encoded array
df_retention_21_22_ord_wGPA.shape

In [None]:
#features: the encoded array (binned GPA + ordinal categories)
#target: the last dataframe column, which is expected to be 'not_retained' (it was appended last)
X = df_retention_21_22_ord_wGPA
y = df_retention_21_22.iloc[:, -1]

#hold out 20% of the rows as the test set; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
#cross validate using best determined hyperparameters from last build of model.  GridSearch used in automating 
#previous search for hyperparameters

#categorical_features=[1,5]: column 0 of X is the binned GPA, so indices 1 and 5 are
#'Ethnic Race Fed' and 'Res Status'; every other column is treated as ordered/numeric
hgb_clf = HistGradientBoostingClassifier(max_depth=3, categorical_features=[1,5], random_state=42,learning_rate=0.02, 
                                         max_iter=250)

#5-fold cross validation on the training split only; 'recall' is scored on the positive class,
#i.e. the share of not-retained students that the model catches
GBC_21_22_result = cross_val_score(hgb_clf, X_train, y_train, cv=5, scoring='recall', n_jobs=-1)
#GBC_21_22 [0.79487179 0.92307692 0.79487179 0.8974359  0.94871795]
#[0.74358974 0.84615385 0.71794872 0.82051282 0.94871795]
print(statistics.mean(GBC_21_22_result))
print(np.std(GBC_21_22_result))
#mean recall 0.856
#std 0.050
#mean and std above correspond to recall for an algorithm and hyperparameter combination

In [None]:
#playing around with what gets considered as a categorical feature.  Concerned about curse of dimensionality here.

#hgb_clf = HistGradientBoostingClassifier(max_depth=3, random_state=42, learning_rate=0.02, max_iter=250)
#mean recall = 0.8256410256410256
#std recall = 0.07142763219068778

#hgb_clf = HistGradientBoostingClassifier(max_depth=3, random_state=42, categorical_features=[1,2,3,4,5,6,7,8,9,10],learning_rate=0.02, 
                                        # max_iter=250)
#mean recall =  0.8153846153846154   
#std recall = 0.08173014077184217

#dropped 8 (clubs, there are a lot of them)
#hgb_clf = HistGradientBoostingClassifier(max_depth=3, random_state=42, categorical_features=[1,2,3,4,5,6,7,9,10],learning_rate=0.02, 
 #                                        max_iter=250)
#0.8205128205128205
#0.07777308147745178    

#treated only GPA as a categorical feature, changing the number of categories (bins).  This was fast due to the small 
#dataset.  Manually changed the number of bins rather than using GridSearch.

#hgb_clf = HistGradientBoostingClassifier(max_depth=3, random_state=42, categorical_features=[0],learning_rate=0.02, 
                                       #  max_iter=250)
#0.8717948717948718
#0.06486723405473602


#5 bins
#0.8358974358974359
#0.06607230116269294

#4 bins
#0.841025641025641
#0.06153846153846155

#3 bins
#0.841025641025641
#0.06363935203072234

#10 bins
#0.8461538461538461
#0.06067774136512425

#20 bins
#0.8512820512820513
#0.05475424744631442

#30 bins
#0.8461538461538461
#0.05847053462046862

#15 bins
#0.8512820512820513
#0.05475424744631442

#25 bins
#0.8461538461538461
#0.06067774136512425

#14 bins
#0.8461538461538461
#0.05847053462046862

#16 bins  <---best with lowest # bins
#0.8564102564102564
#0.050245943441706215

#17 bins
#0.8564102564102564
#0.050245943441706215

#18 bins
#0.8512820512820513
#0.05475424744631442


In [None]:
def evaluate(y, y_hat):
    """Print accuracy, precision, recall, F1 and the confusion matrix for a set of predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    y_hat : array-like
        Predicted labels, aligned element-for-element with ``y``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    #compute the confusion matrix once and reuse it for the final print
    cm_results = confusion_matrix(y, y_hat)
    print('accuracy', accuracy_score(y, y_hat))
    print('precision', precision_score(y, y_hat))
    print('recall', recall_score(y, y_hat))
    print('f1', f1_score(y, y_hat))
    print('confusion matrix', cm_results)

In [None]:
#fit on the training split, then score the predictions for the held-out test split
y_hat = hgb_clf.fit(X_train, y_train).predict(X_test)
evaluate(y_test, y_hat)

In [None]:
#time to predict using the 23FFF data
#.copy() makes df_retention_23 an independent dataframe, so the in-place drop and the column
#assignments in the following cells do not raise pandas' SettingWithCopyWarning (without it
#they operate on a filtered slice of df_retention)
df_retention_23 = df_retention[df_retention['Cohort'] == '23FFF'].copy()
df_retention_23

In [None]:
#save the 'Project ID' column before it is dropped; this is a Series (not a dataframe) that
#keeps the dataframe index, so the index / Project ID association is preserved
ind_ID_23 = df_retention_23['Project ID']
ind_ID_23

In [None]:
#drop project ID, cohort, and birthdate columns
#The result is reassigned instead of using inplace=True.  An in-place drop on a dataframe that
#was produced by filtering df_retention is what triggered the SettingWithCopyWarning previously
#seen here; the same operation on the 21/22 dataframe did not warn because pd.concat had already
#returned a brand-new dataframe.
df_retention_23 = df_retention_23.drop(columns=['Cohort', 'Birthdate', 'Project ID'])
df_retention_23

In [None]:
#change NaNs to zeros for campus orgs.  This is the first part of changing this to a boolean
#(same preprocessing as was applied to the 21FFF/22FFF data)
df_retention_23['Campus Orgs'] = df_retention_23['Campus Orgs'].fillna(0)
df_retention_23

In [None]:
#make campus organizations into a boolean.  Any campus org will become True; the 0 filled in
#for missing values becomes False.
df_retention_23['Campus Orgs'] = df_retention_23['Campus Orgs'].astype('bool')
df_retention_23

In [None]:
#'Retained' is what we are predicting for the end of spring term.  The drop below is disabled,
#so the column stays in the dataframe; it is not passed to the model because only
#category_columns and '1st Term GPA' are selected as features in the cells that follow.
#df_retention_23.drop(['Retained'], axis=1, inplace=True)
df_retention_23

In [None]:
#ordinal encode everything except GPA
#transform only: this reuses the encoder fitted on the 21FFF/22FFF data, so categories that
#appear only in 23FFF are encoded as NaN (unknown_value=np.nan)
df_retention_23_ord = ord_enc.transform(df_retention_23[category_columns])
df_retention_23_ord

In [None]:
#treat a missing first term GPA as 0, exactly as was done for the training cohorts
df_retention_23['1st Term GPA'] = df_retention_23['1st Term GPA'].fillna(0)

#reshape to a single column and assign each GPA to one of the 16 bins; the bin edges are the
#ones already fitted on the 21FFF/22FFF data (transform only, no refit)
GPA_23 = df_retention_23['1st Term GPA'].values.reshape(-1, 1)
GPA_KBins_23 = KBins.transform(GPA_23)

In [None]:
#concatenate GPA and rest of data
#same column order as the training array: binned GPA first, then the encoded category columns
df_retention_23_ord_wGPA = np.concatenate((GPA_KBins_23, df_retention_23_ord), axis=1)
df_retention_23_ord_wGPA

In [None]:
#predict results on new data
#refit on all 21FFF/22FFF rows first (X and y are the full, unsplit features and target)
hgb_clf.fit(X,y)
predictions_23FFF = hgb_clf.predict(df_retention_23_ord_wGPA)
predictions_23FFF


In [None]:
#convert the saved Project ID Series to a numpy array so it can be joined with the predictions
ID_23 = ind_ID_23.to_numpy()
ID_23

In [None]:
#reshape both 1-D arrays into single-column 2-D arrays so they can be concatenated side by side
ID_23 = ID_23.reshape(-1,1)
predictions_23FFF = predictions_23FFF.reshape(-1,1)

In [None]:
#combine predictions and IDs together in one array
#NOTE(review): np.concatenate stores both columns in one array with a single common dtype, so
#the boolean predictions may be converted (e.g. to 0/1 if the IDs are integers) -- confirm the
#resulting dtype is what the '== 0' comparison further down expects

ID_pred_23 = np.concatenate((ID_23, predictions_23FFF), axis=1)
ID_pred_23

In [None]:
#create dataframe of predictions and IDs
#column 0 of the combined array is the Project ID, column 1 is the predicted not-retained flag
predictions_23 = pd.DataFrame(ID_pred_23, columns=['Project ID', 'Not Retained'])
predictions_23

In [None]:
#Create 'retained' column with true and false for clarity
#'Retained' is the logical opposite of the prediction: True where 'Not Retained' equals 0/False
predictions_23['Retained'] = (predictions_23['Not Retained']==0)
predictions_23

In [None]:
#keep only 'Project ID' and 'Retained' in the output; the raw 'Not Retained' flag is now redundant
predictions_23.drop('Not Retained', axis='columns', inplace=True)
predictions_23

In [None]:
#write the Project ID / Retained predictions to a CSV file, without the dataframe index column
predictions_23.to_csv('predictions_23FFF.csv', index=False)

In [None]:
#count how many 23FFF students are predicted to be retained (True) versus not retained (False)
predictions_23['Retained'].value_counts()

In [None]:
#data set is small, so determine 'accuracy', 'recall', and 'precision' for the 
#algorithm/preprocessing/hyperparameter combination

#5-fold cross-validated accuracy on the training split: per-fold scores, then their mean and std
GBC_21_22_result = cross_val_score(hgb_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(GBC_21_22_result)
print(statistics.mean(GBC_21_22_result))
print(np.std(GBC_21_22_result))

In [None]:
#5-fold cross-validated recall on the training split (same call as the recall cross validation
#run before the test-set evaluation): per-fold scores, then their mean and std
GBC_21_22_result = cross_val_score(hgb_clf, X_train, y_train, cv=5, scoring='recall', n_jobs=-1)
print(GBC_21_22_result)
print(statistics.mean(GBC_21_22_result))
print(np.std(GBC_21_22_result))

In [None]:
#5-fold cross-validated precision on the training split: per-fold scores, then their mean and std
GBC_21_22_result = cross_val_score(hgb_clf, X_train, y_train, cv=5, scoring='precision', n_jobs=-1)
print(GBC_21_22_result)
print(statistics.mean(GBC_21_22_result))
print(np.std(GBC_21_22_result))