## Trying multi-class classification (OvR --> One vs. Rest)

In [None]:
#Step 1: imports
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, precision_recall_curve, auc, roc_auc_score, classification_report)
import matplotlib.pyplot as plt

In [None]:
#Step 2: Load dataset
# Read the tutoring CSV (relative path, so it is resolved against the notebook's
# working directory) into a DataFrame, then preview the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='tutoring_data.csv')
df.head(5)

Unnamed: 0,User_ID,Age_in_Months,Gender,Location,Grade,Logins_per_Month,Days_Completed_Activity,Exercises_Started,Total_Time_Spent_in_Minutes,Course_Name,...,Recommendation_Likelihood,Exercises_Completed,Points_Earned,Subscription_Tier,Subscription_Cost,Subscription_Length_in_Months,Renewal_Status,Tutoring,Referrals,Academic_Grade
0,654b113d-4ce4-41a9-a8f4-7f1419419230,156,Other,"Smithchester, VA",8th Grade,6,5,9.784359,107.889381,Chemistry,...,3,7,1910.292936,Free,0.0,4,Yes,Yes,0,D
1,2a044973-1d29-4b2f-83f6-c488290140bb,202,Female,"Beckside, FL",10th Grade,6,6,9.0,198.865171,Web Development,...,4,9,1698.767255,Free,0.0,1,Yes,No,0,F
2,d84bb18b-bd77-4be9-98bb-a0993b95af75,173,Other,"New Deborahborough, SD",9th Grade,7,4,12.159345,232.639784,Geometry,...,4,10,1860.295769,Premium,9.99,13,Yes,No,0,D
3,411cebf6-18cc-4846-89c7-f3f7bcaede01,199,Female,"West Stephanie, KY",12th Grade,17,17,28.0,506.644691,Pre-Calculus,...,3,28,4465.824908,Basic,5.99,11,Yes,No,1,F
4,e4ada708-10b6-4fcd-ac08-a83152658751,148,Female,"West Roberthaven, LA",7th Grade,10,8,15.458939,305.275706,Java Programming,...,4,17,2499.211664,Premium,9.99,12,Yes,No,0,F


In [None]:
#Step 3: Training and test set:

#1st: Extract the integer from Grade (e.g. '8th Grade' -> 8).
#  - r'(\d+)' is a raw string: '\d' inside a normal string literal is an invalid escape sequence.
#  - astype(str) keeps this cell safe to re-run: once Grade is numeric, the .str accessor
#    would otherwise fail on the second run.
#  - expand=False returns a Series (not a one-column DataFrame), and pd.to_numeric stores
#    real numbers instead of digit strings. Rows with no digits become NaN.
df['Grade'] = pd.to_numeric(
    df['Grade'].astype(str).str.extract(r'(\d+)', expand=False)
)

#2nd: Age_in_Months is deliberately left in months.
# DON'T divide it by 12 in place (df['Age_in_Months'] = df['Age_in_Months'].div(12)):
# every re-run of the cell would divide the age AGAIN, silently shrinking it each time.

#3rd: Det. x and y.
# Double brackets are needed when selecting more than one column; column names must match
# exactly (a stray trailing space such as 'Age_in_Months ' raises a KeyError).
x = df[['Age_in_Months', 'Grade', 'Points_Earned', 'Logins_per_Month']]
y = df['Referrals']

# 50/50 split; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

# Display the shapes of the resulting datasets to verify the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7421, 4), (7422, 4), (7421,), (7422,))

In [None]:
#Step 4: Create the Multi-class regression problem:

#4.1 Standardize the features first (don't forget), then fit a one-vs-rest logistic regression.
# OneVsRestClassifier wraps LogisticRegression instead of passing
# LogisticRegression(multi_class='ovr'), because that parameter is being discontinued
# in a future scikit-learn version (a library change, not a Python one).
modelName = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('LogReg', OneVsRestClassifier(LogisticRegression())),
])

#Before cross-validation: fit on the training split and predict the held-out split.
modelName_pred = modelName.fit(X_train, y_train).predict(X_test)

# NOTE: despite the "training" names and labels, both numbers below are measured on the
# TEST split (y_test vs. the predictions for X_test).
trainingAccuracyBeforeCV = accuracy_score(y_true=y_test, y_pred=modelName_pred)
trainingError = 1 - trainingAccuracyBeforeCV
print("Accuracy", trainingAccuracyBeforeCV)
print("Training errorBeforeCV", trainingError)


Accuracy 0.7025060630557801
Training errorBeforeCV 0.2974939369442199


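Pretty bad accuracy: only about 70% on the held-out test split. (Note: the value printed as "Training error" above is actually computed on the test set, not the training set.)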

In [None]:
#Try having other parameters that might be important:
# Candidate features: every numeric column except the target.
#  - Text columns (IDs, locations, categories, ...) are left out because StandardScaler
#    cannot scale strings.
#  - 'Unnamed: 0' is dropped if present (it looks like a saved row index -- TODO confirm);
#    errors='ignore' makes the drop a no-op when the column does not exist.
#  - Assumes the remaining numeric columns have no missing values -- TODO confirm,
#    since LogisticRegression does not accept NaN.
x_2ndversion = (
    df.drop(columns=['Referrals', 'Unnamed: 0'], errors='ignore')
      .select_dtypes(include='number')
)
y_2ndversion = df['Referrals']

# Fix: split the 2nd-version data. Previously x and y were passed here, so this
# "experiment" silently reused the first feature set and reproduced the same accuracy.
X_train_2ndversion, X_test_2ndversion, y_train_2ndversion, y_test_2ndversion = train_test_split(
    x_2ndversion, y_2ndversion, test_size=0.5, random_state=0
)

# Display the shapes of the resulting datasets to verify the split
X_train_2ndversion.shape, X_test_2ndversion.shape, y_train_2ndversion.shape, y_test_2ndversion.shape

((7421, 4), (7422, 4), (7421, 3), (7422, 3))

In [None]:
# Same recipe as the first model: standardize, then one-vs-rest logistic regression,
# trained and scored on the 2nd-version split.
modelName_2ndversion = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('LogReg', OneVsRestClassifier(LogisticRegression())),
])

#Before cross-validation: fit on the training split and predict the held-out split.
modelName_pred_2ndversion = (
    modelName_2ndversion
    .fit(X_train_2ndversion, y_train_2ndversion)
    .predict(X_test_2ndversion)
)

# NOTE: despite the "training" names and labels, these are test-split metrics.
trainingAccuracyBeforeCV_2ndversion = accuracy_score(
    y_true=y_test_2ndversion, y_pred=modelName_pred_2ndversion
)
trainingError_2ndversion = 1 - trainingAccuracyBeforeCV_2ndversion
print("Accuracy", trainingAccuracyBeforeCV_2ndversion)
print("Training errorBeforeCV", trainingError_2ndversion)

Accuracy 0.7025060630557801
Training errorBeforeCV 0.2974939369442199


In [None]:
#Try having other parameters that might be important:
# Third feature set: engagement-style columns only.
x_3rdversion = df[['Points_Earned', 'Logins_per_Month', 'Total_Time_Spent_in_Minutes', 'Exercises_Completed']]
y_3rdversion = df['Referrals']

# Fix: split the 3rd-version data. Previously x and y were passed here, so the
# new feature set was never actually used and the accuracy could not change.
X_train_3rdversion, X_test_3rdversion, y_train_3rdversion, y_test_3rdversion = train_test_split(
    x_3rdversion, y_3rdversion, test_size=0.5, random_state=0
)

# Display the shapes of the resulting datasets to verify the split
X_train_3rdversion.shape, X_test_3rdversion.shape, y_train_3rdversion.shape, y_test_3rdversion.shape

((7421, 4), (7422, 4), (7421,), (7422,))

In [None]:
# Standardize, then one-vs-rest logistic regression, trained and scored on the
# 3rd-version split.
modelName_3rdversion = Pipeline([
    ('scale', StandardScaler()),
    ('LogReg', OneVsRestClassifier(LogisticRegression()))
    ])

#Before cross-validation:
modelName_3rdversion.fit(X_train_3rdversion, y_train_3rdversion)
modelName_pred_3rdversion = modelName_3rdversion.predict(X_test_3rdversion)

# NOTE: despite the "training" names and labels, these are measured on the test split.
trainingAccuracyBeforeCV_3rdversion = accuracy_score(y_test_3rdversion, modelName_pred_3rdversion)
# Fix: report the 3rd-version accuracy (the 2nd-version variable was printed here by mistake).
print("Accuracy", trainingAccuracyBeforeCV_3rdversion)
trainingError_3rdversion = 1- trainingAccuracyBeforeCV_3rdversion
print("Training errorBeforeCV", trainingError_3rdversion)

Accuracy 0.7025060630557801
Training errorBeforeCV 0.2974939369442199


<br> <br>

Before doing forward selection, try printing the ROC curve:


<br>
<br> <br> <br>


In [None]:
#Check the AUC of the ROC curve.

#4.2 Now predict using our model:
modelName_pred = modelName.predict(X_test)
print("Accuracy", accuracy_score(y_test, modelName_pred))

# Recompute predicted probabilities from the fitted pipeline.
# Fix: this line used log_reg / X_test_amount, which are not defined anywhere in the
# visible notebook (NameError); the fitted model and test features are modelName / X_test.
# Column 1 of predict_proba is the probability of modelName.classes_[1] -- presumably
# Referrals == 1; confirm against modelName.classes_ before interpreting it.
y_pred_prob = modelName.predict_proba(X_test)[:, 1]

# Set a threshold of 0.5 for classifying positive labels.
# The result is a one-vs-rest 0/1 label for that single class, not a multi-class prediction.
threshold = 0.5
y_pred_label = (y_pred_prob >= threshold).astype(int)

In [None]:
#4.2 Cross-validation (during training, before even testing)

#To determine ideal CV K folds within reasonable limits (typical is 5 to 10)
CVidealK = [5, 7, 10, 15]
CVresults = {}  # fold count -> mean cross-validated accuracy on the training split
for i in CVidealK:
  # cv takes the fold count itself (i), NOT cv=CVidealK[i], which would index out of bounds.
  modelNameCV = cross_val_score(modelName, X_train, y_train, cv=i)
  CVresults[i] = modelNameCV.mean()

# Computed once after the loop instead of on every iteration.
maxCVAccuracy = max(CVresults.values())
# Fold count that achieved the best mean accuracy (first one wins on ties).
bestK = max(CVresults, key=CVresults.get)

print(CVresults)
print("maxCVAccuracy", maxCVAccuracy)

#Now use the ideal CV # of folds:
# Fix: the fold count is taken from the results above instead of being hard-coded to 15,
# so it stays correct if the data or the candidate list changes.
modelNamepredCV = cross_val_score(modelName, X_train, y_train, cv=bestK)
print(f"Mean cross-validation score: {modelNamepredCV.mean()}")

In [None]:
#4.2 Cross-validation (during training, before even testing)
# Score the pipeline on the training split for several fold counts
# (typical values are 5 to 10) and keep the mean accuracy of each.
CVidealK = [5, 7, 10, 15]
CVresults = {}
for i in CVidealK:
  # cv is the fold count itself; CVidealK[i] would be out of bounds.
  modelNameCV = cross_val_score(estimator=modelName, X=X_train, y=y_train, cv=i)
  CVresults[i] = np.mean(modelNameCV)

# Best mean accuracy among the fold counts tried.
maxCVAccuracy = max(CVresults.values())
print(CVresults)
print("maxCVAccuracy", maxCVAccuracy)

{5: 0.7013880040295135, 7: 0.7013879750324542, 10: 0.701387976913003, 15: 0.7013885140200927}
maxCVAccuracy 0.7013885140200927


"\nmodelName = LogisticRegression(multi_class='ovr')\nmodelName.fit(X_train, y_train)\n"

In [None]:
#4.2 Now predict using our model:
# Predict the held-out split with the fitted pipeline and report its accuracy.
modelName_pred = modelName.predict(X=X_test)
print("Accuracy", accuracy_score(y_true=y_test, y_pred=modelName_pred))

Accuracy 0.7025060630557801


## BELOW: more detailed version with rough work. Kept the polished portions above for presentation screenshotting.

In [None]:
#Step 4: Create the Multi-class regression problem:

#4.1 Need to normalize data, don't forget.
# The string below is the earlier attempt that used multi_class='ovr'; it is a bare
# string expression in the middle of the cell, so it is never executed.
'''
modelName = Pipeline([
    ('scale', StandardScaler()),
    ('LogReg', LogisticRegression(multi_class='ovr')) #This is optional parameter. If you don't select iter size, function will use default.
    ])
'''
# Current version: standardize, then one-vs-rest logistic regression with default settings.
modelName = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('LogReg', OneVsRestClassifier(LogisticRegression())),
])

#Before cross-validation: fit on the training split and predict the held-out split.
modelName_pred = modelName.fit(X_train, y_train).predict(X_test)

# NOTE: despite the "training" names and labels, these are measured on the test split.
trainingAccuracyBeforeCV = accuracy_score(y_true=y_test, y_pred=modelName_pred)
trainingError = 1 - trainingAccuracyBeforeCV
print("Accuracy", trainingAccuracyBeforeCV)
print("Training errorBeforeCV", trainingError)

# Abandoned idea, kept for reference: cross-validation yields a score, not a fitted
# model, so there is no separate "CV model" to predict with.
#modelNamePred = modelName.predict(X_test)
#print("Test accuracy using training model without cv", accuracy_score(y_test, modelNamePred))

In [None]:
#4.2 Cross-validation (during training, before even testing)

#To determine ideal CV K folds within reasonable limits (typical is 5 to 10)
CVidealK = [5, 7, 10, 15]
CVresults = {}  # fold count -> mean cross-validated accuracy on the training split
for i in CVidealK:
  # cv takes the fold count itself (i), NOT cv=CVidealK[i], which would index out of bounds.
  modelNameCV = cross_val_score(modelName, X_train, y_train, cv=i)
  CVresults[i] = modelNameCV.mean()

# Computed once after the loop instead of on every iteration.
maxCVAccuracy = max(CVresults.values())
# Fold count that achieved the best mean accuracy (first one wins on ties).
bestK = max(CVresults, key=CVresults.get)

print(CVresults)
print("maxCVAccuracy", maxCVAccuracy)

#Now use the ideal CV # of folds:
# Fix: the fold count is taken from the results above instead of being hard-coded to 15,
# so it stays correct if the data or the candidate list changes.
modelNamepredCV = cross_val_score(modelName, X_train, y_train, cv=bestK)
print(f"Mean cross-validation score: {modelNamepredCV.mean()}")

In [None]:

#Cross-validation
# Mean cross-validated accuracy on the training split for each candidate fold count.
CVidealK = [5, 7, 10, 15]
CVresults = {}
for i in CVidealK:
  # cv is the fold count itself; CVidealK[i] would be out of bounds.
  modelNameCV = cross_val_score(estimator=modelName, X=X_train, y=y_train, cv=i)
  CVresults[i] = np.mean(modelNameCV)

# Best mean accuracy among the fold counts tried.
maxCVAccuracy = max(CVresults.values())
print(CVresults)
print("maxCVAccuracy", maxCVAccuracy)

# The two triple-quoted blocks below are plain string expressions holding old rough work;
# nothing inside them runs. The last one is simply echoed as the cell's displayed value.
''' #ignore

modelNameCV = cross_validation.cross_val_score(modelName, X_train, y_train, cv=10) #A
#print(modelNameCV)
#print(modelNameCV.mean())

#
'''


'''
modelName = LogisticRegression(multi_class='ovr')
modelName.fit(X_train, y_train)
'''