In [None]:
#Step 1: imports
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import datasets,metrics
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, precision_recall_curve, auc, roc_auc_score, classification_report)
import matplotlib.pyplot as plt

In [None]:
#Step 2: Load dataset
# The first CSV column is the row index that was written out when the file was saved
# (it shows up as an 'Unnamed: 0' column otherwise), so read it back as the index
# instead of keeping it as a spurious data column.
df = pd.read_csv('tutoring_data.csv', index_col=0)
df.head()

Unnamed: 0,User_ID,Age_in_Months,Gender,Location,Grade,Logins_per_Month,Days_Completed_Activity,Exercises_Started,Total_Time_Spent_in_Minutes,Course_Name,...,Recommendation_Likelihood,Exercises_Completed,Points_Earned,Subscription_Tier,Subscription_Cost,Subscription_Length_in_Months,Renewal_Status,Tutoring,Referrals,Academic_Grade
0,654b113d-4ce4-41a9-a8f4-7f1419419230,156,Other,"Smithchester, VA",8th Grade,6,5,9.784359,107.889381,Chemistry,...,3,7,1910.292936,Free,0.0,4,Yes,Yes,0,D
1,2a044973-1d29-4b2f-83f6-c488290140bb,202,Female,"Beckside, FL",10th Grade,6,6,9.0,198.865171,Web Development,...,4,9,1698.767255,Free,0.0,1,Yes,No,0,F
2,d84bb18b-bd77-4be9-98bb-a0993b95af75,173,Other,"New Deborahborough, SD",9th Grade,7,4,12.159345,232.639784,Geometry,...,4,10,1860.295769,Premium,9.99,13,Yes,No,0,D
3,411cebf6-18cc-4846-89c7-f3f7bcaede01,199,Female,"West Stephanie, KY",12th Grade,17,17,28.0,506.644691,Pre-Calculus,...,3,28,4465.824908,Basic,5.99,11,Yes,No,1,F
4,e4ada708-10b6-4fcd-ac08-a83152658751,148,Female,"West Roberthaven, LA",7th Grade,10,8,15.458939,305.275706,Java Programming,...,4,17,2499.211664,Premium,9.99,12,Yes,No,0,F


In [None]:
#Step 3: Training and test set:

#1st: Extract the integer grade level from the Grade text (e.g. '8th Grade' -> 8).
# - The pattern is a raw string so '\d' is not treated as an (invalid) string escape.
# - expand=False makes str.extract return a Series (one capture group) rather than a DataFrame.
# - astype(str) keeps the cell re-runnable after Grade has already been converted to numbers.
# - pd.to_numeric turns the extracted digit strings into real numbers, so the model receives
#   a numeric column instead of an object column of strings. Rows without digits become NaN.
df['Grade'] = pd.to_numeric(df['Grade'].astype(str).str.extract(r'(\d+)', expand=False))

#2nd: Det. x and y. (Need double brackets when selecting >1 col.)
x = df[['Age_in_Months', 'Grade', 'Points_Earned', 'Logins_per_Month']]
y = df['Referrals']

#3rd: Hold out half of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

# Display the shapes of the resulting datasets to verify the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7421, 4), (7422, 4), (7421,), (7422,))

In [None]:
#Step 4: Create the multi-class classification model (one-vs-rest logistic regression):

#4.1 Need to normalize data, so the scaler and the classifier are chained in one pipeline:
modelName = Pipeline([
    ('scale', StandardScaler()),
    ('LogReg', OneVsRestClassifier(LogisticRegression())) #No arguments passed, so LogisticRegression uses its defaults (e.g. the default max_iter).
    ])

modelName.fit(X_train, y_train)
modelName_pred = modelName.predict(X_test)
#Keep the full probability matrix (one column per class, ordered like modelName.classes_).
#Slicing out a single column here would make the per-class indexing below impossible.
modelName_pred_prob = modelName.predict_proba(X_test)

#NOTE: the variable names say "training", but both numbers are computed on the held-out
#test split (X_test / y_test). The names are kept so other cells that use them still work.
trainingAccuracyBeforeCV = accuracy_score(y_test, modelName_pred)
print("Accuracy", trainingAccuracyBeforeCV)
trainingError = 1- trainingAccuracyBeforeCV
print("Test errorBeforeCV", trainingError)


#4.2 One precision-recall curve per class (one-vs-rest).
#Binarize the *test* labels (not the full y) into an indicator matrix whose columns line up
#with the columns of modelName_pred_prob. y itself is left untouched.
classes = modelName.classes_
y_test_bin = label_binarize(y_test, classes=classes)
if y_test_bin.shape[1] == 1:
    #label_binarize returns a single column when there are only two classes; expand it to two.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

precision = dict()
recall = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test_bin[:, i], modelName_pred_prob[:, i])

fig, ax = plt.subplots()
for i in range(n_classes):
    ax.plot(recall[i], precision[i], lw=2, label=f'Class {classes[i]}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve for Multiclass')
ax.legend(loc='best')
plt.show()

# Can't do the following for multiclass (precision_recall_curve expects binary targets),
# which is why the curves above are drawn per class:
#
# confusion_matrix(y_test, modelName_pred)
# precision, recall, _ = precision_recall_curve(y, modelName_pred)
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision-Recall Curve')
# plt.show()


Accuracy 0.7025060630557801
Training errorBeforeCV 0.2974939369442199


KeyError: 'key of type tuple not found and not a MultiIndex'

In [None]:
# Backward sequential feature selection over a wider set of candidate columns.
# Starts from all seven features and removes one at a time, scoring each subset
# by 15-fold cross-validated accuracy of the modelName pipeline.
x = df[[
    'Age_in_Months',
    'Grade',
    'Points_Earned',
    'Logins_per_Month',
    'Exercises_Started',
    'Exercises_Completed',
    'Days_Completed_Activity',
]]
y = df['Referrals']

# Same 50/50 split settings as before, now over the wider feature frame.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

# fit() returns the selector itself, so construction and fitting can be chained.
backWardFS = SequentialFeatureSelector(
    modelName,
    k_features='best',  # keep whichever subset size scores best
    forward=False,      # backward elimination
    verbose=2,          # log the score after each removal step
    scoring='accuracy',
    cv=15,
).fit(X_train, y_train)


[2024-12-01 00:55:44] Features: 6/1 -- score: 0.7013885140200927
[2024-12-01 00:55:51] Features: 5/1 -- score: 0.7013885140200927
[2024-12-01 00:55:57] Features: 4/1 -- score: 0.7013885140200927
[2024-12-01 00:56:01] Features: 3/1 -- score: 0.7013885140200927
[2024-12-01 00:56:07] Features: 2/1 -- score: 0.7013885140200927
[2024-12-01 00:56:08] Features: 1/1 -- score: 0.7013885140200927

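So, we can see that feature selection isn't helping to narrow down to a model with good accuracy: the cross-validated accuracy is identical (about 0.701) at every step shown in the log, whichever feature is removed. Identical scores suggest the predictions barely depend on the features at all (for example, the model may be predicting the most common `Referrals` class every time), so the class balance of `Referrals` is worth checking.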

In [None]:
#Visualize the distribution of the predicted probability for column 1 of predict_proba
#(i.e. the class modelName.classes_[1]).
# - The split variable is `X_test`; `x_test` is never defined and raised a NameError.
# - X_test may hold more columns than modelName was fitted on (the feature-selection cell
#   re-splits a wider frame), so select exactly the columns the pipeline saw during fit.
y_prob = modelName.predict_proba(X_test[list(modelName.feature_names_in_)])[:,1]

fig, ax = plt.subplots()
ax.hist(y_prob, bins=30)
ax.set_xlabel(f'Predicted probability of class {modelName.classes_[1]}')
ax.set_ylabel('Number of test rows')
ax.set_title('Distribution of predicted probabilities')
plt.show()

In [None]:
# DISABLED CODE: this cell contains only two bare triple-quoted strings, so nothing below
# is executed. Both strings hold draft ROC-curve code that was switched off.
#
# Draft 1: binarize the labels, score the test set with decision_function, then compute a
# per-class and a micro-average ROC curve and plot the curve for class index 2.
# NOTE(review): as written it indexes y_test[:, i], but y_test comes from train_test_split
# on a pandas Series and is not the binarized array (y_bin is built from the full y and is
# never used) -- this matches the KeyError recorded in the notebook output. TODO confirm.
'''
# Binarize the output labels for ROC curve
y_bin = label_binarize(y, classes=[0, 1, 2, 3])
n_classes = y_bin.shape[1]

y_score = modelName.decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
'''

# Draft 2: per-class ROC curves plus a macro-average curve (mean of the per-class curves
# interpolated onto a common set of false-positive rates), drawn as two figures.
# NOTE(review): it relies on y_score and n_classes, which are only assigned inside the
# disabled draft above, and on `interp` and `cycle`, which are not imported in the visible
# import cell. It also has the same y_test[:, i] indexing problem as draft 1.
# Because this string is the last expression of the cell, the notebook will presumably echo
# its text as the cell output when the cell is run.
'''
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot ROC curve for each class
plt.figure()
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Plot macro-average ROC curve
plt.figure()
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Overall ROC')
plt.legend(loc="lower right")
plt.show()
'''

KeyError: 'key of type tuple not found and not a MultiIndex'

### ROUGH WORK

In [None]:
# Rough work: repeat the backward feature-selection run on the seven candidate columns.
# Predictors are the seven usage/demographic columns; the target is the Referrals column.
x = df[['Age_in_Months', 'Grade', 'Points_Earned', 'Logins_per_Month',
        'Exercises_Started', 'Exercises_Completed', 'Days_Completed_Activity']]
y = df['Referrals']

# Half of the rows go to the test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.5
)

# Backward elimination (forward=False) scored by 15-fold cross-validated accuracy;
# k_features='best' keeps the best-scoring subset size, verbose=2 logs every step.
backWardFS = SequentialFeatureSelector(
    modelName,
    forward=False,
    k_features='best',
    scoring='accuracy',
    cv=15,
    verbose=2,
)
backWardFS = backWardFS.fit(X_train, y_train)