# Random Forrest Classifier

## The overall video ratings for `Observer`(V1), `Strategy`(V2) and `MVC`(V3) patterns will be classified separately based on each learning style dimension scores below...
* A/R_Score
* S/I_Score
* Vi/Vb_Score
* S/G_Score

The classifiers were trained using `GridSearchCV`, a hyperparamater tuning technique, and the training set was split into `10-fold cross validation`. The average scores were collected after each classifier was 'train test split' a few times (various random states) with train-test.

# Importing the necessary libraries

In [12]:
### For data manipulation and visualisation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### To Encode the data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

### Machine Learning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE,RandomOverSampler

# Reading the dataset

In [13]:
pd.set_option('display.max_columns', None)
training = pd.read_csv('BDP_Cleaned.csv') #dataset after feature selection process for 5-Level Classification

# Dropping all irrelevant features
* `Time_Day_Watched` of all design patterns as features have already been extracted
* `Gender` as male and females are imbalanced and does not provide any useful insights
* `Duration` of each video as watch status and percentage duration have already been extracted from it

In [14]:
training.drop(['Time_Day_Watched_(V1-1)',
         'Time_Day_Watched_(V1-2)',
         'Time_Day_Watched_(V1-3)',
         'Time_Day_Watched_(V1-4)',
         'Time_Day_Watched_(V2-1)',
         'Time_Day_Watched_(V2-2)',
         'Time_Day_Watched_(V2-3)',
         'Time_Day_Watched_(V2-4)',
         'Time_Day_Watched_(V3-1)',
         'Time_Day_Watched_(V3-2)',
         'Time_Day_Watched_(V3-3)',
         'Time_Day_Watched_(V3-4)',
               'Gender',
               "V1-1_(1)",
               "V1-2_(1)",
               "V1-3_(1)",
               "V1-4_(1)",
               "V2-1_(2)",
               "V2-1_(2)",
               "V2-2_(2)",
               "V2-3_(2)",
               "V2-4_(2)",
               "V3-1_(3)",
               "V3-2_(3)",
               "V3-3_(3)",
               "V3-4_(3)",
               
        ], axis = 1,inplace = True)

# Getting numerical and categorical features for further tuning scaling and fitting

In [15]:
numerical_feats = training.dtypes[training.dtypes != "object"].index
categorical_feats = training.dtypes[training.dtypes == "object"].index

In [16]:
print("*"*100)
print(training[numerical_feats].columns)
print("*"*100)
print(training[categorical_feats].columns)
print("*"*100)
training.columns

****************************************************************************************************
Index(['V1_PercentWatched', 'V2_PercentWatched', 'V3_PercentWatched',
       'A/R_Score', 'Active', 'Reflective', 'S/I_Score', 'Sensing',
       'Intuitive', 'Vi/Vb_Score', 'Visual', 'Verbal', 'S/G_Score',
       'Sequential', 'Global', 'Quiz'],
      dtype='object')
****************************************************************************************************
Index(['CGPA_Class', 'Student_Rating_(V1-1)', 'Student_Rating_(V1-2)',
       'Student_Rating_(V1-3)', 'Student_Rating_(V1-4)', 'Overall_Rating_V1',
       'Watch_Status_V1', 'Student_Rating_(V2-1)', 'Student_Rating_(V2-2)',
       'Student_Rating_(V2-3)', 'Student_Rating_(V2-4)', 'Overall_Rating_V2',
       'Watch_Status_V2', 'Student_Rating_(V3-1)', 'Student_Rating_(V3-2)',
       'Student_Rating_(V3-3)', 'Student_Rating_(V3-4)', 'Overall_Rating_V3',
       'Watch_Status_V3', 'part_day_V1-1', 'part_day_V1-2', 'part_day_V1-

Index(['CGPA_Class', 'Student_Rating_(V1-1)', 'Student_Rating_(V1-2)',
       'Student_Rating_(V1-3)', 'Student_Rating_(V1-4)', 'Overall_Rating_V1',
       'Watch_Status_V1', 'V1_PercentWatched', 'Student_Rating_(V2-1)',
       'Student_Rating_(V2-2)', 'Student_Rating_(V2-3)',
       'Student_Rating_(V2-4)', 'Overall_Rating_V2', 'Watch_Status_V2',
       'V2_PercentWatched', 'Student_Rating_(V3-1)', 'Student_Rating_(V3-2)',
       'Student_Rating_(V3-3)', 'Student_Rating_(V3-4)', 'Overall_Rating_V3',
       'Watch_Status_V3', 'V3_PercentWatched', 'A/R_Score', 'Active',
       'Reflective', 'S/I_Score', 'Sensing', 'Intuitive', 'Vi/Vb_Score',
       'Visual', 'Verbal', 'S/G_Score', 'Sequential', 'Global', 'Quiz',
       'part_day_V1-1', 'part_day_V1-2', 'part_day_V1-3', 'part_day_V1-4',
       'part_day_V2-1', 'part_day_V2-2', 'part_day_V2-3', 'part_day_V2-4',
       'part_day_V3-1', 'part_day_V3-2', 'part_day_V3-3', 'part_day_V3-4'],
      dtype='object')

# Using MinMax Scaler to scale data from 0 to 1 since numerical features are not normally distributed

In [17]:
scaler = MinMaxScaler()
training['Quiz'] = scaler.fit_transform(training[['Quiz']])
training['A/R_Score'] = scaler.fit_transform(training[['A/R_Score']])
training['S/I_Score'] = scaler.fit_transform(training[['S/I_Score']])
training['Vi/Vb_Score'] = scaler.fit_transform(training[['Vi/Vb_Score']])
training['S/G_Score'] = scaler.fit_transform(training[['S/G_Score']])
training['V1_PercentWatched'] = scaler.fit_transform(training[['V1_PercentWatched']])
training['V2_PercentWatched'] = scaler.fit_transform(training[['V2_PercentWatched']])
training['V3_PercentWatched'] = scaler.fit_transform(training[['V3_PercentWatched']])
training.head()

Unnamed: 0,CGPA_Class,Student_Rating_(V1-1),Student_Rating_(V1-2),Student_Rating_(V1-3),Student_Rating_(V1-4),Overall_Rating_V1,Watch_Status_V1,V1_PercentWatched,Student_Rating_(V2-1),Student_Rating_(V2-2),Student_Rating_(V2-3),Student_Rating_(V2-4),Overall_Rating_V2,Watch_Status_V2,V2_PercentWatched,Student_Rating_(V3-1),Student_Rating_(V3-2),Student_Rating_(V3-3),Student_Rating_(V3-4),Overall_Rating_V3,Watch_Status_V3,V3_PercentWatched,A/R_Score,Active,Reflective,S/I_Score,Sensing,Intuitive,Vi/Vb_Score,Visual,Verbal,S/G_Score,Sequential,Global,Quiz,part_day_V1-1,part_day_V1-2,part_day_V1-3,part_day_V1-4,part_day_V2-1,part_day_V2-2,part_day_V2-3,part_day_V2-4,part_day_V3-1,part_day_V3-2,part_day_V3-3,part_day_V3-4
0,2.67 - 3.32,Excellent,Satisfactory,Very Good,Very Good,Very Good,Completed,0.747748,Very Good,Satisfactory,Very Good,Very Good,Very Good,Completed,0.440789,Excellent,Very Good,Very Good,Very Good,Very Good,Completed,0.632653,0.2,1,0,0.25,1,0,0.6,1,0,0.0,1,0,1.0,night,night,night,night,night,night,night,night,night,night,night,night
1,2.67 - 3.32,Very Good,Very Good,Very Good,Very Good,Very Good,Completed,0.747748,Very Good,Very Good,Very Good,Very Good,Very Good,Completed,0.440789,Very Good,Very Good,Very Good,Very Good,Very Good,Rewatched,0.653061,0.2,0,1,1.0,1,0,0.6,1,0,0.2,0,1,0.75,midnight,night,night,night,night,night,night,night,night,night,night,night
2,2.67 - 3.32,Excellent,Satisfactory,Very Good,Satisfactory,Very Good,Completed,0.747748,Very Good,Satisfactory,Very Good,Excellent,Very Good,Completed,0.440789,Excellent,Very Good,Very Good,Very Good,Very Good,Completed,0.632653,0.0,0,1,0.75,1,0,0.6,1,0,0.2,0,1,0.5,night,night,afternoon,afternoon,night,night,afternoon,afternoon,night,night,afternoon,afternoon
3,3.67 - 4.00,Excellent,Very Good,Excellent,Excellent,Excellent,Completed,0.747748,Excellent,Excellent,Very Good,Excellent,Excellent,Rewatched,0.618421,Excellent,Excellent,Very Good,Excellent,Excellent,Completed,0.632653,0.2,0,1,0.25,1,0,0.6,1,0,0.0,1,0,0.375,midnight,midnight,midnight,afternoon,midnight,midnight,midnight,midnight,midnight,midnight,midnight,midnight
4,2.67 - 3.32,Very Good,Very Good,Satisfactory,Very Good,Very Good,Completed,0.747748,Excellent,Very Good,Very Good,Satisfactory,Very Good,Completed,0.440789,Very Good,Very Good,Excellent,Very Good,Very Good,Completed,0.632653,0.2,1,0,0.0,1,0,0.4,1,0,0.2,0,1,0.75,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon,afternoon


# Encoding categorical values using label encoder

In [18]:
features =  ["CGPA_Class",
             "Overall_Rating_V1",
             "Overall_Rating_V2",
             "Overall_Rating_V3",
             "Watch_Status_V1",
             "Watch_Status_V2",
             "Watch_Status_V3",
             "part_day_V1-1",
             "part_day_V1-2",
             "part_day_V1-3",
             "part_day_V1-4",
             "part_day_V2-1",
             "part_day_V2-2",
             "part_day_V2-3",
             "part_day_V2-4",
             "part_day_V3-1",
             "part_day_V3-2",
             "part_day_V3-3",
             "part_day_V3-4"]

for i in features:
    training.loc[:,i] = LabelEncoder().fit_transform(training.loc[:,i]) 

### Overall_rating_V1 `(Observer)` for `Active/Reflective` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [19]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V1']]
X = training[['CGPA_Class',
              'Watch_Status_V1',
              'A/R_Score','Quiz','part_day_V1-1', 'part_day_V1-2', 'part_day_V1-3', 'part_day_V1-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_A = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_A

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.707317,0.695708,0.707317,0.697466,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,0.5,69,0.650407,0.61646,0.650407,0.617468,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,0.5,101,0.715447,0.704101,0.715447,0.703891,"{'criterion': 'entropy', 'max_features': 'log2..."
3,0.4,7,0.734694,0.73165,0.734694,0.712596,"{'criterion': 'gini', 'max_features': 'auto', ..."
4,0.4,69,0.785714,0.8058,0.785714,0.768513,"{'criterion': 'gini', 'max_features': 'log2', ..."
5,0.4,101,0.72449,0.737059,0.72449,0.715724,"{'criterion': 'entropy', 'max_features': 'auto..."
6,0.3,7,0.702703,0.674324,0.702703,0.675709,"{'criterion': 'entropy', 'max_features': 'sqrt..."
7,0.3,69,0.810811,0.801031,0.810811,0.799142,"{'criterion': 'gini', 'max_features': 'auto', ..."
8,0.3,101,0.72973,0.746335,0.72973,0.723371,"{'criterion': 'gini', 'max_features': 'log2', ..."
9,0.2,7,0.693878,0.639366,0.693878,0.648102,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [20]:
df_A.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.795918,0.784621,0.795918,0.776799
0.3,0.747748,0.740563,0.747748,0.732741
0.4,0.748299,0.75817,0.748299,0.732278
0.5,0.691057,0.67209,0.691057,0.672942


### Overall_rating_V1 `(Observer)` for `Sensing/Intuitive` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)MVC

In [21]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V1']]
X = training[['CGPA_Class',
              'Watch_Status_V1',
              'S/I_Score','Quiz','part_day_V1-1', 'part_day_V1-2', 'part_day_V1-3', 'part_day_V1-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_B = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_B

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.739837,0.746665,0.739837,0.729617,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,0.5,69,0.707317,0.713375,0.707317,0.697275,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,0.5,101,0.747967,0.761072,0.747967,0.747862,"{'criterion': 'entropy', 'max_features': 'sqrt..."
3,0.4,7,0.826531,0.836909,0.826531,0.816163,"{'criterion': 'gini', 'max_features': 'auto', ..."
4,0.4,69,0.77551,0.790276,0.77551,0.773691,"{'criterion': 'entropy', 'max_features': 'sqrt..."
5,0.4,101,0.765306,0.772799,0.765306,0.758862,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
6,0.3,7,0.797297,0.794337,0.797297,0.778171,"{'criterion': 'gini', 'max_features': 'auto', ..."
7,0.3,69,0.824324,0.81483,0.824324,0.816508,"{'criterion': 'gini', 'max_features': 'log2', ..."
8,0.3,101,0.783784,0.812125,0.783784,0.774692,"{'criterion': 'entropy', 'max_features': 'log2..."
9,0.2,7,0.714286,0.705642,0.714286,0.694606,"{'criterion': 'entropy', 'max_features': 'sqrt..."


In [22]:
df_B.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.816327,0.817659,0.816327,0.806726
0.3,0.801802,0.807097,0.801802,0.789791
0.4,0.789116,0.799995,0.789116,0.782905
0.5,0.731707,0.740371,0.731707,0.724918


### Overall_rating_V1 `(Observer)` for `Visual/Verbal` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [23]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V1']]
X = training[['CGPA_Class',
              'Watch_Status_V1',
              'Vi/Vb_Score','Quiz','part_day_V1-1', 'part_day_V1-2', 'part_day_V1-3', 'part_day_V1-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_C = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_C

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.715447,0.713873,0.715447,0.706913,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,0.5,69,0.634146,0.63988,0.634146,0.62448,"{'criterion': 'entropy', 'max_features': 'sqrt..."
2,0.5,101,0.715447,0.714644,0.715447,0.710682,"{'criterion': 'gini', 'max_features': 'log2', ..."
3,0.4,7,0.755102,0.748703,0.755102,0.749345,"{'criterion': 'entropy', 'max_features': 'sqrt..."
4,0.4,69,0.77551,0.801135,0.77551,0.771348,"{'criterion': 'gini', 'max_features': 'auto', ..."
5,0.4,101,0.744898,0.754066,0.744898,0.735218,"{'criterion': 'entropy', 'max_features': 'auto..."
6,0.3,7,0.716216,0.719598,0.716216,0.706384,"{'criterion': 'gini', 'max_features': 'log2', ..."
7,0.3,69,0.837838,0.846701,0.837838,0.836273,"{'criterion': 'entropy', 'max_features': 'auto..."
8,0.3,101,0.797297,0.823575,0.797297,0.790409,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
9,0.2,7,0.714286,0.727691,0.714286,0.708225,"{'criterion': 'entropy', 'max_features': 'auto..."


In [24]:
df_C.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.823129,0.840825,0.823129,0.818093
0.3,0.783784,0.796625,0.783784,0.777688
0.4,0.758503,0.767968,0.758503,0.75197
0.5,0.688347,0.689466,0.688347,0.680691


### Overall_rating_V1 `(Observer)` for `Sequential/Global` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [25]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V1']]
X = training[['CGPA_Class',
              'Watch_Status_V1',
              'S/G_Score','Quiz','part_day_V1-1', 'part_day_V1-2', 'part_day_V1-3', 'part_day_V1-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_D = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_D

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.723577,0.714548,0.723577,0.716909,"{'criterion': 'entropy', 'max_features': 'sqrt..."
1,0.5,69,0.650407,0.659787,0.650407,0.632575,"{'criterion': 'entropy', 'max_features': 'auto..."
2,0.5,101,0.747967,0.747768,0.747967,0.738323,"{'criterion': 'entropy', 'max_features': 'sqrt..."
3,0.4,7,0.734694,0.72382,0.734694,0.724556,"{'criterion': 'entropy', 'max_features': 'auto..."
4,0.4,69,0.77551,0.776276,0.77551,0.762835,"{'criterion': 'gini', 'max_features': 'auto', ..."
5,0.4,101,0.704082,0.731732,0.704082,0.696603,"{'criterion': 'entropy', 'max_features': 'auto..."
6,0.3,7,0.689189,0.66963,0.689189,0.67589,"{'criterion': 'gini', 'max_features': 'log2', ..."
7,0.3,69,0.797297,0.783835,0.797297,0.786873,"{'criterion': 'gini', 'max_features': 'auto', ..."
8,0.3,101,0.77027,0.782326,0.77027,0.760001,"{'criterion': 'entropy', 'max_features': 'auto..."
9,0.2,7,0.673469,0.643878,0.673469,0.654971,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [26]:
df_D.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.761905,0.753671,0.761905,0.750269
0.3,0.752252,0.745264,0.752252,0.740922
0.4,0.738095,0.743942,0.738095,0.727998
0.5,0.707317,0.707367,0.707317,0.695936


### Overall_rating_V2 `(Strategy)` for `Active/Reflective` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [27]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V2']]
X = training[['CGPA_Class',
              'Watch_Status_V2',
              'A/R_Score','Quiz','part_day_V2-1', 'part_day_V2-2', 'part_day_V2-3', 'part_day_V2-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_E = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_E

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.752,0.748536,0.752,0.749475,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,0.5,69,0.664,0.68033,0.664,0.657534,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,0.5,101,0.784,0.789513,0.784,0.770503,"{'criterion': 'gini', 'max_features': 'log2', ..."
3,0.4,7,0.75,0.732854,0.75,0.737329,"{'criterion': 'gini', 'max_features': 'log2', ..."
4,0.4,69,0.76,0.77457,0.76,0.75387,"{'criterion': 'gini', 'max_features': 'auto', ..."
5,0.4,101,0.79,0.799238,0.79,0.780259,"{'criterion': 'gini', 'max_features': 'auto', ..."
6,0.3,7,0.693333,0.675979,0.693333,0.674871,"{'criterion': 'entropy', 'max_features': 'sqrt..."
7,0.3,69,0.813333,0.839908,0.813333,0.795923,"{'criterion': 'gini', 'max_features': 'auto', ..."
8,0.3,101,0.88,0.896854,0.88,0.876839,"{'criterion': 'entropy', 'max_features': 'auto..."
9,0.2,7,0.72,0.764127,0.72,0.709117,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [28]:
df_E.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.82,0.837051,0.82,0.815115
0.3,0.795556,0.804247,0.795556,0.782545
0.4,0.766667,0.768887,0.766667,0.757153
0.5,0.733333,0.73946,0.733333,0.725838


### Overall_rating_V2 `(Strategy)` for `Sensing/Intuitive` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [29]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V2']]
X = training[['CGPA_Class',
              'Watch_Status_V2',
              'S/I_Score','Quiz','part_day_V2-1', 'part_day_V2-2', 'part_day_V2-3', 'part_day_V2-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_F = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_F

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.784,0.778092,0.784,0.780518,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,0.5,69,0.664,0.667736,0.664,0.649261,"{'criterion': 'gini', 'max_features': 'log2', ..."
2,0.5,101,0.768,0.761579,0.768,0.754093,"{'criterion': 'entropy', 'max_features': 'log2..."
3,0.4,7,0.8,0.789523,0.8,0.79204,"{'criterion': 'entropy', 'max_features': 'log2..."
4,0.4,69,0.74,0.732074,0.74,0.725524,"{'criterion': 'entropy', 'max_features': 'auto..."
5,0.4,101,0.76,0.767363,0.76,0.749572,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
6,0.3,7,0.786667,0.786268,0.786667,0.765216,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
7,0.3,69,0.773333,0.761728,0.773333,0.764789,"{'criterion': 'gini', 'max_features': 'log2', ..."
8,0.3,101,0.84,0.870932,0.84,0.835245,"{'criterion': 'gini', 'max_features': 'auto', ..."
9,0.2,7,0.76,0.761571,0.76,0.738632,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [30]:
df_F.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.826667,0.839122,0.826667,0.816836
0.3,0.8,0.80631,0.8,0.788417
0.4,0.766667,0.762986,0.766667,0.755712
0.5,0.738667,0.735802,0.738667,0.727958


### Overall_rating_V2 `(Strategy)` for `Visual/Verbal` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [31]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V2']]
X = training[['CGPA_Class',
              'Watch_Status_V2',
              'Vi/Vb_Score','Quiz','part_day_V2-1', 'part_day_V2-2', 'part_day_V2-3', 'part_day_V2-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_G = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_G

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.784,0.768927,0.784,0.772669,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,0.5,69,0.616,0.637459,0.616,0.599152,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,0.5,101,0.816,0.827914,0.816,0.80925,"{'criterion': 'gini', 'max_features': 'log2', ..."
3,0.4,7,0.74,0.71723,0.74,0.720008,"{'criterion': 'gini', 'max_features': 'auto', ..."
4,0.4,69,0.75,0.756959,0.75,0.742175,"{'criterion': 'gini', 'max_features': 'log2', ..."
5,0.4,101,0.81,0.833317,0.81,0.798277,"{'criterion': 'entropy', 'max_features': 'auto..."
6,0.3,7,0.746667,0.735988,0.746667,0.723717,"{'criterion': 'entropy', 'max_features': 'sqrt..."
7,0.3,69,0.786667,0.790374,0.786667,0.782828,"{'criterion': 'gini', 'max_features': 'log2', ..."
8,0.3,101,0.813333,0.809352,0.813333,0.800187,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
9,0.2,7,0.74,0.733212,0.74,0.70807,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [32]:
df_G.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.826667,0.845751,0.826667,0.820744
0.3,0.782222,0.778572,0.782222,0.768911
0.4,0.766667,0.769169,0.766667,0.753487
0.5,0.738667,0.744767,0.738667,0.727024


### Overall_rating_V2 `(Strategy)` for `Sequential/Global` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [33]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V2']]
X = training[['CGPA_Class',
              'Watch_Status_V2',
              'S/G_Score','Quiz','part_day_V2-1', 'part_day_V2-2', 'part_day_V2-3', 'part_day_V2-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_H = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_H

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.752,0.757163,0.752,0.752517,"{'criterion': 'entropy', 'max_features': 'auto..."
1,0.5,69,0.704,0.711148,0.704,0.676175,"{'criterion': 'entropy', 'max_features': 'sqrt..."
2,0.5,101,0.768,0.771701,0.768,0.751173,"{'criterion': 'entropy', 'max_features': 'auto..."
3,0.4,7,0.73,0.724815,0.73,0.726045,"{'criterion': 'entropy', 'max_features': 'sqrt..."
4,0.4,69,0.79,0.788475,0.79,0.778339,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
5,0.4,101,0.73,0.716707,0.73,0.71161,"{'criterion': 'gini', 'max_features': 'auto', ..."
6,0.3,7,0.733333,0.713479,0.733333,0.713359,"{'criterion': 'gini', 'max_features': 'log2', ..."
7,0.3,69,0.826667,0.825368,0.826667,0.821142,"{'criterion': 'entropy', 'max_features': 'sqrt..."
8,0.3,101,0.773333,0.760279,0.773333,0.759103,"{'criterion': 'gini', 'max_features': 'log2', ..."
9,0.2,7,0.72,0.712393,0.72,0.707185,"{'criterion': 'gini', 'max_features': 'log2', ..."


In [34]:
df_H.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.806667,0.809858,0.806667,0.796633
0.3,0.777778,0.766375,0.777778,0.764535
0.4,0.75,0.743332,0.75,0.738665
0.5,0.741333,0.746671,0.741333,0.726622


### Overall_rating_V3 `(MVC)` for `Active/Reflective` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [35]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V3']]
X = training[['CGPA_Class',
              'Watch_Status_V3',
              'A/R_Score','Quiz','part_day_V3-1', 'part_day_V3-2', 'part_day_V3-3', 'part_day_V3-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_I = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_I

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.72,0.721876,0.72,0.718409,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,0.5,69,0.76,0.763064,0.76,0.753199,"{'criterion': 'entropy', 'max_features': 'auto..."
2,0.5,101,0.773333,0.775248,0.773333,0.772926,"{'criterion': 'gini', 'max_features': 'auto', ..."
3,0.4,7,0.766667,0.765979,0.766667,0.764267,"{'criterion': 'entropy', 'max_features': 'auto..."
4,0.4,69,0.758333,0.750497,0.758333,0.751562,"{'criterion': 'gini', 'max_features': 'auto', ..."
5,0.4,101,0.841667,0.835552,0.841667,0.833545,"{'criterion': 'gini', 'max_features': 'auto', ..."
6,0.3,7,0.788889,0.791013,0.788889,0.783397,"{'criterion': 'entropy', 'max_features': 'auto..."
7,0.3,69,0.833333,0.832655,0.833333,0.828864,"{'criterion': 'gini', 'max_features': 'auto', ..."
8,0.3,101,0.855556,0.856058,0.855556,0.855556,"{'criterion': 'entropy', 'max_features': 'sqrt..."
9,0.2,7,0.916667,0.913757,0.916667,0.914603,"{'criterion': 'entropy', 'max_features': 'auto..."


In [36]:
df_I.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.877778,0.886011,0.877778,0.874621
0.3,0.825926,0.826575,0.825926,0.822605
0.4,0.788889,0.784009,0.788889,0.783125
0.5,0.751111,0.753396,0.751111,0.748178


### Overall_rating_V3 `(MVC)` for `Sensing/Intuitive` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [37]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V3']]
X = training[['CGPA_Class',
              'Watch_Status_V3',
              'S/I_Score','Quiz','part_day_V3-1', 'part_day_V3-2', 'part_day_V3-3', 'part_day_V3-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_J = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_J

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.733333,0.749248,0.733333,0.73081,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,0.5,69,0.793333,0.783122,0.793333,0.779174,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,0.5,101,0.82,0.807049,0.82,0.804649,"{'criterion': 'gini', 'max_features': 'auto', ..."
3,0.4,7,0.866667,0.861023,0.866667,0.862864,"{'criterion': 'entropy', 'max_features': 'log2..."
4,0.4,69,0.85,0.888598,0.85,0.825188,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
5,0.4,101,0.816667,0.797345,0.816667,0.801097,"{'criterion': 'gini', 'max_features': 'auto', ..."
6,0.3,7,0.877778,0.875437,0.877778,0.876379,"{'criterion': 'gini', 'max_features': 'auto', ..."
7,0.3,69,0.9,0.908266,0.9,0.893184,"{'criterion': 'gini', 'max_features': 'auto', ..."
8,0.3,101,0.822222,0.799724,0.822222,0.805276,"{'criterion': 'entropy', 'max_features': 'auto..."
9,0.2,7,0.9,0.911111,0.9,0.903472,"{'criterion': 'entropy', 'max_features': 'auto..."


In [38]:
df_J.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.866667,0.873057,0.866667,0.861814
0.3,0.866667,0.861142,0.866667,0.85828
0.4,0.844444,0.848989,0.844444,0.829716
0.5,0.782222,0.779807,0.782222,0.771544


### Overall_rating_V3 `(MVC)` for `Visual/Verbal` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [39]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V3']]
X = training[['CGPA_Class',
              'Watch_Status_V3',
              'Vi/Vb_Score','Quiz','part_day_V3-1', 'part_day_V3-2', 'part_day_V3-3', 'part_day_V3-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_K = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_K

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.806667,0.804238,0.806667,0.803357,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,0.5,69,0.8,0.794723,0.8,0.781849,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,0.5,101,0.82,0.829979,0.82,0.81966,"{'criterion': 'entropy', 'max_features': 'auto..."
3,0.4,7,0.875,0.870546,0.875,0.869743,"{'criterion': 'entropy', 'max_features': 'auto..."
4,0.4,69,0.783333,0.768247,0.783333,0.772668,"{'criterion': 'entropy', 'max_features': 'sqrt..."
5,0.4,101,0.833333,0.83356,0.833333,0.832182,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
6,0.3,7,0.877778,0.876045,0.877778,0.876067,"{'criterion': 'entropy', 'max_features': 'sqrt..."
7,0.3,69,0.844444,0.833788,0.844444,0.833069,"{'criterion': 'entropy', 'max_features': 'sqrt..."
8,0.3,101,0.866667,0.858598,0.866667,0.859691,"{'criterion': 'gini', 'max_features': 'auto', ..."
9,0.2,7,0.866667,0.86735,0.866667,0.864931,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [40]:
df_K.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.861111,0.868855,0.861111,0.851208
0.3,0.862963,0.856143,0.862963,0.856276
0.4,0.830556,0.824118,0.830556,0.824864
0.5,0.808889,0.809646,0.808889,0.801622


### Overall_rating_V3 `(MVC)` for `Sequential/Global` learners (60 Train - 40 Test) | (70 Train - 30 Test) | (80 Train - 20 Test)

In [41]:
temp = []
rand_states = [7,69,101]
y = training[['Overall_Rating_V3']]
X = training[['CGPA_Class',
              'Watch_Status_V3',
              'S/G_Score','Quiz','part_day_V3-1', 'part_day_V3-2', 'part_day_V3-3', 'part_day_V3-4']]

oversample = RandomOverSampler(sampling_strategy='all')
X, y = oversample.fit_resample(X, y)


for i in [0.5,0.4,0.3,0.2]:
    for j in rand_states:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=j)
        param_grid = {'n_estimators': [200, 400],
                      'max_features': ['auto', 'sqrt', 'log2'],
                      'criterion' :['gini', 'entropy']}

        model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
        model.fit(X_train, y_train.values.ravel())
        pred = model.predict(X_test)
        
        acc = accuracy_score(y_test,pred)
        pre = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted')
        f1 = f1_score(y_test, pred, average='weighted')
        param = model.best_params_
        
        temp.append([i,j,acc,pre,recall,f1,param])
        
df_L = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp)
df_L

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,0.5,7,0.813333,0.822021,0.813333,0.799804,"{'criterion': 'entropy', 'max_features': 'sqrt..."
1,0.5,69,0.8,0.802807,0.8,0.774007,"{'criterion': 'gini', 'max_features': 'log2', ..."
2,0.5,101,0.786667,0.794562,0.786667,0.788332,"{'criterion': 'gini', 'max_features': 'auto', ..."
3,0.4,7,0.808333,0.801303,0.808333,0.79757,"{'criterion': 'gini', 'max_features': 'auto', ..."
4,0.4,69,0.791667,0.781775,0.791667,0.763836,"{'criterion': 'gini', 'max_features': 'log2', ..."
5,0.4,101,0.816667,0.821643,0.816667,0.811816,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
6,0.3,7,0.788889,0.796189,0.788889,0.785846,"{'criterion': 'gini', 'max_features': 'auto', ..."
7,0.3,69,0.855556,0.857402,0.855556,0.849241,"{'criterion': 'entropy', 'max_features': 'auto..."
8,0.3,101,0.822222,0.830061,0.822222,0.812515,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
9,0.2,7,0.883333,0.884097,0.883333,0.878395,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [42]:
df_L.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.844444,0.855638,0.844444,0.838347
0.3,0.822222,0.827884,0.822222,0.815868
0.4,0.805556,0.801574,0.805556,0.791074
0.5,0.8,0.806463,0.8,0.787381
