In [2]:
'''important libs'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


'''metrics'''
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

'''preprocessing'''
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV


'''model'''
from sklearn.ensemble import RandomForestClassifier

In [3]:
dataframe = pd.read_csv('data.csv')

In [4]:
dataframe.head(3)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0


In [5]:
data = dataframe.drop(['StudentID','GPA'], axis=1)
data.head(3)

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GradeClass
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.0
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,1.0
2,15,0,2,3,4.21057,26,0,2,0,0,0,0,4.0


In [12]:
'''seperating features and targrt'''
features = data.drop('GradeClass', axis=1)
targets = data['GradeClass']

In [16]:
'''over sample the data'''
over_sample = RandomOverSampler()
X,y = over_sample.fit_resample(features, targets)

In [20]:
'''scaling features'''
std_scaler = StandardScaler()
scaled_X = std_scaler.fit_transform(X)
scaled_X

array([[ 0.47299114,  0.99851473, -0.87801753, ..., -0.673374  ,
         1.96755665, -0.42300475],
       [ 1.36742004, -1.00148748, -0.87801753, ..., -0.673374  ,
        -0.50824458, -0.42300475],
       [-1.31586666, -1.00148748,  1.04995932, ..., -0.673374  ,
        -0.50824458, -0.42300475],
       ...,
       [-1.31586666,  0.99851473,  0.08597089, ...,  1.48505881,
         1.96755665, -0.42300475],
       [-1.31586666, -1.00148748, -0.87801753, ..., -0.673374  ,
        -0.50824458, -0.42300475],
       [ 0.47299114,  0.99851473,  0.08597089, ..., -0.673374  ,
        -0.50824458, -0.42300475]])

In [21]:
'''turning the data back into a dataframe'''
features_X = pd.DataFrame(scaled_X, columns=features.columns)
features_X.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,0.472991,0.998515,-0.878018,0.297762,1.632144,-0.39722,1.359931,-0.241838,-0.841737,-0.673374,1.967557,-0.423005
1,1.36742,-1.001487,-0.878018,-0.734855,0.858205,-1.253109,-0.735332,-1.136902,-0.841737,-0.673374,-0.508245,-0.423005
2,-1.315867,-1.001487,1.049959,1.330379,-1.100387,1.925906,-0.735332,-0.241838,-0.841737,-0.673374,-0.508245,-0.423005
3,0.472991,0.998515,-0.878018,1.330379,-0.082758,0.458668,-0.735332,0.653227,1.188019,-0.673374,-0.508245,-0.423005
4,0.472991,0.998515,-0.878018,0.297762,-1.019595,0.825478,1.359931,0.653227,-0.841737,-0.673374,-0.508245,-0.423005


In [22]:
'''splitng the data betweeen train and test data'''
X_train, X_test, y_train, y_test = train_test_split(features_X, y, test_size= 0.33, random_state= 24, shuffle=None)

In [23]:
X_train.shape

(4056, 12)

In [24]:
'''hyperparamter tunning step'''
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2,5,10],
    'max_features': ['sqrt', 'log2']
}

In [25]:
'''creating an object of the model'''
rf = RandomForestClassifier(random_state=24)

In [26]:
'''initialize GridSearchCV'''
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,n_jobs=-1)

In [27]:
'''fit the model to the data'''
grid_search.fit(X_train, y_train)


In [28]:
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

In [29]:
best_params

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_split': 2,
 'n_estimators': 100}

In [30]:
best_rf

In [31]:
'''make_prediction'''
y_pred = best_rf.predict(X_test)

In [32]:
'''Evaluation'''
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.95
