In [6]:
'''important libs'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


'''metrics'''
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

'''preprocessing'''
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV


'''model'''
from sklearn.ensemble import RandomForestClassifier

In [7]:
dataframe = pd.read_csv('data.csv')

In [8]:
dataframe.head(3)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0


In [10]:
data = dataframe.drop(['StudentID','GPA'], axis=1)
data.head(3)

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GradeClass
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.0
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,1.0
2,15,0,2,3,4.21057,26,0,2,0,0,0,0,4.0


In [14]:
std_scaler = StandardScaler()

In [12]:
'''seperating features and targrt'''
features = data.drop('GradeClass', axis=1)
targets = data['GradeClass']

In [20]:
'''scaling features'''
scaled_features = std_scaler.fit_transform(features)
scaled_features

array([[ 0.47291901,  0.97849211, -0.85339088, ..., -0.66013204,
         2.01954358, -0.43186565],
       [ 1.36294441, -1.02198065, -0.85339088, ..., -0.66013204,
        -0.49516139, -0.43186565],
       [-1.30713178, -1.02198065,  1.09164102, ..., -0.66013204,
        -0.49516139, -0.43186565],
       ...,
       [-0.41710638,  0.97849211, -0.85339088, ..., -0.66013204,
        -0.49516139,  2.31553495],
       [-0.41710638,  0.97849211,  0.11912507, ...,  1.51484845,
         2.01954358, -0.43186565],
       [-0.41710638,  0.97849211, -0.85339088, ..., -0.66013204,
        -0.49516139,  2.31553495]])

In [22]:
'''turning the data back into a dataframe'''
features_X = pd.DataFrame(scaled_features, columns=features.columns)
features_X.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,0.472919,0.978492,-0.853391,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866
1,1.362944,-1.021981,-0.853391,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866
2,-1.307132,-1.021981,1.091641,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866
3,0.472919,0.978492,-0.853391,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866
4,0.472919,0.978492,-0.853391,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866


In [23]:
'''splitng the data betweeen train and test data'''
X_train, X_test, y_train, y_test = train_test_split(features_X, targets, test_size= 0.33, random_state= 24, shuffle=None)

In [25]:
X_train.shape

(1602, 12)

In [51]:
'''hyperparamter tunning step'''
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2,5,10],
    'max_features': ['sqrt', 'log2']
}

In [48]:
'''creating an object of the model'''
rf = RandomForestClassifier(random_state=24)

In [54]:
'''initialize GridSearchCV'''
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,n_jobs=-1)

In [55]:
'''fit the model to the data'''
grid_search.fit(X_train, y_train)


In [56]:
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

In [57]:
best_params

{'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_split': 2,
 'n_estimators': 150}

In [58]:
best_rf

In [62]:
'''make_prediction'''
y_pred = best_rf.predict(X_test)

In [66]:
'''Evaluation'''
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.69
