In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# custom files
import model_best_hyperparameters

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
train_csv_path = "../data/train_data.csv"
ds = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows of the freshly loaded training data.
ds.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,male,group B,,free/reduced,completed,sometimes,no,1.0,5 - 10,58,68,63,63.0,1
1,female,group C,associate's degree,standard,none,sometimes,yes,1.0,> 10,57,50,63,56.666667,1
2,female,group D,some college,standard,,regularly,no,1.0,< 5,67,74,79,73.333333,1
3,male,group D,some high school,free/reduced,none,sometimes,no,2.0,> 10,48,49,45,47.333333,0
4,female,group A,associate's degree,free/reduced,completed,sometimes,no,1.0,< 5,71,83,83,79.0,1


In [5]:
# Columns whose missing values are filled with the column mean.
mean_impute_columns = [
    "MathScore",
    "ReadingScore",
    "WritingScore",
    "AverageScore",
]

# Columns whose missing values are filled with the most frequent value (mode).
# NOTE(review): "Passed" is later used as the prediction target; confirm that
# imputing missing targets (rather than dropping those rows) is intended.
mode_impute_columns = [
    "Gender",
    "EthnicGroup",
    "ParentEduc",
    "LunchType",
    "TestPrep",
    "PracticeSport",
    "IsFirstChild",
    "NrSiblings",
    "WklyStudyHours",
    "Passed",
]

In [6]:
# feature engineering

# Missing data imputation

def impute_na(df, variable, value):
    """Return column ``variable`` of ``df`` with its missing entries replaced.

    Args:
        df: DataFrame that holds the column.
        variable: Label of the column to fill.
        value: Replacement handed to ``fillna`` for every missing entry.

    Returns:
        The filled column. ``df`` itself is left untouched; the caller is
        expected to assign the result back if it wants to keep it.
    """
    column = df[variable]
    filled = column.fillna(value)
    return filled

# Missing data imputation

In [7]:
# Compute the mean of every mean-imputed column once and keep it in a dict,
# then fill the gaps of each column with its own mean.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split further down; confirm that is acceptable.
mean_impute_values = {name: ds[name].mean() for name in mean_impute_columns}
for column, fill_value in mean_impute_values.items():
    ds[column] = impute_na(ds, column, fill_value)

In [8]:
# Look up the most frequent value of every mode-imputed column (first entry of
# .mode()), keep it in a dict, then fill each column's gaps with that value.
mode_impute_values = {name: ds[name].mode()[0] for name in mode_impute_columns}
for column, fill_value in mode_impute_values.items():
    ds[column] = impute_na(ds, column, fill_value)

In [9]:
# Sanity check after imputation: True if at least one cell of the frame is still missing.
has_missing = ds.isnull().values.any()
print("Any missing sample in training set:", has_missing)

Any missing sample in training set: False


In [10]:
# Preview the first rows again, now that missing values have been filled.
ds.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,male,group B,some college,free/reduced,completed,sometimes,no,1.0,5 - 10,58,68,63,63.0,1
1,female,group C,associate's degree,standard,none,sometimes,yes,1.0,> 10,57,50,63,56.666667,1
2,female,group D,some college,standard,none,regularly,no,1.0,< 5,67,74,79,73.333333,1
3,male,group D,some high school,free/reduced,none,sometimes,no,2.0,> 10,48,49,45,47.333333,0
4,female,group A,associate's degree,free/reduced,completed,sometimes,no,1.0,< 5,71,83,83,79.0,1


## Categorical encoding

In [11]:
# Columns that get label-encoded (each distinct value is mapped to an integer).
# NOTE(review): "NrSiblings" and "Passed" already look numeric in the previews;
# confirm that re-encoding them is intended.
cat_columns = [
    "Gender",
    "EthnicGroup",
    "ParentEduc",
    "LunchType",
    "TestPrep",
    "PracticeSport",
    "IsFirstChild",
    "NrSiblings",
    "WklyStudyHours",
    "Passed",
]

In [12]:
# Categorical encoding
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for every column and keep each fitted encoder.
# A single shared encoder is refit on every iteration and therefore only
# remembers the classes of the last column, so the value -> integer mapping of
# the other columns could neither be inspected nor re-applied to new data.
label_encoders = {}
for column in cat_columns:
    le = LabelEncoder()
    ds[column] = le.fit_transform(ds[column])
    label_encoders[column] = le
# After the loop `le` is still bound to the encoder of the last column.

In [13]:
# Preview the first rows after label encoding.
ds.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,1,1,4,0,0,2,0,1,0,58,68,63,63.0,1
1,0,2,0,1,1,2,1,1,2,57,50,63,56.666667,1
2,0,3,4,1,1,1,0,1,1,67,74,79,73.333333,1
3,1,3,5,0,1,2,0,2,2,48,49,45,47.333333,0
4,0,0,0,0,0,2,0,1,1,71,83,83,79.0,1


In [14]:
# Split the frame into the target ("Passed") and the features (every other column).
# NOTE(review): the previews show an "Unnamed: 0" column (presumably a row index
# written into the CSV) that stays in X and is fed to the model - confirm.
# NOTE(review): the score columns, including "AverageScore", also stay in X; if
# "Passed" is derived from them this leaks the target - confirm.
y = ds["Passed"]
X = ds.drop(columns=["Passed"])

In [15]:
# Hold out 10% of the rows for testing (90:10 train:test).
# random_state pins the shuffle so the split, and therefore the reported
# metrics, are the same on every run; stratify=y keeps the ratio of the two
# "Passed" classes equal in both parts (the classes are imbalanced).
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [16]:
# Build and train the Random Forest model; its hyperparameters come from
# model_best_hyperparameters.params.
rf = RandomForestClassifier(**model_best_hyperparameters.params)
rf.fit(X_train, y_train)

# Evaluate on the held-out test rows.
y_pred = rf.predict(X_test)

# Print the label on its own line: the report is a pre-formatted table, and a
# prefix on the same line pushes its header row out of alignment.
# NOTE(review): a perfect score on every metric usually points to target
# leakage through the features - confirm before relying on this model.
print('test set metrics:')
print(metrics.classification_report(y_test, y_pred))

test set metrics:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       314
           1       1.00      1.00      1.00      2444

    accuracy                           1.00      2758
   macro avg       1.00      1.00      1.00      2758
weighted avg       1.00      1.00      1.00      2758



In [17]:
# Persist the trained model to disk. The `with` block guarantees the file is
# flushed and closed even if pickling raises.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)