In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter(action='ignore')

# Let pandas render up to 500 columns and 500 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

In [2]:
# custom files
import model_best_hyperparameters

In [3]:
# Load the new records that the saved model will score
new_data_path = "../data/new_data.csv"
ds = pd.read_csv(new_data_path)
print('new data size', ds.shape)

new data size (3065, 14)


In [4]:
# Peek at the first five rows before any imputation is applied
ds.head(n=5)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,female,group B,some high school,standard,completed,regularly,no,3.0,< 5,70,87,87,81.333333,1
1,male,group E,,standard,completed,sometimes,yes,2.0,5 - 10,90,83,84,85.666667,1
2,male,group C,associate's degree,standard,,sometimes,yes,4.0,< 5,81,89,81,83.666667,1
3,female,group E,some high school,free/reduced,none,regularly,yes,0.0,5 - 10,55,57,55,55.666667,1
4,male,group B,high school,free/reduced,completed,never,no,2.0,5 - 10,49,50,53,50.666667,1


In [5]:
# Score columns: missing entries are filled with the column mean.
mean_impute_columns = [
    'MathScore', 'ReadingScore', 'WritingScore', 'AverageScore',
]

# Remaining columns (including the 'Passed' target): missing entries are
# filled with the most frequent value of the column.
mode_impute_columns = [
    'Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep',
    'PracticeSport', 'IsFirstChild', 'NrSiblings', 'WklyStudyHours',
    'Passed',
]

In [6]:
def impute_na(df, variable, value):
    """Return column ``variable`` of ``df`` with missing entries replaced by ``value``.

    Parameters
    ----------
    df : DataFrame holding the column.
    variable : label of the column to fill.
    value : replacement handed to ``fillna``.

    Returns
    -------
    The filled column. ``df`` is not modified; the caller decides where to
    store the result.
    """
    column = df[variable]
    filled = column.fillna(value)
    return filled

In [7]:
# Record the mean of every score column, then use it to fill that column's gaps.
# NOTE(review): the means are computed from this new data, not taken from the
# training run -- confirm that is the intended behaviour at prediction time.
mean_impute_values = {column: ds[column].mean() for column in mean_impute_columns}
for column, fill_value in mean_impute_values.items():
    ds[column] = impute_na(ds, column, fill_value)

In [8]:
# Record the most frequent value of every listed column and fill its gaps with it.
# mode() returns the candidates in sorted order, so [0] takes the first one on ties.
mode_impute_values = {}
for column in mode_impute_columns:
    most_frequent = ds[column].mode()[0]
    mode_impute_values[column] = most_frequent
    ds[column] = ds[column].fillna(most_frequent)

In [9]:
# Sanity check: after imputation no cell should still be missing.
# The label says "new data" because this frame was read from new_data.csv,
# not from a training split.
print("Any missing sample in new data:", ds.isnull().values.any())

Any missing sample in training set: False


In [10]:
# Same five-row preview, now after the mean/mode fill
ds.head(n=5)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,female,group B,some high school,standard,completed,regularly,no,3.0,< 5,70,87,87,81.333333,1
1,male,group E,some college,standard,completed,sometimes,yes,2.0,5 - 10,90,83,84,85.666667,1
2,male,group C,associate's degree,standard,none,sometimes,yes,4.0,< 5,81,89,81,83.666667,1
3,female,group E,some high school,free/reduced,none,regularly,yes,0.0,5 - 10,55,57,55,55.666667,1
4,male,group B,high school,free/reduced,completed,never,no,2.0,5 - 10,49,50,53,50.666667,1


# feature engineering

In [11]:
# Columns that are converted to integer codes by the label-encoding cell.
# The list also holds the 'Passed' target and the 'NrSiblings' count.
cat_columns = [
    'Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep',
    'PracticeSport', 'IsFirstChild', 'NrSiblings', 'WklyStudyHours',
    'Passed',
]

In [12]:
# Categorical encoding
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. Refitting a single shared encoder on
# every column throws away each earlier mapping, so only the last column
# could ever be decoded again.
label_encoders = {}
for column in cat_columns:
    label_encoders[column] = LabelEncoder()
    ds[column] = label_encoders[column].fit_transform(ds[column])

# Later cells decode the target through `le`. Bind it to the 'Passed' encoder
# explicitly instead of relying on 'Passed' being the last entry of cat_columns.
le = label_encoders['Passed']

# NOTE(review): the encoders are fitted on this new data. The integer codes
# match the ones the model was trained on only if both datasets contain the
# same category values -- TODO confirm, or persist and reload the training encoders.

In [13]:
ds.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed
0,0,1,5,1,0,1,0,3,1,70,87,87,81.333333,1
1,1,4,4,1,0,2,1,2,0,90,83,84,85.666667,1
2,1,2,0,1,1,2,1,4,1,81,89,81,83.666667,1
3,0,4,5,0,1,1,1,0,0,55,57,55,55.666667,1
4,1,1,2,0,0,0,0,2,0,49,50,53,50.666667,1


In [14]:
# Separate the target column from the model inputs
target_column = 'Passed'
y = ds[target_column]
X = ds.drop(columns=[target_column])

In [15]:
# Load the persisted model from disk.
# The context manager closes the file handle, which the bare open() call left open.
# NOTE: unpickling can execute arbitrary code -- only load a file from a trusted source.
with open('finalized_model.sav', 'rb') as model_file:
    knn = pickle.load(model_file)

In [16]:
# Score every row; the model is given the bare array rather than the DataFrame
feature_matrix = X.values
y_pred = knn.predict(feature_matrix)

In [17]:
# Predict, map the encoded target values back through `le`, and export the frame.
# NOTE(review): this repeats the predict call of the previous cell (y_pred).
predictions = knn.predict(X.values)

# Decode the predicted and the actual 'Passed' codes, in that order.
original_labels, original_charges_cat = (
    le.inverse_transform(encoded) for encoded in (predictions, ds['Passed'])
)

ds['Passed_pred'] = original_labels
ds['Passed'] = original_charges_cat

# index=False keeps the row index out of the CSV.
ds.to_csv('prediction_results.csv', index=False)

In [18]:
# Read the exported file back to inspect what was written
pr = pd.read_csv(filepath_or_buffer="prediction_results.csv")

In [21]:
# Spot-check up to 20 random rows of the exported predictions.
# Capping n avoids the ValueError that sample(20) raises on a file with fewer
# than 20 rows; the fixed seed makes the displayed rows reproducible on re-run.
pr.sample(n=min(20, len(pr)), random_state=42)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,PracticeSport,IsFirstChild,NrSiblings,WklyStudyHours,MathScore,ReadingScore,WritingScore,AverageScore,Passed,Passed_pred
370,1,0,2,1,1,2,1,2,0,59,52,46,52.333333,1,1
109,0,2,4,1,1,1,1,2,0,87,90,90,89.0,1,1
2387,1,3,0,0,0,2,1,0,0,83,85,84,84.0,1,1
130,0,1,0,1,1,0,1,0,2,53,58,51,54.0,1,1
163,0,2,4,0,1,1,1,1,0,55,68,60,61.0,1,1
2517,1,1,1,1,1,2,1,2,2,93,89,84,88.666667,1,1
1700,1,2,0,0,1,1,1,1,0,43,45,43,43.666667,0,0
2971,0,2,2,1,0,2,1,4,0,47,61,73,60.333333,1,1
2376,1,2,1,1,1,1,1,2,0,48,52,51,50.333333,1,1
1065,1,2,2,0,1,1,0,3,0,38,39,41,39.333333,0,0
