**Reading the Data set**

In [5]:
import pandas as pd

# reading the data
data = pd.read_csv('../input/students_performance_data_prepared.csv')

# getting the shape of the data
print(data.shape)

(1000, 12)


In [6]:
# looking at the head of the data

data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,pass_math,pass_reading,pass_writing,status
0,0,1,1,1,1,72,72,74,1,1,1,1
1,0,2,4,1,0,69,90,88,1,1,1,1
2,0,1,3,1,1,90,95,93,1,1,1,1
3,1,0,0,0,1,47,57,44,1,1,1,1
4,1,2,4,1,1,76,78,75,1,1,1,1


In [7]:
# describing the dataset

data.describe()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,pass_math,pass_reading,pass_writing,status
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.482,2.174,2.486,0.645,0.642,66.089,69.169,68.054,0.96,0.974,0.968,0.949
std,0.499926,1.157179,1.829522,0.478753,0.479652,15.16308,14.600192,15.195657,0.196057,0.159215,0.176088,0.220108
min,0.0,0.0,0.0,0.0,0.0,0.0,17.0,10.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,0.0,57.0,59.0,57.75,1.0,1.0,1.0,1.0
50%,0.0,2.0,2.0,1.0,1.0,66.0,70.0,69.0,1.0,1.0,1.0,1.0
75%,1.0,3.0,4.0,1.0,1.0,77.0,79.0,79.0,1.0,1.0,1.0,1.0
max,1.0,4.0,5.0,1.0,1.0,100.0,100.0,100.0,1.0,1.0,1.0,1.0


In [4]:
# splitting the dependent and independent variables

In [8]:
x = data.iloc[:,:5]
y = data.iloc[:,11]

print(x.shape)
print(y.shape)

(1000, 5)
(1000,)


In [7]:
# splitting the dataset into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 45)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(750, 5)
(750,)
(250, 5)
(250,)


In [9]:
# create a scaler with sklearn minmaxscaler

In [10]:
# importing the MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# creating a scaler
mm = MinMaxScaler()

In [8]:
# applying the scaler on the whole feature dataset (independent variables)

In [11]:
x_train = mm.fit_transform(x_train)
x_test = mm.transform(x_test)

## Modelling

## Logistic Regression

In [13]:
# create a logistic regression model

In [12]:
from sklearn.linear_model import  LogisticRegression

# creating a model
model_lr = LogisticRegression()

In [15]:
# train it on the train dataset

In [13]:
model_lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# use it to predict results on the test dataset

In [14]:
y_pred = model_lr.predict(x_test)

In [13]:
# compute accuracy on train and test datasets

In [15]:
print("Training Accuracy :", model_lr.score(x_train, y_train))
print("Testing Accuracy :", model_lr.score(x_test, y_test))

Training Accuracy : 0.9533333333333334
Testing Accuracy : 0.936


In [21]:
# check feature importance, which is the most important feature for this model?

In [16]:
feature_importances = dict(zip(x.columns, model_lr.coef_[0]))

sorted(feature_importances.items(), key=lambda item: item[1])

[('test preparation course', -1.1605748894554067),
 ('parental level of education', -0.5918145793622861),
 ('gender', 0.4404073993484032),
 ('race/ethnicity', 0.9417161778247116),
 ('lunch', 1.9411212203777408)]

In [23]:
# get the confusion matrix

In [17]:
from sklearn.metrics import confusion_matrix

# creating a confusion matrix
confusion_matrix(y_test, y_pred)

array([[  0,  16],
       [  0, 234]])

**Random Forest**

In [1]:
# create a random forest model

In [18]:
from sklearn.ensemble import RandomForestClassifier

# creating a model
model_rf = RandomForestClassifier()

In [3]:
# train it on the train dataset

In [19]:
model_rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [5]:
# use it to predict results on the test dataset

In [20]:
y_pred = model_rf.predict(x_test)

In [7]:
# compute accuracy on train and test datasets

In [21]:
print("Training Accuracy :", model_rf.score(x_train, y_train))
print("Testing Accuracy :", model_rf.score(x_test, y_test))

Training Accuracy : 0.9573333333333334
Testing Accuracy : 0.912


In [9]:
# check feature importance, which is the most important feature for this model?

In [22]:
feature_importances = dict(zip(x.columns, model_rf.feature_importances_))

sorted(feature_importances.items(), key=lambda item: item[1])

[('test preparation course', 0.07216202307987),
 ('gender', 0.09314937787149997),
 ('lunch', 0.15274722107561112),
 ('race/ethnicity', 0.31314128039008654),
 ('parental level of education', 0.3688000975829325)]

In [12]:
# get the confusion matrix

In [23]:
confusion_matrix(y_test, y_pred)

array([[  0,  16],
       [  6, 228]])

**Save model as a file**

In [24]:
# save logistic regression model
import pickle

pickle.dump(model_lr, open('../output/model_lr.pkl', 'wb'))

In [1]:
! jupyter nbconvert --to html students_performance_build_validate_model_solution.ipynb

[NbConvertApp] Converting notebook students_performance_build_validate_model_solution.ipynb to html
[NbConvertApp] Writing 303453 bytes to students_performance_build_validate_model_solution.html
