In [1]:
#importing all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
import joblib

In [4]:
df = pd.read_csv('data/train.csv')


In [5]:
df.head()

Unnamed: 0,S/N,Gender,Age,Location,famsize,Pstatus,Medu,Fedu,traveltime,studytime,...,paid,activities,nursery,higher,internet,famrel,freetime,health,absences,Score
0,1,F,13,U,GT3,A,4,4,2,2,...,no,no,yes,yes,no,4,3,3,4,22
1,2,F,12,U,GT3,T,1,1,1,2,...,no,no,no,yes,yes,5,3,3,2,31
2,3,F,10,U,LE3,T,1,1,1,2,...,no,no,yes,yes,yes,4,3,3,6,37
3,4,F,10,U,GT3,T,4,2,1,3,...,no,yes,yes,yes,yes,3,2,5,0,42
4,5,F,11,U,GT3,T,3,3,1,2,...,no,no,yes,yes,no,4,3,5,0,37


In [6]:
df['famsize'].unique()

array(['GT3', 'LE3'], dtype=object)

In [7]:
df.drop('S/N', axis=1, inplace=True)

In [8]:
df.columns

Index(['Gender', 'Age', 'Location', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'famrel', 'freetime',
       'health', 'absences', 'Score'],
      dtype='object')

In [9]:
df.isnull().sum()

Gender        0
Age           0
Location      0
famsize       0
Pstatus       0
Medu          0
Fedu          0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
famrel        0
freetime      0
health        0
absences      0
Score         0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Gender      325 non-null    object
 1   Age         325 non-null    int64 
 2   Location    325 non-null    object
 3   famsize     325 non-null    object
 4   Pstatus     325 non-null    object
 5   Medu        325 non-null    int64 
 6   Fedu        325 non-null    int64 
 7   traveltime  325 non-null    int64 
 8   studytime   325 non-null    int64 
 9   failures    325 non-null    int64 
 10  schoolsup   325 non-null    object
 11  famsup      325 non-null    object
 12  paid        325 non-null    object
 13  activities  325 non-null    object
 14  nursery     325 non-null    object
 15  higher      325 non-null    object
 16  internet    325 non-null    object
 17  famrel      325 non-null    int64 
 18  freetime    325 non-null    int64 
 19  health      325 non-null    int64 
 20  absences  

In [11]:
df_target = df['Score']
df_target

0      22
1      31
2      37
3      42
4      37
       ..
320    39
321    43
322    27
323    30
324    31
Name: Score, Length: 325, dtype: int64

In [12]:
# dropping taget variable
df.drop(['Score'],axis=1,inplace=True)

In [13]:
# droping corelated columns for model complexity
df.drop(["Fedu","Medu","Pstatus","schoolsup","famsup","famrel","absences"], axis=1, inplace=True)

In [14]:
print(len(df.columns))
df.columns

14


Index(['Gender', 'Age', 'Location', 'famsize', 'traveltime', 'studytime',
       'failures', 'paid', 'activities', 'nursery', 'higher', 'internet',
       'freetime', 'health'],
      dtype='object')

In [15]:
# dividing dataset into numerical and categorical dataset
num_df = df.loc[:,df.dtypes!=np.object]
cat_df = df.loc[:,df.dtypes==np.object]

In [16]:
# num_df.head()
cat_df.head()

Unnamed: 0,Gender,Location,famsize,paid,activities,nursery,higher,internet
0,F,U,GT3,no,no,yes,yes,no
1,F,U,GT3,no,no,no,yes,yes
2,F,U,LE3,no,no,yes,yes,yes
3,F,U,GT3,no,yes,yes,yes,yes
4,F,U,GT3,no,no,yes,yes,no


In [17]:
le = LabelEncoder()

In [18]:
cat_df = cat_df.apply(le.fit_transform)
cat_df.head()

Unnamed: 0,Gender,Location,famsize,paid,activities,nursery,higher,internet
0,0,1,0,0,0,1,1,0
1,0,1,0,0,0,0,1,1
2,0,1,1,0,0,1,1,1
3,0,1,0,0,1,1,1,1
4,0,1,0,0,0,1,1,0


In [19]:
le.classes_

array(['no', 'yes'], dtype=object)

In [20]:
# saving label enconder
output = open('model/labelEncoder.pkl', 'wb')
joblib.dump(le, output)
output.close()

In [21]:
df = num_df.join(cat_df)

In [22]:
# df.info()
df.head()

Unnamed: 0,Age,traveltime,studytime,failures,freetime,health,Gender,Location,famsize,paid,activities,nursery,higher,internet
0,13,2,2,0,3,3,0,1,0,0,0,1,1,0
1,12,1,2,0,3,3,0,1,0,0,0,0,1,1
2,10,1,2,0,3,3,0,1,1,0,0,1,1,1
3,10,1,3,0,2,5,0,1,0,0,1,1,1,1
4,11,1,2,0,3,5,0,1,0,0,0,1,1,0


In [23]:
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.3,random_state=109)

RANDOM FOREST CLASSIFIER

In [24]:
model = RandomForestClassifier()
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [25]:
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)

In [26]:
CV_rfc.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [27]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [28]:
pipeline = Pipeline(steps=[('model', RandomForestClassifier(criterion= 'gini', 
                                                            max_depth= 4,
                                                            max_features= 'auto',
                                                            n_estimators= 500)
                           )])

In [29]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('model',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=4,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=500, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [30]:
y_predict = pipeline.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

Accuracy: 0.05102040816326531


In [32]:
#classification error
print("Classification error is",1- metrics.accuracy_score(y_test, y_predict, normalize = True))
#sensitivity
print("sensitivity is", metrics.recall_score(y_test, y_predict, labels=None, average =  'micro', sample_weight=None))
#specificity
print("specificity is", 1 - metrics.recall_score(y_test, y_predict,labels=None, average =  'micro', sample_weight=None))

Classification error is 0.9489795918367347
sensitivity is 0.05102040816326531
specificity is 0.9489795918367347


In [33]:
output = open('model/model2.pkl', 'wb')
joblib.dump(pipeline, output)
output.close()