### Section 2: ML Practical

#### Part 1: Loading all the Python libraries for the ML Model 

In [1]:
#Importing library for loading the data into Python
import pandas as pd

#Importing library for Pre-processing of the data
from sklearn import preprocessing

#Importing library for getting all the models
from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier

#Importing library for splitting the data for learning and testing phases
from sklearn.model_selection import train_test_split

#Importing library for getting all the metrics for performance evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

#Importing library for saving the models with the results
#import pickle
#import zipfile
import os
import joblib
import argparse

#### Part 2: Data Ingestion and Preprocessing 

In [2]:
#Loading the dataset and print the data types of the columns...
data = pd.read_csv("data1.csv")
df = data.infer_objects()
print(df.dtypes[0:14])

f1      int64
f2     object
f3      int64
f4     object
f5      int64
f6     object
f7     object
f8     object
f9     object
f10    object
f11     int64
f12     int64
f13     int64
f14    object
dtype: object


In [3]:
# Removing the empty instances (or instances with white columns) of the dataset (lines in the file)
for i in range (0,14):
    if df.dtypes[i] != 'int64':
        data.iloc[:,i] = df.iloc[:,i].map(lambda x:x.strip())
array = data
array.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,1


In [4]:
# Seperating the input columns and the target (output) columns...
inputs = array.drop('class', axis='columns')
target = array['class']

In [5]:
#Converting categorical variables into non-categorical counterparts...

labelenc = preprocessing.LabelEncoder()

X= inputs.values
Y= target.values

for i in range (0,14):
    X[:,i] = labelenc.fit_transform(X[:,i])
    
#This is how X (input array) and Y (output array) now looks like...
print(X)

[[22 7 2671 ... 0 39 39]
 [33 6 2926 ... 0 12 39]
 [21 4 14086 ... 0 39 39]
 ...
 [41 4 7883 ... 0 39 39]
 [5 4 12881 ... 0 19 39]
 [35 5 17825 ... 0 39 39]]


In [6]:
print(Y)

[1 1 1 ... 1 1 0]


#### Part 3: Splitting the Dataset

In [7]:
# Splitting the dataset into a Validation Set...
test_ratio = 0.2
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = test_ratio)

In [8]:
#Visualizing the shape of the Training Dataset...
print(X_train.shape)
print(Y_train.shape)

(26048, 14)
(26048,)


In [9]:
#Visualizing the shape of the Testing Dataset...
print(X_test.shape)
print(Y_test.shape)

(6513, 14)
(6513,)


#### Part 4: Training and Evaluating the Model 

In [10]:
M1 = DecisionTreeClassifier(criterion='entropy',max_features=13,max_depth=10)

In [11]:
#Fitting and Testing the Decision Tree Model (M1)...
M1 = M1.fit(X_train,Y_train)
M1_pred = M1.predict(X_test)

In [12]:
print("--------------------Model 1: Decision Tree--------------------------")
print("Accuracy Score: ", accuracy_score(Y_test,M1_pred)*100,"%")
print("Confusion Matrix: \n", confusion_matrix(Y_test,M1_pred))
print("Hamming Loss: ", hamming_loss(Y_test,M1_pred)*100,"%")
print("F1 Score: ", f1_score(Y_test,M1_pred))
print("Average Precision Curve: ", average_precision_score(Y_test,M1_pred))

--------------------Model 1: Decision Tree--------------------------
Accuracy Score:  85.10670965760787 %
Confusion Matrix: 
 [[ 936  620]
 [ 350 4607]]
Hamming Loss:  14.893290342392138 %
F1 Score:  0.9047525530243519
Average Precision Curve:  0.8728916376063398


In [13]:
#Saving the first Model into a Pickle File...
# print("Export the model to DTmodel.pkl")
# f1= open('M1.pkl','wb')
# pickle.dump(M1,f1)
# f1.close
print("Export the model to DTmodel.pkl")
joblib.dump(M1, "DTmodel.pkl")

Export the model to DTmodel.pkl


['DTmodel.pkl']

In [14]:
# #Saving the first Model into a Zip File...
# zipfile.ZipFile('model1.zip',mode='w').write('M1.pkl')