In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier , BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV , cross_val_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Faults.NNA" , sep='\s+' , header=None)
names = pd.read_csv("Faults27x7_var" ,header=None )
df.columns = names.iloc[:,0]

In [3]:
df.head(10)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,0
5,430,441,100250,100337,630,20,87,62357,64,127,...,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0
6,413,446,138468,138883,9052,230,432,1481991,23,199,...,0.9205,0.2791,1.0,1,0,0,0,0,0,0
7,190,200,210936,210956,132,11,20,20007,124,172,...,0.5,0.1841,0.3359,1,0,0,0,0,0,0
8,330,343,429227,429253,264,15,26,29748,53,148,...,0.5,-0.1197,0.5593,1,0,0,0,0,0,0
9,74,90,779144,779308,1506,46,167,180215,53,143,...,0.9024,-0.0651,1.0,1,0,0,0,0,0,0


In [4]:
df_shuffled = df.sample(frac=1.0)

In [5]:
df_shuffled.head(10)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
788,297,302,1028213,1028219,16,5,6,1990,120,132,...,0.1667,-0.0283,0.14,0,0,0,1,0,0,0
1176,1077,1090,699354,699365,114,15,11,11778,71,132,...,-0.1539,-0.1928,0.257,0,0,0,0,0,1,0
1341,827,834,1930709,1930720,56,11,11,6337,97,132,...,0.3636,-0.1159,0.1822,0,0,0,0,0,0,1
1552,23,32,893638,893649,63,10,11,7769,111,140,...,0.1818,-0.0366,0.2051,0,0,0,0,0,0,1
1167,1272,1286,290640,290652,125,15,12,12560,61,125,...,-0.1429,-0.215,0.2901,0,0,0,0,0,1,0
360,857,865,211983,211993,52,8,10,9968,178,207,...,0.2,0.4976,0.1852,0,0,1,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
487,37,194,1881566,1881746,12489,700,462,1439221,31,141,...,0.1278,-0.0997,1.0,0,0,1,0,0,0,0
698,41,216,2985655,2985724,7003,252,138,734568,40,127,...,-0.6057,-0.1805,1.0,0,0,1,0,0,0,0
1760,79,86,2887095,2887114,86,17,19,9690,102,123,...,0.6316,-0.1197,0.2445,0,0,0,0,0,0,1


In [6]:
X = df_shuffled.loc[:,'X_Minimum':'SigmoidOfAreas']

In [7]:
Y = df_shuffled.loc[:,'Pastry':'Other_Faults']

In [8]:
y= np.argmax(np.array(Y),axis=1) # since there are more than 2 dependent/predictor variables we are trying to make it 1
y

array([3, 5, 6, ..., 0, 5, 6], dtype=int64)

In [9]:
# Instances of the base classifiers 

nb = GaussianNB()
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

In [35]:
cross_val_score(estimator=nb,X=X,y=y,cv=5,n_jobs=-1)

array([0.48081841, 0.45641026, 0.40463918, 0.46632124, 0.4507772 ])

In [36]:
boosted_nb =AdaBoostClassifier(base_estimator=nb , n_estimators=50 , random_state=321)

In [37]:
cross_val_score(estimator=boosted_nb,X=X,y=y,cv=5,n_jobs=-1) 

# the boosted score is very less because the base classifier itself provide accuracy less than 0.5
# in general the BC should give accuracy better than the random guess(i.e 0.5) 

array([0.35294118, 0.44358974, 0.26804124, 0.33937824, 0.34715026])

In [38]:
best_tree = GridSearchCV(tree, param_grid={'max_depth':[1,2,5,7,12,15,18,20,26],
                                            'criterion':['gini','entropy']} , cv=5 , n_jobs=-1)

# 9*2=18 different combinations

In [39]:
best_tree.fit(X=X,y=y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 5, 7, 12, 15, 18, 20, 26]},
      

In [40]:
best_tree.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=15,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [41]:
best_tree.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [45]:
boosted_tree=AdaBoostClassifier(base_estimator=best_tree.best_estimator_ , n_estimators=500)

In [46]:
cross_val_score(estimator=best_tree.best_estimator_,X=X,y=y,cv=5,n_jobs=-1)

array([0.75959079, 0.75897436, 0.70618557, 0.76683938, 0.69948187])

In [47]:
cross_val_score(estimator=boosted_tree,X=X,y=y,cv=5,n_jobs=-1)

array([0.70332481, 0.7       , 0.67783505, 0.76165803, 0.71502591])

In [49]:
# Bagging
#Since it reduces variance error it is allowed to grow to its maximum depth

bagged_tree = BaggingClassifier(base_estimator=tree , n_estimators=100)

In [50]:
cross_val_score(estimator=bagged_tree,X=X,y=y,cv=5,n_jobs=-1)

array([0.77493606, 0.77179487, 0.76030928, 0.79792746, 0.80051813])