In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import cohen_kappa_score , make_scorer

In [2]:
df = pd.read_csv("Faults.NNA" , sep='\s+' , header=None)
names = pd.read_csv("Faults27x7_var" ,header=None )
df.columns = names.iloc[:,0]

In [3]:
df.head(10)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,0
5,430,441,100250,100337,630,20,87,62357,64,127,...,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0
6,413,446,138468,138883,9052,230,432,1481991,23,199,...,0.9205,0.2791,1.0,1,0,0,0,0,0,0
7,190,200,210936,210956,132,11,20,20007,124,172,...,0.5,0.1841,0.3359,1,0,0,0,0,0,0
8,330,343,429227,429253,264,15,26,29748,53,148,...,0.5,-0.1197,0.5593,1,0,0,0,0,0,0
9,74,90,779144,779308,1506,46,167,180215,53,143,...,0.9024,-0.0651,1.0,1,0,0,0,0,0,0


In [4]:
df_shuffled = df.sample(frac=1.0)
df_shuffled.head(10)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
192,436,442,530968,530985,63,17,17,6374,90,116,...,0.6471,-0.2096,0.2083,0,1,0,0,0,0,0
631,41,209,1853674,1853744,6484,223,117,666047,40,126,...,-0.5833,-0.1975,1.0,0,0,1,0,0,0,0
288,82,98,1951256,1951299,490,38,44,50867,86,118,...,0.6279,-0.189,0.929,0,1,0,0,0,0,0
466,39,200,1576208,1576347,10796,572,363,1245388,38,141,...,-0.1366,-0.0988,1.0,0,0,1,0,0,0,0
1278,1611,1619,1671950,1671963,69,10,14,5361,63,93,...,0.3846,-0.393,0.2105,0,0,0,0,0,0,1
729,238,354,87744,87794,3288,191,114,260660,36,103,...,-0.569,-0.3807,1.0,0,0,1,0,0,0,0
424,41,222,990300,990473,15896,756,469,1847464,22,143,...,-0.0442,-0.092,1.0,0,0,1,0,0,0,0
1529,1672,1712,116120,116370,5323,245,395,460547,19,120,...,0.84,-0.3241,1.0,0,0,0,0,0,0,1
1035,1538,1562,830518,830524,51,25,11,4561,77,103,...,-0.75,-0.3013,0.2583,0,0,0,0,0,1,0
972,57,64,2014358,2014368,59,11,10,7052,107,134,...,0.3,-0.0662,0.1753,0,0,0,0,0,1,0


In [5]:
X = df_shuffled.loc[:,'X_Minimum':'SigmoidOfAreas']
Y = df_shuffled.loc[:,'Pastry':'Other_Faults']

In [6]:
y= np.argmax(np.array(Y),axis=1) # since there are more than 2 dependent/predictor variables we are trying to make it 1
y

array([1, 2, 1, ..., 2, 6, 5], dtype=int64)

In [7]:
# Instances of the base classifiers 

nb = GaussianNB()
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

In [8]:
cross_val_score(estimator=nb,X=X,y=y,cv=5,n_jobs=-1)

array([0.43222506, 0.5       , 0.44072165, 0.44300518, 0.4611399 ])

In [9]:
boosted_nb =AdaBoostClassifier(base_estimator=nb , n_estimators=50 , random_state=321)

In [10]:
cross_val_score(estimator=boosted_nb,X=X,y=y,cv=5,n_jobs=-1) 

# the boosted score is very less because the base classifier itself provide accuracy less than 0.5
# in general the BC should give accuracy better than the random guess(i.e 0.5) 

array([0.44757033, 0.39230769, 0.38402062, 0.38341969, 0.43782383])

In [11]:
best_tree = GridSearchCV(tree, param_grid={'max_depth':[1,2,5,7,12,15,18,20,26],
                                            'criterion':['gini','entropy']} , cv=5 , n_jobs=-1)

# 9*2=18 different combinations

In [12]:
best_tree.fit(X=X,y=y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 5, 7, 12, 15, 18, 20, 26]},
      

In [13]:
best_tree.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=18,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
best_tree.best_params_

{'criterion': 'entropy', 'max_depth': 18}

In [15]:
boosted_tree=AdaBoostClassifier(base_estimator=best_tree.best_estimator_ , n_estimators=500)

In [16]:
cross_val_score(estimator=best_tree.best_estimator_,X=X,y=y,cv=5,n_jobs=-1)

array([0.77237852, 0.71794872, 0.7242268 , 0.74611399, 0.75906736])

In [17]:
cross_val_score(estimator=boosted_tree,X=X,y=y,cv=5,n_jobs=-1)

array([0.71355499, 0.72051282, 0.69845361, 0.72797927, 0.76683938])

In [18]:
# Bagging
#Since it reduces variance error it is allowed to grow to its maximum depth

bagged_tree = BaggingClassifier(base_estimator=tree , n_estimators=100)

In [19]:
cross_val_score(estimator=bagged_tree,X=X,y=y,cv=5,n_jobs=-1)

array([0.77493606, 0.82051282, 0.74742268, 0.78238342, 0.80829016])

## KNN Bagging on May26

In [20]:
X_scaled = StandardScaler().fit_transform(X)

In [21]:
knn_bagging = BaggingClassifier(base_estimator=knn , n_jobs=-1 , random_state=123)

In [22]:
# Since optimising entire ensemble using Gridsearch we params

params = {"base_estimator__n_neighbors": [1,3,5,7,9,11] , 
          "base_estimator__weights": ['uniform' , 'distance'],
          "n_estimators": [100,200,500]
         }

In [23]:
# cohen_kappa_score is for imbalance data and multiclass problems

def kappa_score(y_actual,y_pred):
    return cohen_kappa_score(y_actual,y_pred)

Kappa = make_scorer(kappa_score)

In [24]:
best_knn_bagging = GridSearchCV(estimator=knn_bagging, 
                                param_grid=params,
                                scoring = Kappa,
                                cv=5
                               )

In [None]:
best_knn_bagging.fit(X=X_scaled,y=y)

In [None]:
best_knn_bagging.best_params_