In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV , cross_val_score
from sklearn.metrics import cohen_kappa_score , make_scorer
from xgboost import XGBClassifier
from sklearn.decomposition import PCA , TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [27]:
steel_data = pd.read_csv("Faults.NNA" , sep='\s+' , header=None)

In [28]:
steel_data.shape

(1941, 34)

In [29]:
steel_data.columns# The classes should be randomised but it data it is not randomised . So randomising it 
# frac = entire data will be choosen but the datpoints will be mixed

steel_data_shuffled = steel_data.sample(frac=1.0 , random_state=12345)

In [30]:
# The datase has 7 different types of Faults and it has to be put in a single column by argmax func.
# Sofirst converting it to array 

arr = np.array(steel_data)

In [31]:
steel_data.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            33],
           dtype='int64')

In [32]:
# Last columns are dummified class information (Faults) - dependent variables
# Moving from last
# iloc and loc is ONLY for Dataframes 
# For arrays we can use directly
# DataFrame - Different types of datatypes
# Array - ONLY one type of datatype

X = arr[:,:27] # It will start from 0 column until 26 rows
Y = arr[:,27:] # It will start from 28th column(ie 27)

In [33]:
Y.shape

(1941, 7)

In [34]:
# Converting 7 Dimensional data into a single Dimension
# Converting dummified column into a single column

y = np.argmax(Y, axis=1) # Compare column by column and the column having high value corressponding INDEX will be returned 

In [35]:
len(y)

1941

In [36]:
y.shape # We need to supply One dimensional column for SKlearn type of modelling . Fo NN any dimensions can be supplied

(1941,)

In [39]:
# For Training data =  its fit_transform
# For Test data = its ONLY transform 
# Eg. sc = StandardScaler()
#     X_scaled_train = sc.fit_transform(X_train)
#     X_scaled_test = sc.fit_transform(X_test) --> WRONG by Logic
#     X_scaled_test = sc.transform(X_test)     --> RIGHT

X_scaled = StandardScaler().fit_transform(X)

## Gradient Boosting & XGBOOST

In [40]:
gbm = GradientBoostingClassifier()  # Loss = Deviance , Learning_rate = 0.1 (constant and not steepest descent)
xgb = XGBClassifier()

In [41]:
best_gbm = GridSearchCV(gbm , param_grid={'learning_rate': [0.01,0.05,0.1],
                                          'max_depth':[1,2,3],
                                          'n_estimators':[100,200,500] }, cv =5 , n_jobs=1 )

In [42]:
best_xgb = GridSearchCV(xgb , param_grid={'learning_rate': [0.01,0.05,0.1],
                                          'max_depth':[1,2,3],
                                          'n_estimators':[100,200,500] }, cv =5 , n_jobs=1 )

In [43]:
best_gbm.fit(X_scaled,y)
best_xgb.fit(X_scaled,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                   

In [44]:
# cohen_kappa_score is for imbalance data and multiclass problems

def kappa_score(y_actual,y_pred):
    return cohen_kappa_score(y_actual,y_pred)

Kappa = make_scorer(kappa_score)

In [47]:
cross_val_score(best_gbm.best_estimator_ , X = X_scaled , y=y , cv=5 , scoring= Kappa , n_jobs=-1) 

array([0.36883177, 0.53820413, 0.54041155, 0.66233697, 0.39831286])

In [48]:
cross_val_score(best_xgb.best_estimator_ , X = X_scaled , y=y , cv=5 , scoring= Kappa , n_jobs=-1) 

array([0.4152731 , 0.55880491, 0.52220041, 0.62620278, 0.45631827])