In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report,roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
# Read the pre-processed dataset (path is relative to the working directory)
# and report its dimensions.
data_path = 'processed_outliers.csv'
df = pd.read_csv(data_path)
print('Data shape:', df.shape)

Data shape: (1159, 22)


In [3]:
# Show every column name alongside its positional index.
[(position, name) for position, name in enumerate(df.columns)]

[(0, 'w/b'),
 (1, 'Water'),
 (2, 'Cement type'),
 (3, 'Cement'),
 (4, 'Slag'),
 (5, 'Fly ash'),
 (6, 'Silica fume'),
 (7, 'Lime filler'),
 (8, 'FA'),
 (9, 'CA'),
 (10, 'Plasticizer'),
 (11, 'Superplasticizer'),
 (12, 'Air entraining'),
 (13, 'Comp. str. test age'),
 (14, 'Compressive strength'),
 (15, 'Air content'),
 (16, 'Spreed'),
 (17, 'Slump'),
 (18, 'Fresh density'),
 (19, 'Dry  density'),
 (20, 'Migration test age'),
 (21, 'Migration resistance')]

One-hot encoding of the cement type

In [4]:
# One-hot encode the categorical 'Cement type' column; all other feature
# columns are passed through unchanged.
#  - The column is selected by name, so the cell keeps working if the column
#    order of the CSV changes.
#  - drop='first' removes one dummy column per encoded feature (this replaces
#    the manual "skip column 0" slice on the transformer output).
#  - sparse_threshold=0 forces a dense array, so later fancy indexing with
#    the train/test indices is always valid.
target_col = 'Migration resistance'
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), ['Cement type'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# The target is split off before the transform instead of being sliced out of
# the transformed array, so the labels keep the dtype they have in the frame.
X = ct.fit_transform(df.drop(columns=target_col))
y = df[target_col].to_numpy()
print(X.shape)
print(y.shape)

(1159, 30)
(1159,)


XGBoost classifier: grid search and evaluation

In [5]:
# Hyper-parameter grid explored by the grid search. A None entry passes None
# to the estimator for that setting (presumably the library default; the
# original note gives default=100 for n_estimators).
#
# Candidates currently excluded from the search:
#   reg_lambda:       [None, .1, 1, 10, 100]
#   reg_alpha:        [None, .01, .1, 1]
#   colsample_bytree: [None, .2, .5, .8]
#   min_child_weight: [None, .1, .5, 2, 4]   (default=1)
param_grid = dict(
    n_estimators=[None, 10, 50, 200, 300],
    max_depth=[None, 2, 6, 10, 20],
    max_leaves=[None, 10, 20],
    learning_rate=[None, 0.5, 1],
    subsample=[None, 0.5, 0.8],
    gamma=[None, 0.1, 0.5, 1],
)

# Use the first fold of a shuffled, stratified K-fold split as the held-out
# test set; the remaining rows form the training set.
skf = StratifiedKFold(shuffle=True, random_state=0)
splits = list(skf.split(X, y))
train_index, test_index = splits[0]
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Standardise the features: statistics are learned from the training rows
# only and then applied to both partitions.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

def train_xgb(X_train, y_train, grid_params=None, cv=5, scoring='accuracy', search_jobs=3):
    """Grid-search an XGBClassifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training class labels.
    grid_params : dict, optional
        Hyper-parameter grid handed to GridSearchCV. When omitted, the
        notebook-level ``param_grid`` is used.
    cv : int or CV splitter, default 5
        Cross-validation strategy handed to GridSearchCV.
    scoring : str, default 'accuracy'
        Metric used to rank the parameter combinations.
    search_jobs : int, default 3
        Number of parallel workers used by the grid search.

    Returns
    -------
    GridSearchCV
        The fitted search object. The best parameter combination and its
        mean cross-validated score are also printed.
    """
    if grid_params is None:
        grid_params = param_grid

    # Parallelism lives at the search level only: the booster is kept
    # single-threaded so the search workers do not compete for the same
    # cores (nested n_jobs=-1 inside a parallel grid search causes thread
    # contention).
    clf = XGBClassifier(random_state=0, n_jobs=1)
    grid = GridSearchCV(clf, grid_params, cv=cv, n_jobs=search_jobs, scoring=scoring)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    return grid

In [6]:
# Run the grid search on the training partition; the call prints the best
# parameter combination and its cross-validated score.
grid = train_xgb(X_train=X_train, y_train=y_train)

{'gamma': None, 'learning_rate': None, 'max_depth': 20, 'max_leaves': None, 'n_estimators': 300, 'subsample': 0.5}
0.7714850334205173


In [6]:
# Best configuration, copied by hand from the printed grid-search result
# (presumably so this cell can be re-run without repeating the search).
best_params = {
    'gamma': None,
    'learning_rate': None,
    'max_depth': 20,
    'max_leaves': None,
    'n_estimators': 300,
    'subsample': 0.5,
}

# Refit on the full training partition and score on the held-out fold.
best_model = XGBClassifier(random_state=0, n_jobs=-1, **best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# One-vs-rest ROC AUC computed from the predicted class probabilities;
# left as the last expression so the notebook displays it.
y_proba = best_model.predict_proba(X_test)
roc_auc_score(y_test, y_proba, multi_class='ovr')

              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90        46
         1.0       0.81      0.74      0.78        47
         2.0       0.73      0.76      0.74        46
         3.0       0.85      0.87      0.86        47
         4.0       0.96      1.00      0.98        46

    accuracy                           0.85       232
   macro avg       0.85      0.85      0.85       232
weighted avg       0.85      0.85      0.85       232



0.979857002716064

Repeating the experiment without the cement type feature

In [7]:
# Rebuild the inputs without the categorical 'Cement type' column.
# The last column of the frame is the target; everything before it
# (minus the dropped column) is a feature.
features = df.drop(columns='Cement type')
X = features.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()
print(X.shape)
print(y.shape)

(1159, 20)
(1159,)


In [8]:
# Re-use the train/test row indices computed earlier so both experiments
# are evaluated on the same partition.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Fresh scaler: fitted on the training rows only, applied to both partitions.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Repeat the grid search on the reduced feature set; the call prints the
# best parameter combination and its cross-validated score.
grid = train_xgb(X_train=X_train, y_train=y_train)

{'gamma': None, 'learning_rate': 1, 'max_depth': None, 'max_leaves': None, 'n_estimators': None, 'subsample': None}
0.7768497529787852


In [9]:
# Best configuration for the reduced feature set, copied by hand from the
# printed grid-search result (presumably so this cell can be re-run without
# repeating the search).
best_params = {
    'gamma': None,
    'learning_rate': 1,
    'max_depth': None,
    'max_leaves': None,
    'n_estimators': None,
    'subsample': None,
}

# Refit on the full training partition and score on the held-out fold.
best_model = XGBClassifier(random_state=0, n_jobs=-1, **best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# One-vs-rest ROC AUC computed from the predicted class probabilities;
# left as the last expression so the notebook displays it.
y_proba = best_model.predict_proba(X_test)
roc_auc_score(y_test, y_proba, multi_class='ovr')

              precision    recall  f1-score   support

           0       0.98      0.89      0.93        46
           1       0.84      0.77      0.80        47
           2       0.75      0.83      0.78        46
           3       0.84      0.89      0.87        47
           4       0.98      0.98      0.98        46

    accuracy                           0.87       232
   macro avg       0.88      0.87      0.87       232
weighted avg       0.88      0.87      0.87       232



0.9813678095749653