In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

In [9]:
df=pd.read_csv('processed_outliers.csv')
print('Data shape:',df.shape)

Data shape: (1159, 22)


In [10]:
list(enumerate(df.columns))

[(0, 'w/b'),
 (1, 'Water'),
 (2, 'Cement type'),
 (3, 'Cement'),
 (4, 'Slag'),
 (5, 'Fly ash'),
 (6, 'Silica fume'),
 (7, 'Lime filler'),
 (8, 'FA'),
 (9, 'CA'),
 (10, 'Plasticizer'),
 (11, 'Superplasticizer'),
 (12, 'Air entraining'),
 (13, 'Comp. str. test age'),
 (14, 'Compressive strength'),
 (15, 'Air content'),
 (16, 'Spreed'),
 (17, 'Slump'),
 (18, 'Fresh density'),
 (19, 'Dry  density'),
 (20, 'Migration test age'),
 (21, 'Migration resistance')]

1-hot encoding

In [11]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
X=ct.fit_transform(df)
# remove dummy variable and output
y=X[:,-1]
X=X[:,1:-1]
print(X.shape)
print(y.shape)

(1159, 30)
(1159,)


ANN

In [12]:
def get_ann(optimizer='rmsprop', init='glorot_uniform'):
    ann=keras.models.Sequential()
    ann.add(keras.layers.Dense(60,activation='relu',kernel_initializer=init,
                            input_shape=(X.shape[1],)))
    ann.add(keras.layers.Dense(32,activation='relu'))
    ann.add(keras.layers.Dense(32,activation='relu'))
    ann.add(keras.layers.Dense(5,activation='softmax'))
    ann.compile(optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return ann

In [13]:
param_grid={
    'optimizer': ['rmsprop', 'adam'],
    'model__init': ['glorot_uniform', 'normal', 'uniform'],
    'epochs': [200,400],
    'batch_size': [32,64]
}

splits=list(StratifiedKFold(shuffle=True,random_state=0).split(X,y))
train_index, test_index = splits[0]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

def train_ann(X_train,y_train):
    grid=GridSearchCV(KerasClassifier(get_ann,random_state=0,verbose=0),
                      param_grid,cv=5,n_jobs=3,scoring='accuracy')
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    return grid

def cv_scores(X,y,model):
    acc_test=[]
    acc_train=[]
    for train_index, test_index in StratifiedKFold(shuffle=True,random_state=1).split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        model.fit(X_train,y_train)
        acc_test.append(accuracy_score(y_test,model.predict(X_test)))
        acc_train.append(accuracy_score(y_train,model.predict(X_train)))
    print('Train acc:',np.mean(acc_train))
    print('Test acc:',np.mean(acc_test))

In [14]:
grid=train_ann(X_train,y_train)

{'batch_size': 32, 'epochs': 400, 'model__init': 'normal', 'optimizer': 'rmsprop'}
0.8080790467887242


In [15]:
best_model=KerasClassifier(get_ann,random_state=0,verbose=0,**grid.best_params_)
best_model.fit(X_train,y_train)
accuracy_score(y_test,best_model.predict(X_test))

0.896551724137931

In [16]:
cv_scores(X,y,best_model)

Train acc: 0.9950382211806719
Test acc: 0.8852142110762801


Removing cement type

In [17]:
X=df.drop('Cement type',axis=1).iloc[:,:-1].values
y=df.iloc[:,-1].values
print(X.shape)
print(y.shape)

(1159, 20)
(1159,)


In [18]:
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
grid=train_ann(X_train,y_train)



{'batch_size': 32, 'epochs': 200, 'model__init': 'uniform', 'optimizer': 'rmsprop'}
0.8415228131357164


In [21]:
best_model=KerasClassifier(get_ann,random_state=0,verbose=0,**grid.best_params_)
best_model.fit(X_train,y_train)
accuracy_score(y_test,best_model.predict(X_test))

0.8922413793103449

In [22]:
cv_scores(X,y,best_model)

Train acc: 0.9887840363798684
Test acc: 0.8930325421704733
