In [18]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Load the pre-processed dataset and report its (rows, columns) shape.
df = pd.read_csv('processed.csv')
print('Data shape:', df.shape)

Data shape: (1069, 22)


In [20]:
# Show every column next to its positional index.
[(position, column) for position, column in enumerate(df.columns)]

[(0, 'w/b'),
 (1, 'Water'),
 (2, 'Cement type'),
 (3, 'Cement'),
 (4, 'Slag'),
 (5, 'Fly ash'),
 (6, 'Silica fume'),
 (7, 'Lime filler'),
 (8, 'FA'),
 (9, 'CA'),
 (10, 'Plasticizer'),
 (11, 'Superplasticizer'),
 (12, 'Air entraining'),
 (13, 'Comp. str. test age'),
 (14, 'Compressive strength'),
 (15, 'Air content'),
 (16, 'Spreed'),
 (17, 'Slump'),
 (18, 'Fresh density'),
 (19, 'Dry  density'),
 (20, 'Migration test age'),
 (21, 'Migration resistance')]

1-hot encoding

In [21]:
# One-hot encode the categorical 'Cement type' column. It is selected by name
# rather than by position so the cell does not silently encode the wrong
# column if the CSV layout changes. The remaining columns are passed through
# and appended to the right of the encoded indicator columns.
# sparse_threshold=0 forces a dense ndarray: with the default threshold the
# transformer may return a scipy sparse matrix for low-density data, and the
# slicing below (and the later StandardScaler) rely on a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Cement type'])],
    remainder='passthrough',
    sparse_threshold=0,
)
X=ct.fit_transform(df)
# The last passthrough column is the target ('Migration resistance' is the
# final column of df).
y=X[:,-1]
# Drop the first indicator column (dummy-variable reference level) and the
# target column, leaving only the features.
X=X[:,1:-1]
print(X.shape)
print(y.shape)

(1069, 30)
(1069,)


Decision Tree

In [22]:
# Hyper-parameter search space for the DecisionTreeClassifier grid search.
# 'log_loss' is deliberately not listed under 'criterion': scikit-learn treats
# "entropy" and "log_loss" as the same Shannon-information-gain criterion, so
# including both only repeats identical fits and adds a third more grid points.
param_grid={
    'criterion': ['gini', 'entropy'],
    # Cost-complexity pruning strengths: 0.1, 0.01, 0.001.
    'ccp_alpha': np.logspace(-1,-3,3),
    # None lets the tree grow until the other stopping rules apply.
    'max_depth': [None, 5, 6, 7, 8, 9, 10, 20, 40, 100, 200, 500],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': range(1,7),
    # None means every feature is considered at each split.
    'max_features': [None,1,2,3,4]
}

# Use the first fold of a shuffled, stratified K-fold split as a fixed
# hold-out set (seeded so the split is reproducible).
splits = [fold for fold in StratifiedKFold(shuffle=True, random_state=0).split(X, y)]
train_index, test_index = splits[0]
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
# Standardise using statistics learned from the training rows only, then
# apply the same transformation to the hold-out rows.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

def train_dtree(X_train,y_train):
    """Grid-search decision-tree hyper-parameters and return the fitted search.

    Runs an accuracy-scored GridSearchCV (cv=5, n_jobs=3) over the
    notebook-level ``param_grid`` with a DecisionTreeClassifier seeded with
    random_state=0. Prints the best parameter combination followed by its
    best cross-validation score, then returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=0),
        param_grid=param_grid,
        scoring='accuracy',
        n_jobs=3,
        cv=5,
    )
    search.fit(X_train, y_train)
    for summary in (search.best_params_, search.best_score_):
        print(summary)
    return search

def cv_scores(X,y,model):
    """Print the mean train and test accuracy of ``model`` under stratified K-fold CV.

    For every fold produced by ``StratifiedKFold(shuffle=True, random_state=1)``
    a fresh StandardScaler is fit on the training rows only and applied to both
    parts, an unfitted copy of ``model`` is trained on the scaled training rows,
    and accuracy is measured on both parts. The per-fold accuracies are averaged
    and printed.

    The estimator passed in is never fitted here: each fold trains
    ``clone(model)``, so a model the caller has already trained (for example on
    a hold-out training set) keeps that fit instead of being silently replaced
    by the fit from the last CV fold.

    Parameters
    ----------
    X : array-like supporting integer-array row indexing
        Feature matrix, one row per sample.
    y : array-like supporting integer-array indexing
        Class labels aligned with the rows of ``X``.
    model : estimator exposing ``fit`` and ``predict``
        Used only as a template for the per-fold copies.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    acc_test=[]
    acc_train=[]
    for train_index, test_index in StratifiedKFold(shuffle=True,random_state=1).split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Fit the scaler inside the fold so no test-row statistics leak in.
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        # Train an unfitted copy so the caller's estimator is left untouched.
        fold_model = clone(model)
        fold_model.fit(X_train,y_train)
        acc_test.append(accuracy_score(y_test,fold_model.predict(X_test)))
        acc_train.append(accuracy_score(y_train,fold_model.predict(X_train)))
    print('Train acc:',np.mean(acc_train))
    print('Test acc:',np.mean(acc_test))

In [23]:
# Run the hyper-parameter search on the scaled hold-out training set.
grid = train_dtree(X_train=X_train, y_train=y_train)

{'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.7672514619883041


In [24]:
# Build a tree from the winning hyper-parameters of the grid search (same
# seed as the search), train it on the hold-out training set and display
# its accuracy on the hold-out test set.
best_model = DecisionTreeClassifier(random_state=0, **grid.best_params_)
best_model.fit(X_train, y_train)
accuracy_score(y_test, best_model.predict(X_test))

0.7990654205607477

In [25]:
# Cross-validate the selected configuration on the full (unscaled) data set.
cv_scores(X=X, y=y, model=best_model)

Train acc: 0.9679608679018419
Test acc: 0.8184809793339477


Removing cement type

In [27]:
# Features: every column except 'Cement type' and the final (target) column.
features_without_cement = df.drop(columns='Cement type').iloc[:, :-1]
X = features_without_cement.to_numpy()
# Target: the last column of the original frame.
y = df.iloc[:, -1].to_numpy()
for array in (X, y):
    print(array.shape)

(1069, 20)
(1069,)


In [28]:
# Re-use the hold-out indices computed for the earlier split so both feature
# sets are evaluated on exactly the same rows.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
# Standardise with statistics from the training rows only.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
grid = train_dtree(X_train, y_train)

{'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7602339181286549


In [29]:
# Build a tree from the winning hyper-parameters of the latest grid search
# (same seed as the search), train it on the hold-out training set and
# display its accuracy on the hold-out test set.
best_model = DecisionTreeClassifier(random_state=0, **grid.best_params_)
best_model.fit(X_train, y_train)
accuracy_score(y_test, best_model.predict(X_test))

0.8364485981308412

In [30]:
# Cross-validate the selected configuration on the data without cement type.
cv_scores(X=X, y=y, model=best_model)

Train acc: 1.0
Test acc: 0.8259795533324559
