In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read processed.csv into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv('processed.csv')
print('Data shape:', df.shape)

Data shape: (1069, 22)


In [3]:
# Display every column name next to its positional index.
[(position, column) for position, column in enumerate(df.columns)]

[(0, 'w/b'),
 (1, 'Water'),
 (2, 'Cement type'),
 (3, 'Cement'),
 (4, 'Slag'),
 (5, 'Fly ash'),
 (6, 'Silica fume'),
 (7, 'Lime filler'),
 (8, 'FA'),
 (9, 'CA'),
 (10, 'Plasticizer'),
 (11, 'Superplasticizer'),
 (12, 'Air entraining'),
 (13, 'Comp. str. test age'),
 (14, 'Compressive strength'),
 (15, 'Air content'),
 (16, 'Spreed'),
 (17, 'Slump'),
 (18, 'Fresh density'),
 (19, 'Dry  density'),
 (20, 'Migration test age'),
 (21, 'Migration resistance')]

One-hot encoding of the `Cement type` column

In [4]:
# One-hot encode the 'Cement type' column (selected by name rather than by the
# positional index 2, so the cell keeps working if the CSV column order changes).
# Every other column is passed through unchanged and placed after the encoded
# columns. sparse_threshold=0 forces a dense array regardless of how sparse the
# encoded block is, so the positional slicing below always yields NumPy arrays.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Cement type'])],
    remainder='passthrough',
    sparse_threshold=0,
)
encoded = ct.fit_transform(df)

# The target is the last passthrough column of the frame.
y = encoded[:, -1]
# Drop the first one-hot column (reference category) and the target column.
X = encoded[:, 1:-1]
print(X.shape)
print(y.shape)

(1069, 30)
(1069,)


Logistic regression with solver='newton-cg'

In [5]:
# Hyper-parameter grid; train_lgreg reads this as a global.
param_grid = {
    'penalty': [None, 'l2'],
    'C': [.1, 1, 10, 100, 200],
}

# Hold-out split: the first fold of a shuffled, stratified k-fold partition.
splits = list(StratifiedKFold(shuffle=True, random_state=0).split(X, y))
train_index, test_index = splits[0]

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Standardise with statistics estimated on the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

def train_lgreg(X_train,y_train):
    """Grid-search a multinomial logistic regression on the given training data.

    Runs a 5-fold ``GridSearchCV`` over the module-level ``param_grid`` with the
    ``newton-cg`` solver, prints the best parameter combination followed by its
    best cross-validation score, and returns the fitted search object.
    """
    base_estimator = LogisticRegression(
        solver='newton-cg',
        multi_class='multinomial',
        max_iter=int(1e4),
        random_state=0,
    )
    search = GridSearchCV(base_estimator, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    for summary in (search.best_params_, search.best_score_):
        print(summary)
    return search

def cv_scores(X,y,model):
    """Print mean train and test accuracy of ``model`` over stratified k-fold CV.

    For every fold of a shuffled ``StratifiedKFold`` (``random_state=1``) a new
    ``StandardScaler`` is fitted on the training rows and applied to both the
    training and the test rows. An unfitted copy of ``model`` is then trained
    and scored on that fold.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Unscaled feature matrix.
    y : array indexable by integer index arrays
        Class labels aligned with the rows of ``X``.
    model : scikit-learn estimator
        Estimator whose hyper-parameters are evaluated. It is cloned for each
        fold, so the object passed in is never refitted or otherwise modified.

    Returns
    -------
    None
        The mean accuracies are printed, not returned.
    """
    train_accuracies = []
    test_accuracies = []
    folds = StratifiedKFold(shuffle=True, random_state=1)
    for fit_idx, eval_idx in folds.split(X, y):
        y_fit, y_eval = y[fit_idx], y[eval_idx]
        # Scale inside the loop so test rows never influence the scaler.
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X[fit_idx])
        X_eval = scaler.transform(X[eval_idx])
        # Fit a fresh clone: calling fit on `model` itself would silently
        # replace the caller's fitted estimator with one trained on this fold.
        fold_model = clone(model)
        fold_model.fit(X_fit, y_fit)
        test_accuracies.append(accuracy_score(y_eval, fold_model.predict(X_eval)))
        train_accuracies.append(accuracy_score(y_fit, fold_model.predict(X_fit)))
    print('Train acc:', np.mean(train_accuracies))
    print('Test acc:', np.mean(test_accuracies))

In [6]:
# Tune the logistic regression on the scaled training split.
grid = train_lgreg(X_train, y_train)

{'C': 100, 'penalty': 'l2'}
0.6701754385964913


In [7]:
# Rebuild the classifier with the hyper-parameters selected by the grid search,
# fit it on the scaled training split and score it on the held-out split.
best_params = grid.best_params_
best_model = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=int(1e4),
    random_state=0,
    penalty=best_params['penalty'],
    C=best_params['C'],
)
best_model.fit(X_train, y_train)
accuracy_score(y_test, best_model.predict(X_test))

0.7242990654205608

In [8]:
# Cross-validated accuracy on the full, unscaled feature matrix
# (cv_scores standardises the data within each fold).
cv_scores(X, y, model=best_model)

Train acc: 0.769407006613106
Test acc: 0.7258654732131105


Removing cement type

In [9]:
# Alternative feature set: drop the 'Cement type' column instead of encoding it.
# The last column of the frame is still used as the target.
features = df.drop(columns=['Cement type'])
X = features.iloc[:, :-1].values
y = df.iloc[:, -1].values
print(X.shape)
print(y.shape)

(1069, 20)
(1069,)


In [10]:
# Reuse the row indices of the earlier hold-out split so this feature set is
# evaluated on the same train/test rows.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Standardise with statistics estimated on the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

grid = train_lgreg(X_train, y_train)

{'C': 10, 'penalty': 'l2'}
0.6549707602339181


In [11]:
# Rebuild the classifier with the hyper-parameters selected by the grid search,
# fit it on the scaled training split and score it on the held-out split.
best_params = grid.best_params_
best_model = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=int(1e4),
    random_state=0,
    penalty=best_params['penalty'],
    C=best_params['C'],
)
best_model.fit(X_train, y_train)
accuracy_score(y_test, best_model.predict(X_test))

0.7336448598130841

In [13]:
# Cross-validated accuracy on the full, unscaled feature matrix
# (cv_scores standardises the data within each fold).
cv_scores(X, y, model=best_model)

Train acc: 0.7247409411378914
Test acc: 0.6950419025053749
