In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the preprocessed dataset (path is relative to the notebook's working
# directory) and report its shape as a quick sanity check.
df = pd.read_csv('processed.csv')
data_shape = df.shape
print('Data shape:', data_shape)

In [None]:
# Show every column next to its positional index; the encoder cells below
# address the categorical column by position.
[(position, column) for position, column in enumerate(df.columns)]

One-hot encoding of the categorical column

In [None]:
# One-hot encode the categorical column at position 2. ColumnTransformer puts
# the encoded columns first and appends the untouched (passthrough) columns in
# their original order, so the target remains the last column.
# sparse_threshold=0 forces a dense ndarray so the column slicing below always
# yields a 1-D target vector, whatever the density of the encoded block.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
    sparse_threshold=0,
)
X_encoded = ct.fit_transform(df)
# Target: last column of the transformed matrix.
y = X_encoded[:, -1]
# Features: drop the first dummy column (avoids the dummy-variable trap) and
# the target column only. The previous slice `1:-2` also discarded the last
# predictor, which disagreed with the polynomial-feature cell that keeps every
# non-target column.
X = X_encoded[:, 1:-1]
print(X.shape)
print(y.shape)

Logistic regression with grid search (one-hot encoded features)

In [None]:
# Outer loop: stratified K-fold estimate of train/test accuracy.
# Inner loop: GridSearchCV (cv=5) tunes the logistic-regression
# hyper-parameters on each outer training fold.
acc_train=[]
acc_test=[]
# The grid is split per solver because 'newton-cg' supports only the l2
# penalty. A single cross-product grid schedules l1 + newton-cg fits that
# always fail, wasting compute and emitting fit-failure warnings.
param_grid=[
    {'solver': ['saga'], 'penalty': ['l1','l2'], 'C': [.1,1,10,100]},
    {'solver': ['newton-cg'], 'penalty': ['l2'], 'C': [.1,1,10,100]},
]
for train_index, test_index in StratifiedKFold().split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the outer training fold only, then apply it to the
    # held-out fold so no test statistics leak into training.
    # NOTE(review): the inner grid-search folds still share these scaling
    # statistics; wrap scaler + model in a Pipeline if that matters.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    # random_state pins the data shuffling done by the stochastic 'saga'
    # solver so that Restart & Run All reproduces the same numbers.
    clf = LogisticRegression(multi_class='multinomial',max_iter=int(1e4),random_state=0)
    grid=GridSearchCV(clf,param_grid,cv=5,n_jobs=-1)
    grid.fit(X_train, y_train)
    # grid.predict uses the best estimator refit on the whole outer training fold.
    acc_train.append(accuracy_score(y_train,grid.predict(X_train)))
    acc_test.append(accuracy_score(y_test,grid.predict(X_test)))
print('Test acc:',np.mean(acc_test))
print('Train acc:',np.mean(acc_train))
# Derive the x-axis from the number of folds actually evaluated instead of
# hard-coding five, so the plot stays valid if the splitter is reconfigured.
folds=range(1,len(acc_train)+1)
plt.plot(folds,acc_train)
plt.plot(folds,acc_test)
plt.xlabel('fold')
plt.ylabel('accuracy')
plt.legend(['train','test']);

Polynomial (interaction-only) feature transform

In [None]:
# Numeric predictors: every column except the categorical 'Cement type'
# column and the trailing target column.
numeric_part = df.drop('Cement type', axis=1).iloc[:, :-1]
# Expand with interaction-only polynomial terms and no bias column.
interaction_expander = PolynomialFeatures(interaction_only=True, include_bias=False)
expanded = interaction_expander.fit_transform(numeric_part)
# Put the categorical column back at position 0 so the encoder can address it
# by index.
with_category = np.insert(expanded, 0, df['Cement type'], axis=1)
cement_encoder = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
encoded = cement_encoder.fit_transform(with_category)
# Drop the first dummy column.
X = encoded[:, 1:]
# Target is the last column of the original frame.
y = df.iloc[:, -1].values
X.shape, y.shape

Logistic regression with grid search (polynomial interaction features)

In [None]:
# Outer loop: stratified K-fold estimate of train/test accuracy on the
# current feature matrix X.
# Inner loop: GridSearchCV (cv=5) tunes penalty type and strength on each
# outer training fold.
acc_train=[]
acc_test=[]
# Both penalties are valid here because the solver is fixed to 'saga' below.
param_grid={
    'penalty': ['l1','l2'],
    'C': [.1,1,10,100]
}
for train_index, test_index in StratifiedKFold().split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the scaler on the outer training fold only, then apply it to the
    # held-out fold so no test statistics leak into training.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    # 'saga' shuffles the data internally; random_state pins that shuffling so
    # that Restart & Run All reproduces the same accuracies.
    clf = LogisticRegression(multi_class='multinomial',max_iter=int(1e4),solver='saga',random_state=0)
    grid=GridSearchCV(clf,param_grid,cv=5,n_jobs=-1)
    grid.fit(X_train, y_train)
    # grid.predict uses the best estimator refit on the whole outer training fold.
    acc_train.append(accuracy_score(y_train,grid.predict(X_train)))
    y_pred=grid.predict(X_test)
    acc_test.append(accuracy_score(y_test,y_pred))
print('Test acc:',np.mean(acc_test))
print('Train acc:',np.mean(acc_train))
# Derive the x-axis from the number of folds actually evaluated instead of
# hard-coding five, so the plot stays valid if the splitter is reconfigured.
folds=range(1,len(acc_train)+1)
plt.plot(folds,acc_train)
plt.plot(folds,acc_test)
plt.xlabel('fold')
plt.ylabel('accuracy')
plt.legend(['train','test']);