In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

from datetime import datetime
from sklearn.linear_model import LogisticRegression

In [2]:
# sns.set()
start_time = datetime.now()

data = pd.read_csv('Data_for_UCI_named.csv')

map1 = {'unstable': 0, 'stable': 1}
data['stabf'] = data['stabf'].replace(map1)

data = data.sample(frac=1)

In [3]:
X = data.iloc[:, :12]
y = data.iloc[:, 13]

X_training = X.iloc[:7000, :]
y_training = y.iloc[:7000]

X_testing = X.iloc[3000:, :]
y_testing = y.iloc[3000:]

ratio_training = y_training.value_counts(normalize=True)
ratio_testing = y_testing.value_counts(normalize=True)
ratio_training, ratio_testing

(0    0.638429
 1    0.361571
 Name: stabf, dtype: float64,
 0    0.634429
 1    0.365571
 Name: stabf, dtype: float64)

In [4]:
X_training = X_training.values
y_training = y_training.values

X_testing = X_testing.values
y_testing = y_testing.values

In [5]:
scaler = StandardScaler()
X_training = scaler.fit_transform(X_training)
X_testing = scaler.transform(X_testing)

In [6]:
from sklearn.metrics import accuracy_score
cross_val_round = 1
print(f'Model evaluation\n')
classifier = LogisticRegression(random_state=0)

for train_index, val_index in KFold(10, shuffle=True, random_state=10).split(X_training):
    x_train, x_val = X_training[train_index], X_training[val_index]
    y_train ,y_val = y_training[train_index], y_training[val_index]
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_val)
    classifier_accuracy = accuracy_score(y_val, y_pred)
    print(f'Round {cross_val_round} - | Accuracy: {classifier_accuracy * 100:.2f} %')
    cross_val_round += 1

Model evaluation

Round 1 - | Accuracy: 82.14 %
Round 2 - | Accuracy: 79.57 %
Round 3 - | Accuracy: 81.57 %
Round 4 - | Accuracy: 81.57 %
Round 5 - | Accuracy: 81.00 %
Round 6 - | Accuracy: 81.29 %
Round 7 - | Accuracy: 82.57 %
Round 8 - | Accuracy: 81.86 %
Round 9 - | Accuracy: 84.29 %
Round 10 - | Accuracy: 82.43 %


In [7]:
y_pred = classifier.predict(X_testing)
y_pred[y_pred <= 0.5] = 0
y_pred[y_pred > 0.5] = 1

In [8]:
cm = pd.DataFrame(data=confusion_matrix(y_testing, y_pred, labels=[0, 1]),
                  index=["Actual Unstable", "Actual Stable"],
                  columns=["Predicted Unstable", "Predicted Stable"])
cm

Unnamed: 0,Predicted Unstable,Predicted Stable
Actual Unstable,3916,525
Actual Stable,784,1775


In [9]:
print(f'Accuracy per the confusion matrix: {((cm.iloc[0, 0] + cm.iloc[1, 1]) / len(y_testing) * 100):.2f}%')

Accuracy per the confusion matrix: 81.30%


In [10]:
end_time = datetime.now()

print('\nStart time', start_time)
print('End time', end_time)
print('Time elapsed', end_time - start_time)


Start time 2021-11-02 18:39:24.876929
End time 2021-11-02 18:39:44.681380
Time elapsed 0:00:19.804451
