In [14]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv('S05-hw-dataset.csv')

# Quick inspection: first rows, column summary, and the relative frequency
# of each value of the target column.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df['default'].value_counts(normalize=True))

# Features are every column except 'client_id' and the target 'default'.
# NOTE(review): 'client_id' is presumably a row identifier with no predictive
# meaning -- confirm against the dataset description.
X = df.drop(['client_id', 'default'], axis=1)
y = df['default']

# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so that it is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline model: always predicts the most frequent class of the training set.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

y_pred_dummy = dummy.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC-AUC
# (assumes the positive class is the second entry of dummy.classes_).
y_proba_dummy = dummy.predict_proba(X_test)[:, 1]

acc_dummy = accuracy_score(y_test, y_pred_dummy)
roc_auc_dummy = roc_auc_score(y_test, y_proba_dummy)

print(f"Dummy accuracy: {acc_dummy:.3f}, ROC-AUC: {roc_auc_dummy:.3f}")

# Scale the features, then fit a logistic regression. Keeping both steps in
# one Pipeline means the scaler is re-fitted on whatever data the pipeline
# is fitted on (including each cross-validation training fold below).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000)),
])

# Select the regularisation parameter C by cross-validated ROC-AUC computed
# on the TRAINING data only. Choosing C by its score on X_test would tune the
# model on the very data used for the final report and make the reported test
# metrics optimistically biased.
# NOTE(review): cv=5 assumes every class has at least 5 rows in y_train.
best_score = float('-inf')
best_C = None

for C in [0.01, 0.1, 1.0, 10.0]:
    pipe.set_params(logreg__C=C)
    cv_scores = cross_val_score(
        pipe, X_train, y_train, cv=5, scoring='roc_auc'
    )
    mean_cv_auc = cv_scores.mean()
    # A NaN mean (e.g. a failed fold) never compares greater, so such
    # candidates are skipped rather than selected.
    if mean_cv_auc > best_score:
        best_score = mean_cv_auc
        best_C = C

if best_C is None:
    raise RuntimeError(
        "No candidate C produced a valid cross-validated ROC-AUC score"
    )

# Refit on the full training set with the chosen C, then evaluate exactly
# once on the held-out test set.
pipe.set_params(logreg__C=best_C)
pipe.fit(X_train, y_train)
y_pred_logreg = pipe.predict(X_test)
y_proba_logreg = pipe.predict_proba(X_test)[:, 1]

acc_logreg = accuracy_score(y_test, y_pred_logreg)
roc_auc_logreg = roc_auc_score(y_test, y_proba_logreg)

print(f"LogReg accuracy: {acc_logreg:.3f}, ROC-AUC: {roc_auc_logreg:.3f}, Best C: {best_C}")

# Plot the ROC curve of the logistic regression on the test set, with the
# diagonal drawn as the reference line labelled 'Random'.
fpr, tpr, _ = roc_curve(y_test, y_proba_logreg)
plt.plot(fpr, tpr, label=f'LogReg (AUC={roc_auc_logreg:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

# savefig does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh checkout fails with
# FileNotFoundError.
Path('figures').mkdir(parents=True, exist_ok=True)
# Save before show(): the figure may be released once the window is closed.
plt.savefig('figures/roc_curve.png', dpi=100)
plt.show()

# Side-by-side summary of both models' test-set metrics, one row per model.
model_names = ['Dummy', 'LogisticRegression']
accuracy_values = [acc_dummy, acc_logreg]
roc_auc_values = [roc_auc_dummy, roc_auc_logreg]

results = pd.DataFrame(
    {
        'Model': model_names,
        'Accuracy': accuracy_values,
        'ROC-AUC': roc_auc_values,
    }
)
print(results)

ModuleNotFoundError: No module named 'sklearn'