In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Importing the data
df = pd.read_excel("default_of_credit_card_clients.xls", header=1)

In [5]:
# Data Exploration and Preprocessing
# Checking for missing values
print(df.isnull().sum())

# Checking for outliers
print(df.describe())

# Scaling the data
scaler = StandardScaler()
df.iloc[:,1:24] = scaler.fit_transform(df.iloc[:,1:24])

# Encoding categorical variables
df = pd.get_dummies(df, columns=["SEX", "EDUCATION", "MARRIAGE"])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:29], df.iloc[:,24], test_size=0.2, random_state=42)

# Classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Parameter settings
parameters = {
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "K-Nearest Neighbors": {"n_neighbors": [3, 5, 7]},
    "Decision Tree": {"max_depth": [3, 5, 7]},
    "Random Forest": {"n_estimators": [50, 100, 150]}
}

# Experiment with different classifiers and parameter settings
results = {}
for clf_name, clf in classifiers.items():
    for param_name, param_values in parameters[clf_name].items():
        for param_value in param_values:
            clf.set_params(**{param_name: param_value})
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            result = {
                "Classifier": clf_name,
                "Parameter": f"{param_name} = {param_value}",
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1-score": f1
            }
            results[(clf_name, param_name, param_value)] = result

# Evaluate and analyze the performance
# Display the results in a table
results_df = pd.DataFrame(results.values())
print(results_df)

# Compare holdout to cross-validation
from sklearn.model_selection import cross_validate
scores = cross_validate(RandomForestClassifier(), df.iloc[:,1:29], df.iloc[:,24], cv=10, scoring=["accuracy", "precision", "recall", "f1"])
print(scores)

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64
                 ID       LIMIT_BAL           SEX     EDUCATION      MARRIAGE  \
count  30000.000000    30000.000000  30000.000000  30000.000000  30000.000000   
mean   15000.500000   167

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


             Classifier           Parameter  Accuracy  Precision  Recall  \
0   Logistic Regression             C = 0.1  0.999333        0.0    0.00   
1   Logistic Regression               C = 1  0.999500        1.0    0.25   
2   Logistic Regression              C = 10  1.000000        1.0    1.00   
3   K-Nearest Neighbors     n_neighbors = 3  0.999333        0.0    0.00   
4   K-Nearest Neighbors     n_neighbors = 5  0.999333        0.0    0.00   
5   K-Nearest Neighbors     n_neighbors = 7  0.999333        0.0    0.00   
6         Decision Tree       max_depth = 3  1.000000        1.0    1.00   
7         Decision Tree       max_depth = 5  1.000000        1.0    1.00   
8         Decision Tree       max_depth = 7  1.000000        1.0    1.00   
9         Random Forest   n_estimators = 50  0.999667        1.0    0.50   
10        Random Forest  n_estimators = 100  1.000000        1.0    1.00   
11        Random Forest  n_estimators = 150  0.999833        1.0    0.75   

    F1-scor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([3.50953746, 4.29316211, 3.95884204, 4.50946784, 3.58934474,
       3.79520416, 3.93513227, 3.14107656, 3.60491586, 3.11849761]), 'score_time': array([0.14467788, 0.12985802, 0.07378983, 0.18696213, 0.08522081,
       0.13843489, 0.07866549, 0.06832385, 0.09045076, 0.07192135]), 'test_accuracy': array([1.        , 1.        , 0.99966667, 1.        , 1.        ,
       0.99966667, 1.        , 0.99933333, 1.        , 0.99966667]), 'test_precision': array([1., 1., 0., 1., 1., 0., 1., 0., 1., 1.]), 'test_recall': array([1. , 1. , 0. , 1. , 1. , 0. , 1. , 0. , 1. , 0.5]), 'test_f1': array([1.        , 1.        , 0.        , 1.        , 1.        ,
       0.        , 1.        , 0.        , 1.        , 0.66666667])}
