In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [43]:
# Importing the data
df = pd.read_excel("default_of_credit_card_clients.xls", header=1)

In [44]:
# Data Exploration and Preprocessing
# Checking for missing values
print(df.isnull().sum())

# Checking for outliers
print(df.describe())

# Scaling the data
scaler = StandardScaler()
df.iloc[:,1:24] = scaler.fit_transform(df.iloc[:,1:24])

# Encoding categorical variables
df = pd.get_dummies(df, columns=["SEX", "EDUCATION", "MARRIAGE",'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:29], df.iloc[:,24], test_size=0.2, random_state=42)

print(X_train)
print(X_test)
print(y_train)
print(y_test)


# Classifiers
classifiers = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGB" : GradientBoostingClassifier(),
    "Neural Network": MLPClassifier()
}

# Parameter settings
parameters = {
    "K-Nearest Neighbors": {"n_neighbors": [3, 5, 7]},
    "Decision Tree": {"max_depth": [3, 5, 7]},
    "Random Forest": {"n_estimators": [50, 100, 150],'max_features': ['auto', 'sqrt', 'log2'],'max_depth' : [4,5,6,7,8],'criterion' :['gini', 'entropy']},
    "XGB": {'max_depth': [2,3,4,5,6,7,8,9,10],'n_estimators': [60,100,140,180,220],'learning_rate': [0.1, 0.01, 0.05]},
    "Neural Network" : {'solver' : ['sgd'], 'alpha':[1e-5],'hidden_layer_sizes':[(10, 10)], 'learning_rate':['adaptive'], 'random_state':[1]}
}

# Experiment with different classifiers and parameter settings
results = {}
for clf_name, clf in classifiers.items():
    for param_name, param_values in parameters[clf_name].items():
        for param_value in param_values:
            clf.set_params(**{param_name: param_value})
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            result = {
                "Classifier": clf_name,
                "Parameter": f"{param_name} = {param_value}",
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1-score": f1
            }
            results[(clf_name, param_name, param_value)] = result

# Evaluate and analyze the performance
# Display the results in a table
results_df = pd.DataFrame(results.values())
print(results_df)

# Compare holdout to cross-validation
from sklearn.model_selection import cross_validate
scores = cross_validate(RandomForestClassifier(), df.iloc[:,1:29], df.iloc[:,24], cv=10, scoring=["accuracy", "precision", "recall", "f1"])
print(scores)

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64
                 ID       LIMIT_BAL           SEX     EDUCATION      MARRIAGE  \
count  30000.000000    30000.000000  30000.000000  30000.000000  30000.000000   
mean   15000.500000   167

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  warn(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


             Classifier                      Parameter  Accuracy  Precision  \
0   K-Nearest Neighbors                n_neighbors = 3  0.999000        1.0   
1   K-Nearest Neighbors                n_neighbors = 5  0.998667        0.0   
2   K-Nearest Neighbors                n_neighbors = 7  0.998667        0.0   
3         Decision Tree                  max_depth = 3  1.000000        1.0   
4         Decision Tree                  max_depth = 5  1.000000        1.0   
5         Decision Tree                  max_depth = 7  1.000000        1.0   
6         Random Forest              n_estimators = 50  1.000000        1.0   
7         Random Forest             n_estimators = 100  1.000000        1.0   
8         Random Forest             n_estimators = 150  1.000000        1.0   
9         Random Forest            max_features = auto  1.000000        1.0   
10        Random Forest            max_features = sqrt  1.000000        1.0   
11        Random Forest            max_features = lo

In [46]:
df

Unnamed: 0,ID,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,...,PAY_6_-1.486040757591074,PAY_6_-0.6164516900089603,PAY_6_0.2531373775731533,PAY_6_1.9923155127373806,PAY_6_2.8619045803194942,PAY_6_3.731493647901608,PAY_6_4.601082715483721,PAY_6_5.470671783065835,PAY_6_6.340260850647948,PAY_6_7.209849918230062
0,1,-1.136720,-1.246020,-0.642501,-0.647399,-0.667993,-0.672497,-0.663059,-0.652724,-0.341942,...,1,0,0,0,0,0,0,0,0,0
1,2,-0.365981,-1.029047,-0.659219,-0.666747,-0.639254,-0.621636,-0.606229,-0.597966,-0.341942,...,0,0,0,1,0,0,0,0,0,0
2,3,-0.597202,-0.161156,-0.298560,-0.493899,-0.482408,-0.449730,-0.417188,-0.391630,-0.250292,...,0,0,1,0,0,0,0,0,0,0
3,4,-0.905498,0.164303,-0.057491,-0.013293,0.032846,-0.232373,-0.186729,-0.156579,-0.221191,...,0,0,1,0,0,0,0,0,0,0
4,5,-0.905498,2.334029,-0.578618,-0.611318,-0.161189,-0.346997,-0.348137,-0.331482,-0.221191,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,0.404759,0.381275,1.870379,2.018136,2.326690,0.695474,-0.149259,-0.384392,0.171250,...,0,0,1,0,0,0,0,0,0,0
29996,29997,-0.134759,0.815221,-0.672786,-0.665299,-0.627430,-0.532924,-0.577691,-0.652724,-0.231032,...,0,0,1,0,0,0,0,0,0,0
29997,29998,-1.059646,0.164303,-0.647227,-0.643830,-0.638158,-0.347961,-0.324517,-0.327687,-0.341942,...,0,0,1,0,0,0,0,0,0,0
29998,29999,-0.674276,0.598248,-0.717982,0.410269,0.422373,0.147844,-0.468063,0.169130,4.844316,...,0,1,0,0,0,0,0,0,0,0
