In [None]:
from google.colab import files
import pandas as pd
import io

# Ask the user for a file via the Colab upload widget. The returned mapping is
# indexed by file name and holds the uploaded content as bytes (it is decoded
# below).
upload_files = files.upload()

# Without this guard an empty result (nothing uploaded) surfaces as a bare
# StopIteration from next(), which gives no hint about what went wrong.
if not upload_files:
    raise ValueError("No file was uploaded; re-run this cell and select the CSV file.")

# Only the first uploaded file is used.
file_name = next(iter(upload_files))

# The file is parsed as semicolon-delimited UTF-8 text; header=1 takes the
# column names from the second line of the file and skips the line above it.
df = pd.read_csv(
    io.StringIO(upload_files[file_name].decode('utf-8')),
    header=1,
    delimiter=";",
)

df

In [None]:
# Remove the ID column from the frame.
# errors="ignore" keeps this cell safe to re-run: once ID has been dropped, a
# plain drop would raise KeyError on the second execution.
df = df.drop(columns="ID", errors="ignore")
df

In [None]:
# Show which distinct codes occur in the EDUCATION column.
print(df.loc[:, "EDUCATION"].unique())

In [None]:
# Fold the EDUCATION codes 0, 5 and 6 into 4 ("others").
# The dataset documentation defines only four codes for this column:
# 1 = graduate school; 2 = university; 3 = high school; 4 = others.
df["EDUCATION"] = df["EDUCATION"].replace({0: 4, 5: 4, 6: 4})
df["EDUCATION"].unique()

In [None]:
# Show which distinct codes occur in the MARRIAGE column.
print(df.loc[:, "MARRIAGE"].unique())

In [None]:
# Fold the MARRIAGE code 0 into 3 ("others").
# The dataset documentation defines only three codes for this column
# (1 = married; 2 = single; 3 = others), and 0 is none of them.
df["MARRIAGE"] = df["MARRIAGE"].replace(to_replace=0, value=3)
df["MARRIAGE"].unique()

In [None]:
# Show which distinct codes occur in the SEX column.
print(df.loc[:, "SEX"].unique())

In [None]:
# Encoding SEX column: shift the raw codes down by one so they become 0/1.
# The shift is only applied while the raw code 2 is still present, so that
# re-running this cell does not subtract a second time (which would silently
# turn the values into -1/0).
# NOTE(review): this assumes the raw column contains at least one 2 — confirm
# for any subset of the data that might hold a single class only.
if (df['SEX'] == 2).any():
    df['SEX'] = df['SEX'] - 1
print(df["SEX"].unique())
# 0 = male, 1 = female

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining feature columns.
y = df["default payment next month"]
x = df.drop("default payment next month", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 10, with cost-complexity pruning
# (ccp_alpha) and a sqrt-sized feature subset considered at each split.
# The fixed random_state keeps the fitted tree reproducible.
model = DecisionTreeClassifier(
    max_depth=10,
    max_features='sqrt',
    ccp_alpha=0.001,
    random_state=42,
)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the held-out split and compare against the true labels.
y_pred = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Overall accuracy first, then the per-class report, then the confusion matrix.
print("Accuracy:", acc)
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the fitted tree, colouring each node (filled=True).
plt.figure(figsize=(15, 10))
plot_tree(
    model,
    filled=True,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases validate feature_names as list-or-None and reject an Index.
    feature_names=list(x.columns),
    class_names=['No default', 'Default'],
)
plt.show()