In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import utils.clean_utils as cu


### Explore individual datasets


In [None]:
def _read_semicolon_csv(path, **read_kwargs):
    """Read one semicolon-separated CSV file into a DataFrame."""
    return pd.read_csv(path, sep=";", **read_kwargs)


# Raw tables, one frame per file.
account = _read_semicolon_csv("../data/account.csv")
card_dev = _read_semicolon_csv("../data/card_dev.csv")
client = _read_semicolon_csv("../data/client.csv")
disp = _read_semicolon_csv("../data/disp.csv")
district = _read_semicolon_csv("../data/district.csv")
loan_dev = _read_semicolon_csv("../data/loan_dev.csv")
# The `bank` column is read as text instead of letting pandas infer its dtype.
trans_dev = _read_semicolon_csv("../data/trans_dev.csv", dtype={"bank": str})


#### District


In [None]:
# Summarise the raw district table (column dtypes and non-null counts).
district.info()


In [None]:
# Work on a copy so the `district` frame loaded above is left untouched.
new_district = district.copy()

# The column names are the dataset's own headers (spelling and trailing space
# included). Entries that cannot be parsed as numbers are coerced to NaN.
crime_cols = ["no. of commited crimes '95 ", "no. of commited crimes '96 "]
for crime_col in crime_cols:
    new_district[crime_col] = pd.to_numeric(new_district[crime_col], errors="coerce")

plt.scatter(
    new_district[crime_cols[0]],
    new_district[crime_cols[1]],
    color="#011C50",
)

# Reader-facing labels use the correct spelling ("Committed").
plt.xlabel("Committed crimes '95")
plt.ylabel("Committed crimes '96")
plt.title("Committed crimes '95/'96")


In [None]:
# Scatter plus fitted regression line for the two crime-count columns.
# The column names are the dataset's own headers and must stay as they are.
sns.lmplot(
    data=new_district,
    x="no. of commited crimes '95 ",
    y="no. of commited crimes '96 ",
    scatter_kws={"color": "#011C50"},
    line_kws={"color": "#011C50"},
)

# Reader-facing labels use the correct spelling ("Committed").
plt.xlabel("Committed crimes '95")
plt.ylabel("Committed crimes '96")
plt.title("Committed crimes '95/'96")


In [None]:
# Dataset headers for the two unemployment-rate columns (kept verbatim).
rate_95 = "unemploymant rate '95 "
rate_96 = "unemploymant rate '96 "

# Coerce both columns to numbers; unparseable entries become NaN.
for rate_col in (rate_95, rate_96):
    new_district[rate_col] = pd.to_numeric(new_district[rate_col], errors="coerce")

plt.scatter(new_district[rate_95], new_district[rate_96], color="#011C50")

plt.xlabel("Unemployment rate '95 (%)")
plt.ylabel("Unemployment rate '96 (%)")
plt.title("Unemployment rate '95/'96")


In [None]:
# Scatter plus fitted regression line for the two unemployment-rate columns.
point_style = dict(color="#011C50")
fit_style = dict(color="#011C50")

sns.lmplot(
    data=new_district,
    x="unemploymant rate '95 ",
    y="unemploymant rate '96 ",
    scatter_kws=point_style,
    line_kws=fit_style,
)

plt.xlabel("Unemployment rate '95 (%)")
plt.ylabel("Unemployment rate '96 (%)")
plt.title("Unemployment rate '95/'96")


#### Transactions


In [None]:
# Summarise the raw transactions table (column dtypes and non-null counts).
trans_dev.info()


In [None]:
# Correlate numeric (and boolean) columns only. The frame holds text columns
# (e.g. `bank`, read as str), which make DataFrame.corr() raise on pandas >= 2.0.
corr = trans_dev.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


#### Loan


In [None]:
# Summarise the raw loans table (column dtypes and non-null counts).
loan_dev.info()


In [None]:
# Correlate numeric (and boolean) columns only, so that any non-numeric column
# cannot make DataFrame.corr() raise on pandas >= 2.0.
corr = loan_dev.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(10, 5))

# One boxplot per loan attribute, each split by loan status.
for ax, loan_col in zip(axs, ("amount", "payments", "duration")):
    loan_dev.boxplot(column=loan_col, by="status", figsize=(5, 5), ax=ax)


#### Card


In [None]:
# Summarise the raw cards table (column dtypes and non-null counts).
card_dev.info()


### Explore training dataset


In [None]:
# Pass every raw table through its helper from utils.clean_utils.
# NOTE(review): each name is rebound to the helper's result, so re-running this
# cell feeds already-processed frames back in — confirm the helpers tolerate that.
account = cu.clean_accounts(account, clean=False)
disp = cu.clean_disp(disp, clean=False)
client = cu.clean_clients(client, clean=False)
district = cu.clean_districts(district, clean=False)
# clean_cards also receives `disp`, which at this point is the processed frame.
card_dev = cu.clean_cards(card_dev, disp, clean=False)
# NOTE(review): unlike the other helpers, clean_loans is called without
# clean=False — confirm this is intended.
loan_dev = cu.clean_loans(loan_dev)
trans_dev = cu.clean_transactions(trans_dev, clean=False, op=True, k_symbol=True)

# The exploratory dataset is read from disk, not built from the frames above.
df = pd.read_csv("../data/clean/df-exploratory.csv")


In [None]:
# Summarise the exploratory dataset (column dtypes and non-null counts).
df.info()


In [None]:
# Number of loans per status value, drawn as a bar chart.
status_counts = df["status"].value_counts()
ax = status_counts.plot.bar(color="#011C50")
ax.set_xlabel("Status")
ax.set_ylabel("Count")
ax.set_title("Status count for bank loans")


In [None]:
# Correlation analysis of the exploratory dataset.

# Correlate numeric (and boolean) columns only: text columns cannot be
# correlated and make DataFrame.corr() raise on pandas >= 2.0.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


In [None]:
# Correlation analysis of the transactions table after the cleaning step.

# Correlate numeric (and boolean) columns only: text columns cannot be
# correlated and make DataFrame.corr() raise on pandas >= 2.0.
corr = trans_dev.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


In [None]:
# Correlation analysis of the loans table after the cleaning step.

# Correlate numeric (and boolean) columns only: text columns cannot be
# correlated and make DataFrame.corr() raise on pandas >= 2.0.
corr = loan_dev.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


In [None]:
# Correlation analysis of the district table after the cleaning step.

# Correlate numeric (and boolean) columns only: text columns cannot be
# correlated and make DataFrame.corr() raise on pandas >= 2.0.
corr = district.select_dtypes(include=["number", "bool"]).corr()

# Boolean mask hiding the upper triangle, diagonal included. It is built from
# ones rather than from the correlation values, so a cell whose correlation is
# exactly 0 is still hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(25, 15))

colormap = sns.diverging_palette(250, 20, as_cmap=True)
sns.heatmap(
    corr, annot=True, fmt=".2f", cbar_kws={"shrink": 0.5}, mask=mask, cmap=colormap
).set(title="Correlation Matrix")

plt.show()


In [None]:
print("STATISTICS FOR SOME OF THE RELEVANT NUMERICAL FEATURES")

# Columns such as the IDs carry nothing useful for summary statistics, so only
# the features listed here are described.
leading_cols = [
    "amount", "duration", "payments",
    "average_salary", "unemployment_growth", "avg_commited_crimes", "ratio_entrepreneurs",
    "avg_amount_credit", "avg_amount_withdrawal", "avg_amount_total",
    "min_amount", "max_amount", "credit_ratio", "num_trans",
    "avg_balance", "min_balance", "max_balance", "std_balance",
    "num_cash_credit", "num_coll", "num_interest",
    "num_cash_withdrawal", "num_rem", "num_card_withdrawal",
]
# Each of these suffixes contributes a "mean_<suffix>" / "num_<suffix>" pair, in that order.
paired_suffixes = ("no_symbol", "household", "statement", "insurance", "sanction", "pension")
paired_cols = [
    f"{prefix}_{suffix}" for suffix in paired_suffixes for prefix in ("mean", "num")
]
relevant_cols = leading_cols + paired_cols + ["age_at_loan", "days_between"]

# Row subsets per loan status, reused by the cells that follow.
status_positive = df.loc[df["status"] == 1]
status_negative = df.loc[df["status"] == -1]

df[relevant_cols].describe()


In [None]:
# Summary statistics of the selected features for rows with status == 1.
status_positive[relevant_cols].describe()


In [None]:
# Summary statistics of the selected features for rows with status == -1.
status_negative[relevant_cols].describe()


In [None]:
print("STATISTICS FOR THE NON-NUMERICAL FEATURES")

# Restrict describe() to object-dtype columns.
df.describe(include=["object"])


In [None]:
# Status counts per `has_disponent` value: one bar group per value, one bar
# colour per status.
disponent_counts = df.groupby("has_disponent")["status"].value_counts().unstack(1)
ax = disponent_counts.plot.bar(color=["#011C50", "#F5BE49"])

ax.set_xlabel("Disponents")
ax.set_ylabel("Count by number of disponents")
ax.legend(["Status = -1", "Status = 1"])
ax.set_title("Status count by number of account disponents")


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="avg_balance",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Average Balance after transactions")
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of average balance per status")
plt.show()


In [None]:
# Parallel-categories diagram over the categorical features, coloured by status.
category_dims = ["gender", "has_card", "negative_balance", "has_disponent", "frequency"]
fig = px.parallel_categories(df, dimensions=category_dims, color="status")

# Set the colour bar's horizontal position to 1.1 — presumably to keep it from
# overlapping the diagram; TODO confirm.
fig.layout.coloraxis.colorbar.x = 1.1
fig.show()


In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharey=True)

# One panel per loan status: (status value, colour, x-axis label, legend entry).
panels = [
    (1, "#F5BE49", "Amount of successful loans", "Status 1"),
    (-1, "#011C50", "Amount of unsuccessful loans", "Status -1"),
]
for ax, (status_value, colour, x_label, legend_label) in zip(axs, panels):
    # Distribution of the loan amount for this status, as percent of its own group.
    sns.histplot(
        data=df[df["status"] == status_value],
        x="amount",
        color=colour,
        stat="percent",
        kde=True,
        ax=ax,
    )
    ax.set_xlabel(x_label)
    ax.legend([legend_label])

fig.suptitle("Loan Amount")
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the notebook's inline backend.
plt.show()


In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharey=True)

# One panel per loan status: (status value, colour, legend entry).
panels = [
    (1, "#F5BE49", "Status 1"),
    (-1, "#011C50", "Status -1"),
]
for ax, (status_value, colour, legend_label) in zip(axs, panels):
    # Distribution of the average credit amount for this status, as percent of
    # its own group.
    sns.histplot(
        data=df[df["status"] == status_value],
        x="avg_amount_credit",
        color=colour,
        stat="percent",
        kde=True,
        ax=ax,
    )
    ax.set_xlabel("Average amount credit transactions")
    ax.legend([legend_label])

fig.suptitle("Average amount of credit transactions")
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the notebook's inline backend.
plt.show()


In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharey=True)

# Flip the sign on a plotting-only copy. Writing the flipped values back into
# `df` would toggle the sign on every re-run of this cell and silently change
# what every later cell sees.
# NOTE(review): presumably withdrawals are stored as negative amounts — confirm.
withdrawals = df.assign(avg_amount_withdrawal=df["avg_amount_withdrawal"] * -1)

# One panel per loan status: (status value, colour, legend entry).
panels = [
    (1, "#F5BE49", "Status 1"),
    (-1, "#011C50", "Status -1"),
]
for ax, (status_value, colour, legend_label) in zip(axs, panels):
    # Distribution of the sign-flipped average withdrawal amount for this
    # status, as percent of its own group.
    sns.histplot(
        data=withdrawals[withdrawals["status"] == status_value],
        x="avg_amount_withdrawal",
        color=colour,
        stat="percent",
        kde=True,
        ax=ax,
    )
    ax.set_xlabel("Average amount withdrawal transactions")
    ax.legend([legend_label])

fig.suptitle("Average amount of withdrawal transactions")
# plt.show() rather than fig.show(): the latter only warns on non-GUI backends
# such as the notebook's inline backend.
plt.show()


In [None]:
# Status counts per `has_card` value: one bar group per value, one bar colour
# per status.
card_counts = df.groupby("has_card")["status"].value_counts().unstack(1)
card_counts.plot.bar(color=["#011C50", "#F5BE49"])

plt.ylabel("Count")
plt.xlabel("Has card")
plt.legend(["Status = -1", "Status = 1"])
# The original title stopped at "with(out)"; name what the accounts have or lack.
plt.title("Status count per accounts with(out) card")
plt.xticks(rotation=0)
plt.show()


In [None]:
# Status counts per `negative_balance` value: one bar group per value, one bar
# colour per status.
negative_balance_counts = (
    df.groupby("negative_balance")["status"].value_counts().unstack(1)
)
ax = negative_balance_counts.plot.bar(color=["#011C50", "#F5BE49"])

ax.set_xlabel("Has had at least one negative balance")
ax.set_ylabel("Count")
ax.legend(["Status = -1", "Status = 1"])
plt.xticks(rotation=0)
ax.set_title("Negative balance by number of accounts")


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="days_between",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Days between loan and account creation")
plt.legend(["Status 1", "Status -1"])
# Title typo fixed ("loand" -> "loan").
plt.title("Distribution of days between loan and account creation per status")
plt.show()


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="age_at_loan",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Age at loan")
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of age at loan per status")
plt.show()


In [None]:
# Scatter of average salary against average committed crimes. The column name
# `avg_commited_crimes` is the dataset's own spelling and must stay as it is.
plt.scatter(df["average_salary"], df["avg_commited_crimes"], color="#011C50")

# Reader-facing text uses the correct spelling ("committed").
plt.xlabel("average salary")
plt.ylabel("average committed crimes")
plt.title("Relation between a district's salary and committed crimes")


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
# NOTE(review): `region` looks categorical, so the KDE curve requested here may
# not be meaningful — confirm whether kde=True is wanted.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="region",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Region")
plt.xticks(rotation=45)
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of region per status")
plt.show()


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="payments",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Payments")
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of payments per status")
plt.show()


In [None]:
# Count of each (gender, status) pair. Within a gender, value_counts() orders
# the statuses by frequency, so the slice order depends on the data.
status_counts = df.groupby("gender")["status"].value_counts()
values = status_counts.to_numpy()
# Build the labels from the index of the counts instead of hard-coding them, so
# every slice is labelled with the pair it actually represents.
labels = [f"{gender} / {status}" for gender, status in status_counts.index]

plt.pie(values, labels=labels)
plt.title("Status count by gender")
plt.show()


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="criminality_growth",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Criminality growth")
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of criminality growth per status")
plt.show()


In [None]:
# Overlay one histogram per status on the same axes. Each call only sees its
# own subset, so the percentages are relative to that status group.
for status_value, colour in ((1, "#F5BE49"), (-1, "#011C50")):
    sns.histplot(
        data=df[df["status"] == status_value],
        x="unemployment_growth",
        color=colour,
        stat="percent",
        kde=True,
    )

plt.xlabel("Unemployment growth")
plt.legend(["Status 1", "Status -1"])
plt.title("Distribution of unemployment growth per status")
plt.show()


In [None]:
fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))

# One boxplot per feature, split by status; the grid is filled row by row.
boxplot_cols = [
    "amount", "payments", "duration",
    "average_salary", "unemployment_rate", "avg_commited_crimes",
    "ratio_entrepreneurs", "criminality_growth", "avg_amount_credit",
    "avg_amount_withdrawal", "avg_amount_total", "credit_ratio",
]
for ax, feature in zip(axs.ravel(), boxplot_cols):
    df.boxplot(column=feature, by="status", figsize=(5, 5), ax=ax)


In [None]:
# Features to plot, in display order. There are 13 of them, one more than a
# 4x3 grid holds; the original cell drew `num_interest` and `num_pension` on
# the same axes. The grid is therefore sized from the list.
boxplot_cols = [
    "min_amount", "max_amount", "num_trans",
    "avg_balance", "min_balance", "max_balance",
    "std_balance", "num_cash_credit", "num_coll",
    "num_interest", "num_pension", "num_cash_withdrawal",
    "days_between",
]
n_cols = 3
n_rows = -(-len(boxplot_cols) // n_cols)  # ceiling division

# Keep the original 5 units of figure height per row.
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows))
grid_axes = axs.ravel()

# One boxplot per feature, split by status; the grid is filled row by row.
for ax, feature in zip(grid_axes, boxplot_cols):
    df.boxplot(column=feature, by="status", figsize=(5, 5), ax=ax)

# Hide the grid cells left over after the last feature.
for spare_ax in grid_axes[len(boxplot_cols):]:
    spare_ax.set_visible(False)


In [None]:
fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))

# One boxplot per feature, split by status; the grid is filled row by row.
boxplot_cols = [
    "mean_no_symbol", "mean_household", "num_household",
    "mean_statement", "num_statement", "mean_insurance",
    "num_insurance", "mean_sanction", "num_sanction",
    "mean_pension", "age_at_loan", "days_between",
]
for ax, feature in zip(axs.ravel(), boxplot_cols):
    df.boxplot(column=feature, by="status", figsize=(5, 5), ax=ax)


In [None]:
# List the column labels of the exploratory dataset.
df.columns
