# Kreditwürdigkeit


### [KR01] Einlesen der CSV Files


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the five pipe-delimited partial exports loan_approval_0..4.csv.
# The names df0..df4 are kept because the following cell concatenates the
# frames by name.
df0, df1, df2, df3, df4 = (
    pd.read_csv(f'data/loan_approval_{part}.csv', delimiter='|')
    for part in range(5)
)

### [KR02] Zusammenfassung der DataFrames


In [None]:
# Stack the five partial frames row-wise; ignore_index=True gives the
# combined frame a fresh running index instead of repeating the per-file ones.
frames = [df0, df1, df2, df3, df4]
df = pd.concat(objs=frames, axis=0, ignore_index=True)

# Show the combined frame as the cell output.
df

### [KR03] Identifikation von Datenproblemen


In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns.
df.describe(include="all")

In [None]:
# Print the distinct raw values of three categorical columns to inspect
# which spellings occur.
print(df['Gender'].unique())
print(df['Married'].unique())
print(df['Education'].unique())

In [None]:
# Rows with a missing gender. pandas reads empty/NA CSV fields as NaN, not as
# the string 'nan', so an equality test against 'nan' matches nothing;
# isna() is the correct selector for missing values.
df.loc[
    df['Gender'].isna()
]

In [None]:
# Fully identical rows; keep='first' flags every occurrence after the first.
duplicateRows = df[df.duplicated(keep='first')]

duplicateRows

In [None]:
# Drop the duplicate rows from df, then re-check that none are left.
# NOTE(review): df is overwritten here, so the duplicate listing in the later
# [KR05] cell runs on already de-duplicated data when cells run top to bottom.
df = df.drop_duplicates()
duplicateRows = df[df.duplicated()]

duplicateRows

In [None]:
# Distinct values of the "Loan_ID repeats an earlier row" flag; a True in the
# output means at least one Loan_ID occurs more than once.
df["Loan_ID"].duplicated().unique()

In [None]:
# Number of missing values per column.
df.isna().sum()

### [KR04] Visualisierung der Zielklasse


In [None]:
# Domain notes:
# Credit history: how the person handled loans and debts in the past
#   (late payments, credit limits, etc.).
# Income: how much money a person earns; gives lenders insight into the
#   person's financial stability and ability to repay loans.
# CoApplicant_Income: income of the co-applicant (second person)
#   --> contributes to the overall creditworthiness.
# Loan Amount: amount of the loan.
# Loan Amount Term: term of the loan.
#
# IMPORTANT for determining the target class.
# TODO: add the incomes together.

# Loan_Status is the target class: count the rows per class and draw the
# counts as a bar chart.
loan_status_counts = df['Loan_Status'].value_counts()

fig, ax = plt.subplots()
# Cast the class values to str so they are drawn as discrete categories.
ax.bar(loan_status_counts.index.astype(str), loan_status_counts.values)
ax.set_title('Verteilung der Zielklasse (Loan_Status)')
ax.set_xlabel('Loan_Status')
ax.set_ylabel('Anzahl')
plt.show()

### [KR05] Anzahl Duplikate


In [None]:
# keep='first' flags every repeat of a row after its first occurrence, so the
# length of this selection is the number of redundant rows.
# NOTE(review): an earlier cell in [KR03] already reassigns
# df = df.drop_duplicates(), so this reports 0 when cells run top to bottom.
duplicateRows = df[df.duplicated(keep='first')]
print('Anzahl Duplikate:', len(duplicateRows))

# keep=False flags every member of a duplicate group exactly once.
# Concatenating the keep='first' and keep='last' selections instead would
# list the middle rows of groups with three or more copies twice.
dfdp = df[df.duplicated(keep=False)].sort_values('Loan_ID')
dfdp

In [None]:
# Remove exact duplicate rows (the first occurrence is kept) and print the
# frame's shape before and after to show how many rows were dropped.
shape_before = df.shape
print(shape_before)
df = df.drop_duplicates(keep='first')
print(df.shape)

### [KR06] Kredite in der urbanen Gegend


In [None]:
# Open questions: what exactly does "urban" cover (only the Core Central
# Region?), and does "loans" refer to the credit history or to the
# creditworthiness?

# Rows whose property lies in district 'CCR'.
urban_region = df.loc[df['Property_District'] == 'CCR']

# Build the credit-history mask from urban_region itself. A mask computed on
# the full df has a different index than the subset, so pandas has to reindex
# it and emits a "Boolean Series key will be reindexed" UserWarning.
urban_region[urban_region['Credit History'] == 1]


### [KR07] Durchschnittseinkommen


In [None]:
# [KR07] asks for the average income, so average the Applicant_Income column
# (rounded to a whole number) instead of Loan_Amount.
# NOTE(review): the income outliers are only replaced later in [KR09], so
# this mean still includes them.
average_income = round(df['Applicant_Income'].mean())
average_income

### [KR08] Bildungsstatus höchstes Einkommen


In [None]:
# Highest applicant income per education group. The heading and the printed
# labels both refer to income, so the maximum is taken over Applicant_Income
# rather than Loan_Amount.
graduated_max_income = df.loc[df['Education'] == 'Graduate', 'Applicant_Income'].max()
not_graduated_max_income = df.loc[df['Education'] == 'Not Graduate', 'Applicant_Income'].max()

print('Höchstes Einkommen von Absolventen :', graduated_max_income)
print('Höchstes Einkommen von nicht-Absolventen:', not_graduated_max_income)

### [KR09] Identifikation von Ausreißern beim Einkommen


In [None]:
# Summary statistics of Applicant_Income as a first look for outliers.
df['Applicant_Income'].describe()

In [None]:
# Box plot of the applicant incomes as they currently stand in df.
data = df['Applicant_Income']

fig, ax = plt.subplots()
ax.boxplot(data)
ax.set(title='Applicant Income')

plt.show()

In [None]:
# Rows whose applicant income is above 200000 or negative.
df.loc[
    (df['Applicant_Income'] > 200000) | (df['Applicant_Income'] < 0)
]

In [None]:
# Median of Applicant_Income, computed before the replacement in the next
# cell, i.e. with the flagged rows still included.
median_app_income = df['Applicant_Income'].median()
print(median_app_income)

In [None]:
# Overwrite the flagged incomes (> 200000 or < 0) in place with that median.
df.loc[(df['Applicant_Income'] > 200000) | (df['Applicant_Income'] < 0), 'Applicant_Income'] = median_app_income

In [None]:
# Same box plot again, drawn from the current state of df so it can be
# compared with the earlier one.
data = df['Applicant_Income']

fig, ax = plt.subplots()
ax.boxplot(data)
ax.set(title='Applicant Income')

plt.show()

In [None]:
# Summary statistics of Applicant_Income once more, for comparison.
df['Applicant_Income'].describe()

In [None]:
# Co-applicant: dtype and non-null count of the CoApplicant_Income column.
df["CoApplicant_Income"].info()

In [None]:
# Number of missing CoApplicant_Income values.
df["CoApplicant_Income"].isna().sum()

In [None]:
# Median of the co-applicant incomes (median() skips NaN entries by default).
# The following cell fills missing CoApplicant_Income values with this
# number, so it has to come from CoApplicant_Income, not Applicant_Income.
median_co_income = df['CoApplicant_Income'].median()
print(median_co_income)

In [None]:
# Fill missing co-applicant incomes with median_co_income.
df['CoApplicant_Income'] = df['CoApplicant_Income'].fillna(median_co_income)

In [None]:
# Re-count the missing values; the fill above should leave none.
df["CoApplicant_Income"].isna().sum()

In [None]:
# Median of the column after filling.
df["CoApplicant_Income"].median()

In [None]:
# Box plot of the co-applicant incomes; every value is passed through int()
# before plotting.
data = df['CoApplicant_Income'].apply(int)

fig, ax = plt.subplots()
ax.boxplot(data)
ax.set(title='CoApplicant Income')

plt.show()

### [KR10] Co-Antragsteller-Vergleich


In [None]:
# Side-by-side box plots of applicant and co-applicant income; the two
# selected columns are handed to boxplot() as one frame.
income_columns = df[['Applicant_Income', 'CoApplicant_Income']]

fig, ax = plt.subplots()
ax.boxplot(income_columns, labels=['Antragssteller', 'Co-Antragssteller'])
ax.set(title='Gegenüberstellung Einkommen', ylabel="Einkommen")

plt.show()

### [KR11] Verteilung der Kredithöhe zum Bildungsabschluss


In [None]:
# Pull the two columns compared in this section into their own names.
education=df['Education']
# The next two lines are bare expressions in the middle of the cell; their
# results are computed but not assigned to anything.
education.sample(3)
education.unique()
LoanAmount = df['Loan_Amount']
# Distinct loan amounts (the cell's final expression).
LoanAmount.unique()

In [None]:
# Scatter plot with the loan amount on the x axis and the education value on
# the y axis.
fig, ax = plt.subplots()
ax.scatter(x=LoanAmount, y=education)
ax.set(
    title="Korrelation education LoanAmount",
    xlabel="LoanAmount",
    ylabel="Education (graduated)",
)
plt.show()

In [None]:
# Split df by education: rows equal to 'Graduate' versus every other row
# (rows with a missing Education value land in the second group).
is_graduate = df['Education'] == 'Graduate'
df_graduated = df[is_graduate]
df_notGraduate = df[~is_graduate]
df_notGraduate

In [None]:
# One box plot of Loan_Amount per education group, drawn side by side.
fig, (ax, bx) = plt.subplots(ncols=2, figsize=(13,5))
for panel, subset, heading in (
    (ax, df_graduated, "Graduated"),
    (bx, df_notGraduate, "not Graduated"),
):
    panel.boxplot(subset['Loan_Amount'])
    panel.set_title(heading)
    panel.set_ylabel("LoanAmount")
plt.show()

In [None]:
# Mean Loan_Amount of each education group.
averageGraduated, averageNotGraduate = (
    subset['Loan_Amount'].mean() for subset in (df_graduated, df_notGraduate)
)
print("Average loan of graduated:", averageGraduated)
print("Average loan of not graduated:", averageNotGraduate)

### [KR12] Kreditstatus basierend auf Kredithöhe


In [None]:
# Column labels of df.
df.keys()

In [None]:
# Distinct values of the target column.
df['Loan_Status'].unique()

In [None]:
# Distinct loan amounts.
df['Loan_Amount'].unique()

In [None]:
# Partition df by the target flag: rows with Loan_Status 1 are kept as
# loanGranted, rows with Loan_Status 0 as loanNotGranted.
loanGranted = df.loc[df['Loan_Status'] == 1]
loanNotGranted = df.loc[df['Loan_Status'] == 0]

In [None]:
# Box plots of Loan_Amount for the granted and the not-granted group.
amounts_by_status = [loanGranted['Loan_Amount'], loanNotGranted['Loan_Amount']]

fig, ax = plt.subplots()
ax.boxplot(amounts_by_status, labels=["granted loans", "not granted loans"])
ax.set(title="Granted Loans vs. Not-granted Loans", ylabel="Loan Amount")
plt.show()

### [KR13] Visualisierung der Zielvariable


In [None]:
# Rows whose applicant income is above 200000 or negative.
# NOTE(review): the same filter and replacement already run in [KR09]; if
# those cells were executed, this selection is expected to be empty.
df.loc[
    (df['Applicant_Income'] > 200000) | (df['Applicant_Income'] < 0)
]

In [None]:
# Replace any incomes above 200000 or below 0 with the current median of
# Applicant_Income.
total_income_median = df['Applicant_Income'].median()
df.loc[(df['Applicant_Income'] > 200000) | (df['Applicant_Income'] < 0), 'Applicant_Income'] = total_income_median

In [None]:
# Show the rows with index labels 2914 and 3518.
# NOTE(review): the labels are hard-coded; presumably these are the rows that
# held the replaced incomes - confirm, and note the lookup fails if either
# label is no longer in df (e.g. dropped as a duplicate).
df.loc[[2914, 3518]]

In [None]:
# Scatter of applicant vs. co-applicant income, coloured by Loan_Status:
# status 1 in green, status 0 in red.
no_loan_status_df = df[df["Loan_Status"] == 0]
loan_status_df = df[df["Loan_Status"] == 1]

fig, ax = plt.subplots()

# Draw the status-1 rows first, then the status-0 rows, so that overlap and
# legend order stay as before.
for subset, colour, legend_name in (
    (loan_status_df, "g", "Loan_Status"),
    (no_loan_status_df, "r", "No_Loan_Status"),
):
    ax.scatter(
        subset["Applicant_Income"],
        subset["CoApplicant_Income"],
        c=colour,
        s=1,
        label=legend_name,
    )

ax.set(
    xlabel="Applicant_Income (in Dollar)",
    ylabel="CoApplicant_Income (in Dollar)",
    title="Loan_Status by Applicant_Income and CoApplicant_Income",
)
ax.legend()
plt.show()

### [KR14] Höhe des Kredits basierend auf Ehestatus


In [None]:
# Unify the textual spellings found in Married to real booleans:
# 'True'/'Yes' become True, 'False'/'No' become False.
for spelling, flag in (('True', True), ('False', False), ('Yes', True), ('No', False)):
    df.loc[df['Married'] == spelling, 'Married'] = flag

# Distinct values that remain after the unification.
df['Married'].unique()

In [None]:
# Loan amounts split by marital status. Each group is selected with an
# explicit comparison against False / True.
loan_amounts = df['Loan_Amount']
not_married_df = loan_amounts[df["Married"] == False]
married_df = loan_amounts[df["Married"] == True]

fig, ax = plt.subplots(figsize=(5,6))

ax.boxplot([not_married_df, married_df])
ax.set_xticklabels(['Not Married', 'Married'])
ax.set(ylabel='Loan_Amount', title='Box Plot: Loan_Amount and Married')
plt.show()

### [KR15] Standardisierung Gender


In [None]:
# Every spelling handled for Gender, mapped onto one code: m / f / d.
gender_codes = {
    "male": "m",
    "M": "m",
    "Male": "m",
    "female": "f",
    "Female": "f",
    "F": "f",
    "-": "d",
}

print(df["Gender"].unique())
# Restrict the clean-up to the Gender column. Calling replace() on the whole
# DataFrame rewrites matching cells in every column; in particular, replacing
# the missing-value marker frame-wide turns the missing values of all other
# columns (including numeric ones) into the string "d".
# assign() returns a new frame, so df is rebound instead of modified in place.
df = df.assign(Gender=df["Gender"].replace(gender_codes).fillna("d"))
print(df["Gender"].unique())

# Count each code once; .get(..., 0) covers codes that do not occur at all.
gender_counts = df["Gender"].value_counts()
anzahl_m = gender_counts.get('m', 0)
anzahl_f = gender_counts.get('f', 0)
anzahl_d = gender_counts.get('d', 0)
print(f"Anzahl Männer: {anzahl_m}")
print(f"Anzahl Frauen: {anzahl_f}")
print(f"Anzahl Divers: {anzahl_d}")

### [KR16] Irrelevante Spalten


In [None]:
# Remove the Num_Kids column (listed under "irrelevant columns") and show
# the resulting frame.
df = df.drop(columns=["Num_Kids"])
df

### [KR17] Plotting distribution of Applicant Income


In [None]:
# Distinct applicant-income values.
df['Applicant_Income'].unique()

In [None]:
import math
# Square root of the number of rows; the histogram cell below uses the
# integer part of this value as its bin count.
math.sqrt(len(df['Applicant_Income']))

In [None]:
# Histogram of the applicant incomes with the median marked in orange.
# `math` is imported in the preceding cell.
incomes = df['Applicant_Income']
# Square-root rule: bin count = integer part of sqrt(number of observations).
bin_count = int(math.sqrt(len(incomes)))

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(incomes, bins=bin_count, density=False, edgecolor="black")
ax.axvline(incomes.median(), color="orange")
# The plot shows a distribution; the title previously read "Contribution".
ax.set_title("Distribution of Applicant Income")
ax.set_ylabel("Frequency")
ax.set_xlabel("Applicant Income in €")

plt.show()

### [KR18] Distribution of female Co-Applicants' Income


In [None]:
# Rows whose Gender equals 'f', and the CoApplicant_Income column of those
# rows.
female = df[df['Gender']=='f']
fCoApplicants = female['CoApplicant_Income']
fCoApplicants

In [None]:
# Histogram (top) and horizontal box plot (bottom) of the co-applicant income
# in the rows selected as female. `math` is imported in an earlier cell.
# The stray `from tkinter import VERTICAL` was dropped: the name is never
# used, and the import fails on Python installations without Tk.
fig, ax = plt.subplots(2,1, figsize=(6,4))
ax[0].hist(fCoApplicants, bins=int(math.sqrt(len(fCoApplicants))), density=False, edgecolor="black")
ax[0].axvline(fCoApplicants.median(), color="orange")
ax[0].set_title("Distribution of female Co-Applicant Income")
ax[0].set_ylabel("Frequency")
ax[0].set_xlabel("Co-Applicant Income in €")

ax[1].boxplot(fCoApplicants, vert=False)
ax[1].set_yticklabels(["female"])
ax[1].set_xlabel("Co-Applicant Income in €")

plt.show()

# KR19

#### Verteilung des Property_District bei Antragstellern mit Education=Graduate, deren Loan_Status nicht 0 ist


In [None]:
# df_complete is a second name for df (not a copy); later cells read from it.
df_complete = df

# Keep only graduates whose Loan_Status is not 0, restricted to the three
# columns this plot needs.
df_KR19 = df_complete[["Loan_Status", "Education", "Property_District"]]
df_KR19 = df_KR19[(df_KR19['Education'] == 'Graduate') & (df_KR19['Loan_Status'] != 0)]

# Number of remaining rows per district and loan status.
df_KR19t = df_KR19.pivot_table(index="Property_District", columns="Loan_Status", aggfunc="size")
ax = df_KR19t.plot(kind='bar', figsize=(6, 4))
ax.set_title("Verteilung der Erhaltenen Kredite von Absolventen pro Bezirk")
ax.set_xlabel("Bezirk")
ax.set_ylabel("Anzahl der Kreditnehmern")
# rotation is given in degrees: 45 tilts the labels, whereas .45 leaves them
# practically horizontal.
ax.set_xticklabels(df_KR19t.index, rotation=45)
# Horizontal grid lines only
ax.grid(axis='y')
# Draw the grid lines behind the bars
ax.set_axisbelow(True)
ax.legend(labels=["Approved"])
plt.show()

# KR20

#### Verteilung der Zielvariable (Loan_Status) und der Selbstständigkeit (Self_Employed)


In [None]:
# Readable labels for both columns. Each column is mapped on its own: with a
# frame-wide replace() the 0/1 rules meant for Loan_Status and the True/False
# rules meant for Self_Employed can hit each other's column, because 0 == False
# and 1 == True in Python.
# Values outside the mapping become NaN and are left out of the counts below.
df_KR20 = df_complete[["Loan_Status", "Self_Employed"]].copy()
df_KR20["Loan_Status"] = df_KR20["Loan_Status"].map({0: "Rejected", 1: "Approved"})
df_KR20["Self_Employed"] = df_KR20["Self_Employed"].map(
    {True: "Self_Employed", False: "Not_Self_Employed"}
)

# Frequency of each label (value_counts orders by frequency, most common first).
df_KR20l = df_KR20["Loan_Status"].value_counts()
df_KR20s = df_KR20["Self_Employed"].value_counts()

anzahl_l = df_KR20l.get("Approved", 0)
anzahl_s = df_KR20s.get("Self_Employed", 0)
print(anzahl_s)
print(anzahl_l)

fig, ax = plt.subplots(1,2, figsize=(15,3))

# Left panel: Self_Employed. The tick labels come from the counted index
# itself, so every bar is labelled with the category it really represents
# regardless of which category is more frequent.
ax[0].set_title("Verteilung von Variable 'Self_Employed'")
ax[0].set_xlabel("Self_Employed")
ax[0].set_ylabel("Häufigkeit")
ax[0].bar(df_KR20s.index, df_KR20s.values, color="navy")

# Right panel: Loan_Status as horizontal bars, labelled the same way.
ax[1].set_title("Verteilung von Variable 'Loan_Status'")
ax[1].set_xlabel("Häufigkeit")
ax[1].set_ylabel("Loan_Status")
ax[1].barh(df_KR20l.index, df_KR20l.values, color="navy")

plt.show()

# KR21 Verteilung der Selbstständigkeit je Property District


In [None]:
# Self-employment counts per property district.
df_KR21 = df_complete[["Property_District", "Self_Employed"]].copy()
# Relabel the booleans in the Self_Employed column only, so values in
# Property_District can never be touched by the replacement.
df_KR21["Self_Employed"] = (
    df_KR21["Self_Employed"]
    .replace(True, "Self_Employed")
    .replace(False, "Not_Self_Employed")
)

# Number of rows per district and employment label.
df_temp = df_KR21.pivot_table(index="Property_District", columns="Self_Employed", aggfunc="size")
ax = df_temp.plot(kind='bar', figsize=(6, 4))
ax.set_title("Verteilung von Employment je District")
ax.set_xlabel("Property_District")
ax.set_ylabel("Anzahl")
# rotation is given in degrees: 45 tilts the labels, whereas .45 leaves them
# practically horizontal.
ax.set_xticklabels(df_temp.index, rotation=45)
# Horizontal grid lines only
ax.grid(axis='y')
# Draw the grid lines behind the bars
ax.set_axisbelow(True)
plt.show()

### [KR23] Verteilung des Einkommens aller verheirateten Antragssteller


In [None]:
# dtype and non-null count of the Married column.
df['Married'].info()

In [None]:
# Distinct values of Married.
df['Married'].unique()

In [None]:
# Applicant incomes split by the Married flag (False / True); the married
# group is shown as the cell output.
unmarried_income_df = df[df["Married"] == False]['Applicant_Income']
married_income_df = df[df["Married"] == True]['Applicant_Income']
married_income_df

In [None]:
# Top: histogram of the married applicants' income with the median marked.
# Bottom: horizontal box plots comparing married and unmarried applicants.
fig, ax = plt.subplots(2,1, figsize=(7,7))

ax[0].hist(married_income_df, bins=70, edgecolor="black")

ax[0].axvline(married_income_df.median(), color="orange", label="Median")

# Title spelling corrected ("EInkommen ... verheirateter").
ax[0].set_title("Einkommen aller verheirateten Antragssteller")
ax[0].set_ylabel("Häufigkeit")
ax[0].set_xlabel("Einkommen")
ax[0].legend()

# The label order must match the order of the data passed to boxplot().
ax[1].boxplot([married_income_df, unmarried_income_df], vert=False)
ax[1].set_yticklabels(["Verheiratet", "Unverheiratet"])
ax[1].set_xlabel("Einkommen")
plt.show()

### [KR25] Integer Encoding


In [None]:
# Encode on a copy. A plain `df_encoded = df` only creates a second name for
# the same object, so the integer codes would overwrite the original
# categorical columns of df as well.
df_encoded = df.copy()

for column in ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_District'):
    # factorize() returns (codes, uniques); element [0] holds the integer
    # codes, numbered in order of first appearance. Missing values get -1.
    df_encoded[column] = pd.factorize(df[column])[0]


df_encoded

### [KR26] X und y Erstellung


In [None]:
# Remove the Loan_ID column from the encoded frame.
df_encoded = df_encoded.drop(columns=['Loan_ID'])


In [None]:
# Name of the target column and the columns used as model inputs.
target_value = 'Loan_Status'
feature_columns = [
    "Gender", "Married", "Dependent_No", "Education", "Self_Employed",
    "Applicant_Income", "CoApplicant_Income",
    "Loan_Amount", "Loan_Amount_Term", "Credit History",
    "Property_District",
]

# X: feature matrix (one column per feature); y: target series.
X = df_encoded.loc[:, feature_columns]
y = df_encoded.loc[:, target_value]

In [None]:
# Show the feature matrix.
X

In [None]:
# Show the target series.
y