In [None]:
# Import the raw data through the ucimlrepo library
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 144
statlog_german_credit_data = fetch_ucirepo(id=144)

# Data (as pandas dataframes): features in X, targets in y
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)

# Print the dataset metadata first, then the variable information
for section in (statlog_german_credit_data.metadata, statlog_german_credit_data.variables):
    print(section)

In [None]:
# Combine features and target into one raw DataFrame (optionally exported as CSV)
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Reuse the dataset object fetched in the first cell: calling fetch_ucirepo(id=144)
# a second time only repeats the same fetch and yields identical data.
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

# Column-wise concatenation: the feature columns followed by the target column(s)
df = pd.concat([X, y], axis=1)

# Uncomment to save the raw data to CSV
#df.to_csv("german_credit_raw_data.csv", index=False)

# Preview the first rows of the combined DataFrame
df.head()

In [None]:
# Rename the coded columns with the descriptions from the metadata; the result is df_renamed
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Step 1: Reuse the dataset object fetched earlier in the notebook instead of
# calling fetch_ucirepo(id=144) yet again (same id, same data).
dataset = statlog_german_credit_data

# Step 2: Combine features and target into one DataFrame
df = pd.concat([dataset.data.features, dataset.data.targets], axis=1)

# Step 3: Extract the metadata table
meta = dataset.variables  # contains name, description, type, etc.

# Step 4: Map coded column names to full descriptions.
# Variables without a usable (non-empty string) description keep their coded name;
# otherwise rename() would produce columns literally named None/NaN.
column_description_map = {
    name: description
    for name, description in zip(meta['name'], meta['description'])
    if isinstance(description, str) and description.strip()
}
df_renamed = df.rename(columns=column_description_map)

# Show first few rows
df_renamed.head()


In [None]:
# Replace the long descriptive column names with short lowercase names
column_rename_map = {
    'Status of existing checking account': 'status_checking_account',
    'Duration': 'duration_in_month',
    'Credit history': 'credit_history',
    'Purpose': 'purpose',
    'Credit amount': 'credit_amount',
    'Savings account/bonds': 'savings_account/bonds',
    'Other debtors / guarantors': 'other_debtors_/_guarantors',
    'Present employment since': 'present_employment',
    'Property': 'property',
    'Age': 'age',
    'Housing': 'housing',
    'Telephone': 'telephone',
    'foreign worker': 'foreign_worker',
    'Installment rate in percentage of disposable income': 'installment_rate',
    'Personal status and sex': 'gender_status',
    'Present residence since': 'present_residence',
    'Number of existing credits at this bank': 'existing_credits',
    'Job': 'job_skill',
    'Number of people being liable to provide maintenance for': 'number_of_dependents',
    '1 = Good, 2 = Bad': 'score',
    'Other installment plans': 'other_installment',
}
df_renamed = df_renamed.rename(columns=column_rename_map)

# Print the column summary to confirm the new names
df_renamed.info()

In [None]:
# Lookup tables that translate the dataset's category codes into readable labels,
# following the metadata. One table per categorical column.

# Credit history (codes A30-A34)
credit_history_map = dict(
    A30='no credits / all paid back',
    A31='all credits at this bank paid back',
    A32='existing credits paid back duly',
    A33='delays in past payments',
    A34='critical account / other credits exist',
)

# Purpose of the loan (codes A40-A49 and A410)
purpose_map = dict(
    A40='car (new)',
    A41='car (used)',
    A42='furniture/equipment',
    A43='radio/television',
    A44='domestic appliances',
    A45='repairs',
    A46='education',
    A47='vacation',
    A48='retraining',
    A49='business',
    A410='others',
)

# Savings account / bonds (codes A61-A65)
savings_map = dict(
    A61='< 100 DM',
    A62='100 <= ... < 500 DM',
    A63='500 <= ... < 1000 DM',
    A64='>= 1000 DM',
    A65='unknown / no savings',
)

# Present employment since (codes A71-A75)
employment_map = dict(
    A71='unemployed',
    A72='< 1 year',
    A73='1 <= ... < 4 years',
    A74='4 <= ... < 7 years',
    A75='>= 7 years',
)

# Personal status and sex (codes A91-A95)
personal_status_sex_map = dict(
    A91='male : divorced/separated',
    A92='female : divorced/separated/married',
    A93='male : single',
    A94='male : married/widowed',
    A95='female : single',
)

# Other debtors / guarantors (codes A101-A103)
other_debtors_map = dict(
    A101='none',
    A102='co-applicant',
    A103='guarantor',
)

# Property (codes A121-A124)
property_map = dict(
    A121='real estate',
    A122='building society / life insurance',
    A123='car or other',
    A124='unknown / no property',
)

# Other installment plans (codes A141-A143)
other_installment_plans_map = dict(
    A141='bank',
    A142='stores',
    A143='none',
)

# Housing (codes A151-A153)
housing_map = dict(
    A151='rent',
    A152='own',
    A153='for free',
)

# Job (codes A171-A174)
job_map = dict(
    A171='unemployed / unskilled – non-resident',
    A172='unskilled – resident',
    A173='skilled employee / official',
    A174='management / self-employed / highly qualified / officer',
)

# Telephone (codes A191-A192)
telephone_map = dict(
    A191='none',
    A192='yes, registered under customer',
)

# Foreign worker (codes A201-A202)
foreign_worker_map = dict(
    A201='yes',
    A202='no',
)

# Status of the existing checking account (codes A11-A14)
status_checking_account_map = dict(
    A11='< 0 DM',
    A12='0 <= ... < 200 DM',
    A13='>= 200 DM / salary assignment',
    A14='no checking account',
)

In [None]:
# Decode the categorical codes (e.g. 'A30') of each column into readable labels.
#
# Series.map(dict) turns every value that is missing from the dict into NaN, so
# running this cell a second time (when the columns already hold labels) would wipe
# all of them. Falling back to the current value makes the cell safe to re-run and
# keeps any unexpected code visible instead of silently replacing it with NaN.
column_code_maps = {
    'credit_history': credit_history_map,
    'purpose': purpose_map,
    'savings_account/bonds': savings_map,
    'present_employment': employment_map,
    'gender_status': personal_status_sex_map,
    'other_debtors_/_guarantors': other_debtors_map,
    'property': property_map,
    'other_installment': other_installment_plans_map,
    'housing': housing_map,
    'job_skill': job_map,
    'telephone': telephone_map,
    'foreign_worker': foreign_worker_map,
    'status_checking_account': status_checking_account_map,
}

for column_name, code_map in column_code_maps.items():
    # code_map is bound as a default argument so each lambda uses its own table
    df_renamed[column_name] = df_renamed[column_name].map(
        lambda value, code_map=code_map: code_map.get(value, value)
    )

In [None]:
# Translate the numeric score codes into labels
score_mapping = {1: 'good', 2: 'bad'}

# Apply the mapping to the 'score' column.
# Values that are not a key of score_mapping are kept as they are: a plain
# .map(score_mapping) would turn the already-decoded 'good'/'bad' labels into NaN
# if this cell were run a second time.
df_renamed['score'] = df_renamed['score'].map(
    lambda value: score_mapping.get(value, value)
)

# Verify the change
print(df_renamed['score'].value_counts())

In [None]:
# Print the column summary of df_renamed (dtypes and non-null counts)
df_renamed.info()

In [None]:
# EDA for the relevant categorical columns:
## Credit history plotted against the score column
import seaborn as sns
import matplotlib.pyplot as plt

# One bar per credit-history category and score value
ax = sns.countplot(data=df_renamed, x='credit_history', hue='score')
# Rotate the category labels so the long names stay readable
plt.setp(ax.get_xticklabels(), rotation=85)
plt.show()

In [None]:
# Credit history: compare each category's applicant count with the mean count per
# category, and each category's bad rate with the average bad rate.

# Applicants per credit-history category
credit_history_status = df_renamed['credit_history'].value_counts().reset_index()
credit_history_status.columns = ['credit_history', 'count']

# Mean number of applicants per category
avg_count = credit_history_status['count'].mean()

# Signed distance to the mean count, tagged as above / below / average
credit_history_status['vs_avg_count'] = (credit_history_status['count'] - avg_count).apply(
    lambda x: f"{x:+.1f} {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Share of "bad" scores within each category, in percent
bad_rates = df_renamed.groupby('credit_history')['score'].apply(
    lambda x: (x == "bad").mean() * 100
).reset_index(name='bad_rate')

# Merge count and bad rate stats
credit_history_status = credit_history_status.merge(bad_rates, on='credit_history')

# Average bad rate weighted by category size. A plain mean of the category rates
# would let a tiny category weigh as much as the largest one; the weighted value is
# the benchmark the plot cell draws as "Avg Bad Rate".
avg_bad_rate = (
    (credit_history_status['bad_rate'] * credit_history_status['count']).sum()
    / credit_history_status['count'].sum()
)
credit_history_status['vs_avg_bad_rate'] = (credit_history_status['bad_rate'] - avg_bad_rate).apply(
    lambda x: f"{x:+.1f}% {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Sort by count descending
credit_history_status = credit_history_status.sort_values('count', ascending=False)

# Header names the analysed column (it previously said "Purpose")
print("\nCredit History Analysis with Average Comparisons:")
print(credit_history_status.to_string(index=False))

In [None]:
## Next, we define a threshold to identify the risky groups.
## From a business perspective, every category whose bad rate is higher than the average bad rate is classified as part of the risk group.
## The average bad rate is weighted by the size of each category (i.e. by its value counts).
### Conclusion: the only group classified as a risk group is "existing credits paid back duly", because it has a high volume and a bad rate higher than the average.

In [None]:

## Credit history: applicant count per category (bars) next to the average count,
## overlaid with each category's bad rate (line) and the overall bad rate
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Per-category row count and share of "bad" scores (in percent), largest category first
credit_history_stats = (
    df_renamed.groupby('credit_history')['score']
    .agg(count='size', bad_rate=lambda scores: (scores == "bad").mean() * 100)
    .reset_index()
    .sort_values('count', ascending=False)
)
n_categories = len(credit_history_stats)

# Benchmarks: mean category size and the share of "bad" scores over all rows
avg_count = credit_history_stats['count'].mean()
avg_bad_rate = (df_renamed['score'] == "bad").mean() * 100

# One colour per category, sampled evenly from a custom colormap
colors = ['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f']
cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_categories)
bar_colors = [cmap(fraction) for fraction in np.linspace(0, 1, n_categories)]

# White canvas without grid lines
fig, ax1 = plt.subplots(figsize=(12, 6))
fig.patch.set_facecolor('white')
ax1.set_facecolor('white')
ax1.grid(False)

# Slots 0..n-1 hold the categories; the extra last slot holds the average
x_pos = np.arange(n_categories + 1)
category_slots, average_slot = x_pos[:-1], x_pos[-1]
categories = [*credit_history_stats['credit_history'], 'Average']

# Category bars, one colour each
bars = ax1.bar(
    category_slots, credit_history_stats['count'],
    color=bar_colors, alpha=0.8, width=0.6, edgecolor='white', linewidth=1,
)

# Hatched grey bar for the average count
avg_bar = ax1.bar(
    average_slot, avg_count,
    color='lightgray', alpha=0.9, width=0.6,
    hatch='xxx', edgecolor='dimgray', linewidth=1.5,
    label=f'Average Count ({avg_count:.1f})',
)

# Bad rates go on a second y-axis that shares the x-axis
ax2 = ax1.twinx()
ax2.grid(False)
line = ax2.plot(
    category_slots, credit_history_stats['bad_rate'],
    color='red', marker='o', markersize=8, linewidth=2, label='Bad Rate',
)

# Hollow square marking the overall bad rate in the average slot
avg_point = ax2.plot(
    average_slot, avg_bad_rate, 's',
    markersize=10, color='white',
    markeredgecolor='red', markeredgewidth=2,
    label=f'Avg Bad Rate ({avg_bad_rate:.1f}%)',
)

# Write each value just above its bar: integers for categories, one decimal for the average
value_label_style = dict(ha='center', va='bottom', fontsize=9, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 5,
             f'{int(height)}', **value_label_style)

average_patch = avg_bar[0]
ax1.text(average_patch.get_x() + average_patch.get_width()/2.,
         avg_count + 5, f'{avg_count:.1f}', **value_label_style)

# Tick labels, axis labels and axis colours
ax1.set_xticks(x_pos)
ax1.set_xticklabels(categories, rotation=45, ha='right', fontsize=10)
for axis, axis_label, tint in ((ax1, 'Number of Applicants', 'dimgray'),
                               (ax2, 'Bad Rate (%)', 'red')):
    axis.set_ylabel(axis_label, color=tint, fontsize=12)
    axis.tick_params(axis='y', labelcolor=tint)

# Dotted reference line at the overall bad rate
ax2.axhline(avg_bad_rate, color='red', linestyle=':',
            linewidth=1.5, alpha=0.7)

# Single legend combining the entries of both axes
legend_handles, legend_labels = [], []
for axis in (ax1, ax2):
    axis_handles, axis_labels = axis.get_legend_handles_labels()
    legend_handles += axis_handles
    legend_labels += axis_labels
ax1.legend(legend_handles, legend_labels,
           loc='upper left', framealpha=1)

plt.title('Credit History Analysis: Volume vs. Bad Rate',
          fontsize=14, pad=20, fontweight='bold')
fig.tight_layout()
fig.savefig("credit_history_analysis.png", dpi=300)

plt.show()

In [None]:
# Let's do the same analysis for the credit purpose:

In [None]:
# Count of applicants per loan purpose, split by score
ax = sns.countplot(data=df_renamed, x='purpose', hue='score')
# Rotate the category labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=85)
plt.show()

In [None]:
# Purpose: compare each category's applicant count with the mean count per category,
# and each category's bad rate with the average bad rate.

# Applicants per purpose category
purpose_stats = df_renamed['purpose'].value_counts().reset_index()
purpose_stats.columns = ['purpose', 'count']

# Mean number of applicants per category
avg_count = purpose_stats['count'].mean()

# Signed distance to the mean count, tagged as above / below / average
purpose_stats['vs_avg_count'] = (purpose_stats['count'] - avg_count).apply(
    lambda x: f"{x:+.1f} {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Share of "bad" scores within each category, in percent
bad_rates = df_renamed.groupby('purpose')['score'].apply(
    lambda x: (x == "bad").mean() * 100
).reset_index(name='bad_rate')

# Merge count and bad rate stats
purpose_stats = purpose_stats.merge(bad_rates, on='purpose')

# Average bad rate weighted by category size. A plain mean of the category rates
# would let a tiny category weigh as much as the largest one; the weighted value is
# the benchmark the plot cell draws as "Avg Bad Rate".
avg_bad_rate = (
    (purpose_stats['bad_rate'] * purpose_stats['count']).sum()
    / purpose_stats['count'].sum()
)
purpose_stats['vs_avg_bad_rate'] = (purpose_stats['bad_rate'] - avg_bad_rate).apply(
    lambda x: f"{x:+.1f}% {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Sort by count descending
purpose_stats = purpose_stats.sort_values('count', ascending=False)

print("\nPurpose Analysis with Average Comparisons:")
print(purpose_stats.to_string(index=False))

In [None]:
## Purpose: applicant count per category (bars) next to the average count,
## overlaid with each category's bad rate (line) and the overall bad rate
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Per-category row count and share of "bad" scores (in percent), largest category first
stats = df_renamed.groupby('purpose')['score'].agg(
    count='size',
    bad_rate=lambda x: (x == "bad").mean() * 100
).reset_index().sort_values('count', ascending=False)

# Benchmarks: mean category size and the share of "bad" scores over all rows
avg_count = stats['count'].mean()
avg_bad_rate = (df_renamed['score'] == "bad").mean() * 100

# One colour per category, sampled evenly from a custom colormap
colors = ['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f']
cmap = LinearSegmentedColormap.from_list('custom', colors, N=len(stats))

# White canvas without grid lines
fig, ax1 = plt.subplots(figsize=(12, 6))
fig.patch.set_facecolor('white')
ax1.set_facecolor('white')
ax1.grid(False)

# Slots 0..n-1 hold the categories; the extra last slot holds the average
x_pos = np.arange(len(stats)+1)
categories = list(stats['purpose']) + ['Average']

# Category bars, one colour each
bars = ax1.bar(x_pos[:-1], stats['count'],
               color=[cmap(i) for i in np.linspace(0, 1, len(stats))],
               alpha=0.8, width=0.6, edgecolor='white', linewidth=1)

# Hatched grey bar for the average count
avg_bar = ax1.bar(x_pos[-1], avg_count,
                  color='lightgray', alpha=0.9, width=0.6,
                  hatch='xxx', edgecolor='dimgray', linewidth=1.5,
                  label=f'Average Count ({avg_count:.1f})')

# Bad rates go on a second y-axis that shares the x-axis
ax2 = ax1.twinx()
ax2.grid(False)
line = ax2.plot(x_pos[:-1], stats['bad_rate'],
                color='red', marker='o', markersize=8,
                linewidth=2, label='Bad Rate')

# Hollow square marking the overall bad rate in the average slot
avg_point = ax2.plot(x_pos[-1], avg_bad_rate, 's',
                     markersize=10, color='white',
                     markeredgecolor='red', markeredgewidth=2,
                     label=f'Avg Bad Rate ({avg_bad_rate:.1f}%)')

# Write each count just above its bar (the loop index was never used, so iterate directly)
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 5,
             f'{int(height)}', ha='center', va='bottom',
             fontsize=9, fontweight='bold')

# Average value label, with one decimal
ax1.text(avg_bar[0].get_x() + avg_bar[0].get_width()/2.,
         avg_count + 5, f'{avg_count:.1f}',
         ha='center', va='bottom', fontsize=9, fontweight='bold')

# Tick labels, axis labels and axis colours
ax1.set_xticks(x_pos)
ax1.set_xticklabels(categories, rotation=45, ha='right', fontsize=10)
ax1.set_ylabel('Number of Applicants', color='dimgray', fontsize=12)
ax2.set_ylabel('Bad Rate (%)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='dimgray')
ax2.tick_params(axis='y', labelcolor='red')

# Dotted reference line at the overall bad rate
ax2.axhline(avg_bad_rate, color='red', linestyle=':',
            linewidth=1.5, alpha=0.7)

# Single legend combining the entries of both axes
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(handles1 + handles2, labels1 + labels2,
           loc='upper left', framealpha=1)

plt.title('Purpose of loan Analysis: Volume vs. Bad Rate',
          fontsize=14, pad=20, fontweight='bold')
plt.tight_layout()
# File name corrected from "purpose_of_loan_nalysis.png" (missing "a") so it follows
# the "<topic>_analysis.png" pattern used for the other saved charts.
plt.savefig("purpose_of_loan_analysis.png", dpi=300)

plt.show()

In [None]:
## Analysing the checking account status

In [None]:
# Count of applicants per checking-account status, split by score
ax = sns.countplot(data=df_renamed, x='status_checking_account', hue='score')
# Rotate the category labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=85)
plt.show()

In [None]:
# Checking account status: compare each category's applicant count with the mean
# count per category, and each category's bad rate with the average bad rate.

# Applicants per checking-account category
status_checking_account_stats = df_renamed['status_checking_account'].value_counts().reset_index()
status_checking_account_stats.columns = ['status_checking_account', 'count']

# Mean number of applicants per category
avg_count = status_checking_account_stats['count'].mean()

# Signed distance to the mean count, tagged as above / below / average
status_checking_account_stats['vs_avg_count'] = (
    status_checking_account_stats['count'] - avg_count
).apply(
    lambda x: f"{x:+.1f} {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Share of "bad" scores within each category, in percent
bad_rates = df_renamed.groupby('status_checking_account')['score'].apply(
    lambda x: (x == "bad").mean() * 100
).reset_index(name='bad_rate')

# Merge count and bad rate stats
status_checking_account_stats = status_checking_account_stats.merge(bad_rates, on='status_checking_account')

# Average bad rate weighted by category size. A plain mean of the category rates
# would let a tiny category weigh as much as the largest one; the weighted value is
# the benchmark the plot cell draws as "Avg Bad Rate".
avg_bad_rate = (
    (status_checking_account_stats['bad_rate'] * status_checking_account_stats['count']).sum()
    / status_checking_account_stats['count'].sum()
)
status_checking_account_stats['vs_avg_bad_rate'] = (
    status_checking_account_stats['bad_rate'] - avg_bad_rate
).apply(
    lambda x: f"{x:+.1f}% {'(above)' if x > 0 else '(below)' if x < 0 else '(average)'}"
)

# Sort by count descending
status_checking_account_stats = status_checking_account_stats.sort_values('count', ascending=False)

print("\nStatus Checking Account Analysis with Average Comparisons:")
print(status_checking_account_stats.to_string(index=False))

In [None]:
## Checking account: applicant count per category (bars) next to the average count,
## overlaid with each category's bad rate (line) and the overall bad rate
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Per-category row count and share of "bad" scores (in percent), largest category first
stats = (
    df_renamed.groupby('status_checking_account')['score']
    .agg(count='size', bad_rate=lambda scores: (scores == "bad").mean() * 100)
    .reset_index()
    .sort_values('count', ascending=False)
)
n_categories = len(stats)

# Benchmarks: mean category size and the share of "bad" scores over all rows
avg_count = stats['count'].mean()
avg_bad_rate = (df_renamed['score'] == "bad").mean() * 100

# One colour per category, sampled evenly from a custom colormap
colors = ['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f']
cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_categories)
bar_colors = [cmap(fraction) for fraction in np.linspace(0, 1, n_categories)]

# White canvas without grid lines
fig, ax1 = plt.subplots(figsize=(12, 6))
fig.patch.set_facecolor('white')
ax1.set_facecolor('white')
ax1.grid(False)

# Slots 0..n-1 hold the categories; the extra last slot holds the average
x_pos = np.arange(n_categories + 1)
category_slots, average_slot = x_pos[:-1], x_pos[-1]
categories = [*stats['status_checking_account'], 'Average']

# Category bars, one colour each
bars = ax1.bar(
    category_slots, stats['count'],
    color=bar_colors, alpha=0.8, width=0.6, edgecolor='white', linewidth=1,
)

# Hatched grey bar for the average count
avg_bar = ax1.bar(
    average_slot, avg_count,
    color='lightgray', alpha=0.9, width=0.6,
    hatch='xxx', edgecolor='dimgray', linewidth=1.5,
    label=f'Average Count ({avg_count:.1f})',
)

# Bad rates go on a second y-axis that shares the x-axis
ax2 = ax1.twinx()
ax2.grid(False)
line = ax2.plot(
    category_slots, stats['bad_rate'],
    color='red', marker='o', markersize=8, linewidth=2, label='Bad Rate',
)

# Hollow square marking the overall bad rate in the average slot
avg_point = ax2.plot(
    average_slot, avg_bad_rate, 's',
    markersize=10, color='white',
    markeredgecolor='red', markeredgewidth=2,
    label=f'Avg Bad Rate ({avg_bad_rate:.1f}%)',
)

# Write each value just above its bar: integers for categories, one decimal for the average
value_label_style = dict(ha='center', va='bottom', fontsize=9, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 5,
             f'{int(height)}', **value_label_style)

average_patch = avg_bar[0]
ax1.text(average_patch.get_x() + average_patch.get_width()/2.,
         avg_count + 5, f'{avg_count:.1f}', **value_label_style)

# Tick labels, axis labels and axis colours
ax1.set_xticks(x_pos)
ax1.set_xticklabels(categories, rotation=45, ha='right', fontsize=10)
for axis, axis_label, tint in ((ax1, 'Number of Applicants', 'dimgray'),
                               (ax2, 'Bad Rate (%)', 'red')):
    axis.set_ylabel(axis_label, color=tint, fontsize=12)
    axis.tick_params(axis='y', labelcolor=tint)

# Dotted reference line at the overall bad rate
ax2.axhline(avg_bad_rate, color='red', linestyle=':',
            linewidth=1.5, alpha=0.7)

# Single legend combining the entries of both axes
legend_handles, legend_labels = [], []
for axis in (ax1, ax2):
    axis_handles, axis_labels = axis.get_legend_handles_labels()
    legend_handles += axis_handles
    legend_labels += axis_labels
ax1.legend(legend_handles, legend_labels,
           loc='upper left', framealpha=1)

plt.title('Checking Account Analysis: Volume vs. Bad Rate',
          fontsize=14, pad=20, fontweight='bold')
fig.tight_layout()
fig.savefig("checking_account_analysis.png", dpi=300)

plt.show()

In [None]:
# Save the charts: