# Notebook 01: Data Understanding

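In this notebook, we load the South German Credit dataset, inspect its structure, rename the columns to English, replace the numeric category codes with descriptive labels, generate a profiling report, and save the result as a Parquet file.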

In [1]:
from pathlib import Path

import pandas as pd

# Load the dataset (whitespace-separated values).
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`,
# which pandas deprecated in 2.2 and reports with a FutureWarning.
df = pd.read_csv('../data/SouthGermanCredit.asc', sep=r'\s+')

# Original column names (German)
print(df.columns.tolist())

['laufkont', 'laufzeit', 'moral', 'verw', 'hoehe', 'sparkont', 'beszeit', 'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'alter', 'weitkred', 'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb', 'kredit']


  df = pd.read_csv('../data/SouthGermanCredit.asc', delim_whitespace=True)


In [2]:
# Show the column index of the freshly loaded frame (names plus index dtype).
df.columns

Index(['laufkont', 'laufzeit', 'moral', 'verw', 'hoehe', 'sparkont', 'beszeit',
       'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'alter', 'weitkred',
       'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb', 'kredit'],
      dtype='object')

In [3]:
# Frequency of each code in the `rate` column, most frequent first.
df['rate'].value_counts()

rate
4    476
2    231
3    157
1    136
Name: count, dtype: int64

In [4]:
# Assign English column names.
# The mapping is keyed by the original German name so the result does not
# depend on the order (or number) of columns in the loaded file.
GERMAN_TO_ENGLISH = {
    "laufkont": "checking_account_status",
    "laufzeit": "duration_months",
    "moral": "credit_history",
    "verw": "purpose",
    "hoehe": "credit_amount",
    "sparkont": "savings_account_balance",
    "beszeit": "employment_duration",
    "rate": "installment_rate",
    "famges": "personal_status_sex",
    "buerge": "guarantors",
    "wohnzeit": "residence_since",
    "verm": "property",
    "alter": "age",
    "weitkred": "other_installment_plans",
    "wohn": "housing",
    "bishkred": "existing_credits_count",
    "beruf": "job",
    "pers": "dependents",
    "telef": "telephone",
    "gastarb": "foreign_worker",
    "kredit": "credit_risk",
}
# rename() ignores keys that are absent, so re-running this cell is a no-op.
df = df.rename(columns=GERMAN_TO_ENGLISH)

# Fail loudly if any expected column is still missing after the rename.
missing_columns = set(GERMAN_TO_ENGLISH.values()) - set(df.columns)
assert not missing_columns, f"columns not found after renaming: {sorted(missing_columns)}"

df.head()

Unnamed: 0,checking_account_status,duration_months,credit_history,purpose,credit_amount,savings_account_balance,employment_duration,installment_rate,personal_status_sex,guarantors,...,property,age,other_installment_plans,housing,existing_credits_count,job,dependents,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [5]:
# Replace numeric codes with descriptive labels based on codetable.
# One entry per coded column: {column name: {numeric code: label}}.
CODE_LABELS = {
    'checking_account_status': {
        1: 'no checking account', 2: '< 0 DM', 3: '0 <= ... < 200 DM', 4: '>= 200 DM or salary ≥1yr'
    },
    'credit_history': {
        0: 'delay in past payments', 1: 'critical/other credits', 2: 'no credits taken/all paid duly',
        3: 'existing credits paid duly', 4: 'all credits at bank paid duly'
    },
    'purpose': {
        0: 'others', 1: 'car (new)', 2: 'car (used)', 3: 'furniture/equipment', 4: 'radio/TV',
        5: 'appliances', 6: 'repairs', 7: 'education', 8: 'vacation', 9: 'retraining', 10: 'business'
    },
    'savings_account_balance': {
        1: 'no savings', 2: '< 100 DM', 3: '100-500 DM', 4: '500-1000 DM', 5: '>= 1000 DM'
    },
    'employment_duration': {
        1: 'unemployed', 2: '< 1 yr', 3: '1-4 yrs', 4: '4-7 yrs', 5: '>= 7 yrs'
    },
    'personal_status_sex': {
        1: 'male: divorced/separated',
        2: 'female: non-single or male: single',
        3: 'male: married/widowed',
        4: 'female: single'
    },
    'guarantors': {1: 'none', 2: 'co-applicant', 3: 'guarantor'},
    'property': {
        1: 'unknown/no property', 2: 'car/other', 3: 'insurance/savings', 4: 'real estate'
    },
    'other_installment_plans': {1: 'bank', 2: 'stores', 3: 'none'},
    'housing': {1: 'for free', 2: 'rent', 3: 'own'},
    'job': {
        1: 'unemployed/unskilled-nonresident', 2: 'unskilled-resident',
        3: 'skilled employee', 4: 'manager/self-employed/highly qualified'
    },
    'dependents': {1: '3 or more', 2: '0 to 2'},
    'telephone': {1: 'no', 2: 'yes'},
    'foreign_worker': {1: 'yes', 2: 'no'},
    'credit_risk': {1: 'good', 0: 'bad'},
    # Code 3 previously read '20<= ... > 25'; the upper bound is "< 25".
    'installment_rate': {1: '>=35', 2: '25<= ... < 35', 3: '20<= ... < 25', 4: '<20'},
    'residence_since': {1: '< 1yr', 2: '1 <= ... < 4 yrs', 3: '4 <= ... < 7 yrs', 4: '>= 7 yrs'},
    'existing_credits_count': {1: '1', 2: '2-3', 3: '4-5', 4: '>= 6'},
}

for column, labels in CODE_LABELS.items():
    observed = set(df[column].unique().tolist())

    # Already decoded (the cell was run before): mapping the labels again
    # would turn every value into NaN, so leave the column untouched.
    if observed <= set(labels.values()):
        continue

    # Series.map() silently yields NaN for keys missing from the dict;
    # surface unexpected codes instead of corrupting the column.
    unmapped = observed - set(labels)
    if unmapped:
        raise ValueError(
            f"{column}: codes without a label: {sorted(map(repr, unmapped))}"
        )

    df[column] = df[column].map(labels)



In [6]:
# Basic overview: row count, and per column the non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   checking_account_status  1000 non-null   object
 1   duration_months          1000 non-null   int64 
 2   credit_history           1000 non-null   object
 3   purpose                  1000 non-null   object
 4   credit_amount            1000 non-null   int64 
 5   savings_account_balance  1000 non-null   object
 6   employment_duration      1000 non-null   object
 7   installment_rate         1000 non-null   object
 8   personal_status_sex      1000 non-null   object
 9   guarantors               1000 non-null   object
 10  residence_since          1000 non-null   object
 11  property                 1000 non-null   object
 12  age                      1000 non-null   int64 
 13  other_installment_plans  1000 non-null   object
 14  housing                  1000 non-null   

In [7]:
# Number of missing values per column (printed as plain text) ...
missing_per_column = df.isna().sum()
print(missing_per_column)

# ... followed by the relative frequency of each target class.
df["credit_risk"].value_counts(normalize=True)

checking_account_status    0
duration_months            0
credit_history             0
purpose                    0
credit_amount              0
savings_account_balance    0
employment_duration        0
installment_rate           0
personal_status_sex        0
guarantors                 0
residence_since            0
property                   0
age                        0
other_installment_plans    0
housing                    0
existing_credits_count     0
job                        0
dependents                 0
telephone                  0
foreign_worker             0
credit_risk                0
dtype: int64


credit_risk
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [8]:
from ydata_profiling import ProfileReport

# Build a profiling report of the current frame, export it as HTML and
# embed it in the notebook.
REPORT_PATH = Path("./../reports/raw_data.html")
# Create the output directory up front so the export does not fail on a
# checkout where `reports/` does not exist yet.
REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

profile = ProfileReport(df, title="Profiling Report", explorative=True)
profile.to_file(REPORT_PATH)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:00<00:00, 48.07it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

- Categorical Variables:
    - checking_account_status
    - credit_history
    - purpose
    - savings_account_balance
    - employment_duration
    - installment_rate
    - **personal_status_sex** -> CRITICAL
    - guarantors
    - residence_since
    - property
    - other_installment_plans
    - housing
    - existing_credits_count
    - job
    - dependents

- Booleans:
    - telephone
    - **foreign_worker** -> CRITICAL
- Numerical:
    - duration_months
    - credit_amount
    - **age** -> CRITICAL
- Target:
    - credit_risk

In [None]:
# Persist the decoded, English-labelled frame as Parquet.
SILVER_PATH = Path('./../data/silver/SouthGermanCredit_en.parquet')
# to_parquet() does not create missing directories, so make sure the
# target folder exists before writing.
SILVER_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(SILVER_PATH, index=False)