# German Credit - 01 - Import

## Setup

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

from pprint import pprint
from IPython.display import display, Markdown

# pandas display: never elide columns, and allow very wide console output.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
# Uncomment to also lift the row limit:
# pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

# DEBUG gates the pprint dumps of the parsed code table and label mapping.
DEBUG = True
# Fixed seed constant (not referenced by the cells visible in this notebook).
SEED = 666

In [128]:
DATASET = "German Credit"

import os, sys

# True when the notebook is running inside Google Colab.
COLAB = 'google.colab' in sys.modules

# Base directory for this dataset's files; defaults to the working directory.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive unless the mount point is already present.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # os.makedirs creates every missing parent (including ".../datasets"), so no
  # separate step is needed for it. Previously that step concatenated the
  # relative ROOT with an absolute path and created a stray local directory.
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create sub-directory `d` below ROOT, doing nothing if it already exists.

  Args:
    d: directory name (or relative path) to create under the global ROOT.

  The same call is used locally and on Colab: 0o777 is already the default
  mode of os.makedirs, and exist_ok=True replaces the separate isdir() check,
  so there is no window between testing for the directory and creating it.
  """
  os.makedirs(os.path.join(ROOT, d), mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Datasets

In [129]:
import urllib.request

# Fetch the original UCI files once; later runs reuse the copies in orig/.
for filename in ["german.data", "german.doc", "german.data-numeric"]:
    source = f"https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/{filename}"
    target = f"{ROOT}/orig/{filename}"
    if os.path.isfile(target):
        print(f"Using local copy of {filename}")
        continue

    print(f"Downloading remote file {filename}")
    # Download under a temporary name and move it into place afterwards, so an
    # interrupted transfer cannot leave a truncated file that the isfile()
    # check above would accept as a valid local copy on the next run.
    partial = target + ".part"
    urllib.request.urlretrieve(source, partial)
    os.replace(partial, target)

Using local copy of german.data
Using local copy of german.doc
Using local copy of german.data-numeric


In [130]:
# german.data is space-separated and has no header row.
df = pd.read_csv(f"{ROOT}/orig/german.data", header=None, sep=" ")
print(df.shape)
df.head()

(1000, 21)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [131]:
# One name per column of german.data, in file order (21 columns in total).
df.columns = [
    "Account_Balance",
    "Loan_Duration",
    "Credit_History",
    "Loan_Purpose",
    "Loan_Amount",
    "Savings",
    "Years_Employed",
    "Installment_Rate",
    "Gender_and_Status",
    "Other_Debtors_Guarantors",
    "Year_of_Residence",
    "Property",
    "Age",
    "Installment_Plans",
    "Housing",
    "Existing_Credits",
    "Job",
    "Dependents",
    "Telephone",
    "Foreign",
    "Credit_Rating",
]

df.head(1)

Unnamed: 0,Account_Balance,Loan_Duration,Credit_History,Loan_Purpose,Loan_Amount,Savings,Years_Employed,Installment_Rate,Gender_and_Status,Other_Debtors_Guarantors,Year_of_Residence,Property,Age,Installment_Plans,Housing,Existing_Credits,Job,Dependents,Telephone,Foreign,Credit_Rating
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1


In [132]:
# Total number of missing cells across the whole frame.
df.isnull().sum().sum()

0

In [133]:
# Per-column dtype and non-null count of the raw frame after renaming.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Account_Balance           1000 non-null   object
 1   Loan_Duration             1000 non-null   int64 
 2   Credit_History            1000 non-null   object
 3   Loan_Purpose              1000 non-null   object
 4   Loan_Amount               1000 non-null   int64 
 5   Savings                   1000 non-null   object
 6   Years_Employed            1000 non-null   object
 7   Installment_Rate          1000 non-null   int64 
 8   Gender_and_Status         1000 non-null   object
 9   Other_Debtors_Guarantors  1000 non-null   object
 10  Year_of_Residence         1000 non-null   int64 
 11  Property                  1000 non-null   object
 12  Age                       1000 non-null   int64 
 13  Installment_Plans         1000 non-null   object
 14  Housing                  

## Clean

In [134]:
# Extract the "Axx : description" code table from the dataset documentation.
# The context manager closes the file handle once the lines have been read.
with open(f"{ROOT}/orig/german.doc") as stream:
    doc_lines = [raw.strip() for raw in stream]

# Result: a list of [code, label] pairs, e.g. ['A11', '... <    0 DM'].
# NOTE(review): only the line carrying the code is kept, so labels that appear
# to continue on a following line in the doc (those ending in "/") are
# truncated - confirm whether the continuation text is wanted.
lines = []
for text in doc_lines:
    # A code row starts with "A" plus a digit; requiring ":" keeps a stray
    # matching line without a separator from raising on the split below.
    if len(text) > 2 and text[0] == "A" and text[1].isdigit() and ":" in text:
        code, label = text.split(":", 1)
        lines.append([code.strip(), label.strip()])
if DEBUG:
    pprint(lines)

[['A11', '... <    0 DM'],
 ['A12', '0 <= ... <  200 DM'],
 ['A13', '... >= 200 DM /'],
 ['A14', 'no checking account'],
 ['A30', 'no credits taken/'],
 ['A31', 'all credits at this bank paid back duly'],
 ['A32', 'existing credits paid back duly till now'],
 ['A33', 'delay in paying off in the past'],
 ['A34', 'critical account/'],
 ['A40', 'car (new)'],
 ['A41', 'car (used)'],
 ['A42', 'furniture/equipment'],
 ['A43', 'radio/television'],
 ['A44', 'domestic appliances'],
 ['A45', 'repairs'],
 ['A46', 'education'],
 ['A47', '(vacation - does not exist?)'],
 ['A48', 'retraining'],
 ['A49', 'business'],
 ['A410', 'others'],
 ['A61', '... <  100 DM'],
 ['A62', '100 <= ... <  500 DM'],
 ['A63', '500 <= ... < 1000 DM'],
 ['A64', '.. >= 1000 DM'],
 ['A65', 'unknown/ no savings account'],
 ['A71', 'unemployed'],
 ['A72', '... < 1 year'],
 ['A73', '1  <= ... < 4 years'],
 ['A74', '4  <= ... < 7 years'],
 ['A75', '.. >= 7 years'],
 ['A91', 'male   : divorced/separated'],
 ['A92', 'female : div

In [135]:
# code -> "(code) label", so the original code stays visible in every label.
mapping = dict((code, f"({code}) {label}") for code, label in lines)
if DEBUG:
    pprint(mapping)

{'A101': '(A101) none',
 'A102': '(A102) co-applicant',
 'A103': '(A103) guarantor',
 'A11': '(A11) ... <    0 DM',
 'A12': '(A12) 0 <= ... <  200 DM',
 'A121': '(A121) real estate',
 'A122': '(A122) if not A121 : building society savings agreement/',
 'A123': '(A123) if not A121/A122 : car or other, not in attribute 6',
 'A124': '(A124) unknown / no property',
 'A13': '(A13) ... >= 200 DM /',
 'A14': '(A14) no checking account',
 'A141': '(A141) bank',
 'A142': '(A142) stores',
 'A143': '(A143) none',
 'A151': '(A151) rent',
 'A152': '(A152) own',
 'A153': '(A153) for free',
 'A171': '(A171) unemployed/ unskilled  - non-resident',
 'A172': '(A172) unskilled - resident',
 'A173': '(A173) skilled employee / official',
 'A174': '(A174) management/ self-employed/',
 'A191': '(A191) none',
 'A192': '(A192) yes, registered under the customers name',
 'A201': '(A201) yes',
 'A202': '(A202) no',
 'A30': '(A30) no credits taken/',
 'A31': '(A31) all credits at this bank paid back duly',
 'A32'

In [136]:
# Columns still stored as raw strings (the coded categorical attributes).
df.select_dtypes(include="object").columns

Index(['Account_Balance', 'Credit_History', 'Loan_Purpose', 'Savings', 'Years_Employed', 'Gender_and_Status', 'Other_Debtors_Guarantors', 'Property', 'Installment_Plans', 'Housing', 'Job', 'Telephone', 'Foreign'], dtype='object')

In [137]:
# Labelled copy of the raw frame: every coded column becomes a Categorical of
# human-readable "(code) label" strings; `df` itself is left untouched.
df_label = df.copy()
for col in df_label.select_dtypes("O").columns:
    df_label[col] = pd.Categorical(df_label[col].map(mapping))
    # A code absent from `mapping` maps to NaN, so this counts FAILED lookups;
    # 0 means every value in the column was translated.
    n_unmapped = df_label[col].isna().sum()
    print(f"{col} : {n_unmapped}")
    # Stop here rather than silently persisting NaN labels further down.
    assert n_unmapped == 0, f"{col}: {n_unmapped} value(s) have no entry in mapping"


Account_Balance : 0
Credit_History : 0
Loan_Purpose : 0
Savings : 0
Years_Employed : 0
Gender_and_Status : 0
Other_Debtors_Guarantors : 0
Property : 0
Installment_Plans : 0
Housing : 0
Job : 0
Telephone : 0
Foreign : 0


## Output

In [138]:
# Confirm the coded columns are now `category` dtype with no nulls introduced.
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Account_Balance           1000 non-null   category
 1   Loan_Duration             1000 non-null   int64   
 2   Credit_History            1000 non-null   category
 3   Loan_Purpose              1000 non-null   category
 4   Loan_Amount               1000 non-null   int64   
 5   Savings                   1000 non-null   category
 6   Years_Employed            1000 non-null   category
 7   Installment_Rate          1000 non-null   int64   
 8   Gender_and_Status         1000 non-null   category
 9   Other_Debtors_Guarantors  1000 non-null   category
 10  Year_of_Residence         1000 non-null   int64   
 11  Property                  1000 non-null   category
 12  Age                       1000 non-null   int64   
 13  Installment_Plans         1000 non-null   categor

In [139]:
# Persist the labelled frame (pickle keeps the categorical dtypes) ...
labeled_path = f"{ROOT}/data/labeled.pkl"
df_label.to_pickle(labeled_path)

# ... and the code -> label mapping as YAML alongside it.
mapping_path = f"{ROOT}/data/mapping.yaml"
with open(mapping_path, "wt") as stream:
    yaml.safe_dump(mapping, stream)