In [3]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.metrics import roc_curve, auc, confusion_matrix, roc_auc_score
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
xgboost.__version__
from scipy.linalg import svd
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA, NMF
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### Read and filter data

In [19]:
file_name = "./german.data"

#read file
all_data = pd.read_csv(
  file_name,
  sep = ' ',
  header=None
)

all_data.columns = ['checking_acct', 'duration_month', 'credit_hist',
                    'purpose', 'credit_amount', 'savings', 'employ_years',
                    'install_as_percent_of_disp_income', 'married_and_sex',
                    'other_debtors', 'resident_since', 'property',
                    'age', 'other_installments', 'housing', 'existing_credits',
                    'job', 'num_liable', 'telephone', 'foreign', 'ground_truth']

# translate from codes to descriptive things
all_data.replace({"checking_acct": {'A11' : 'lt_0',
 'A12' : '0_to_200',
 'A13' : 'geq_200',
 'A14' : 'no acct'}}, inplace=True)

all_data.replace({"credit_hist": {'A30' : 'no_cred_all_paid',
 'A31' : 'all_credit_bank_paid',
 'A32' : 'existing_credits_paid',
 'A33' : 'delay_in_past',
 'A34' : 'critical_accts_other_bank_credits'}}, inplace=True)

all_data.replace({"purpose": {
 'A40' : 'car_new',
 'A41' : 'car_used',
 'A42' : 'furniture_equipment',
 'A43' : 'radio_tv',
 'A44' : 'home_appliances',
 'A45' : 'repairs',
 'A46' : 'education',
 'A47' : 'vacation',
 'A48' : 'retraining',
 'A49' : 'business',
 'A410' : 'other'}}, inplace=True)

all_data.replace({"savings": {
 'A61' : 'lt_100',
 'A62' : 'lt_500',
 'A63' : '100_to_500',
 'A64' : '500_to_1000',
 'A65' : 'unknown_or_no_account'}}, inplace=True)

all_data.replace({"employ_years": {
 'A71' : 'unemployed',
 'A72' : 'lt_1',
 'A73' : '1_to_4',
 'A74' : '4_to_7',
 'A75' : 'geq_7'}}, inplace=True)

all_data.replace({"married_and_sex": {
 'A91' : 'male_div_sep',
 'A92' : 'female_div_sep_married',
 'A93' : 'male_single',
 'A94' : 'male_married_widowed',
 'A95' : 'female_single'}}, inplace=True)

all_data.replace({"other_debtors": {
 'A101' : 'none',
 'A102' : 'co_applicant',
 'A103' : 'guarantor'}}, inplace=True)

all_data.replace({"property": {
 'A121' : 'real_estate',
 'A122' : 'building_soc_saving_life_insurance',
 'A123' : 'car_other',
 'A124' : 'unknown_none'}}, inplace=True)

all_data.replace({"other_installments": {
 'A141' : 'bank',
 'A142' : 'stores',
 'A143' : 'none'}}, inplace=True)

all_data.replace({"housing": {
 'A151' : 'rent',
 'A152' : 'own',
 'A123' : 'free'}}, inplace=True)

all_data.replace({"job": {
 'A171' : 'unemploy_unskilled_non_res',
 'A172' : 'unskill_resident',
 'A173' : 'skilled_official',
 'A174' : 'management_self_highQualified_officer'}}, inplace=True)

all_data.replace({"telephone": {
 'A191' : 'none',
 'A192' : 'yes'}}, inplace=True)

all_data.replace({"foreign": {
 'A201' : 'yes',
 'A202' : 'no'}}, inplace=True)

all_data.replace({"ground_truth": {
 1 : 'accepted',
 2 : 'dennied'}}, inplace=True)


all_data.head()

Unnamed: 0,checking_acct,duration_month,credit_hist,purpose,credit_amount,savings,employ_years,install_as_percent_of_disp_income,married_and_sex,other_debtors,...,property,age,other_installments,housing,existing_credits,job,num_liable,telephone,foreign,ground_truth
0,lt_0,6,critical_accts_other_bank_credits,radio_tv,1169,unknown_or_no_account,geq_7,4,male_single,none,...,real_estate,67,none,own,2,skilled_official,1,yes,yes,accepted
1,0_to_200,48,existing_credits_paid,radio_tv,5951,lt_100,1_to_4,2,female_div_sep_married,none,...,real_estate,22,none,own,1,skilled_official,1,none,yes,dennied
2,no acct,12,critical_accts_other_bank_credits,education,2096,lt_100,4_to_7,2,male_single,none,...,real_estate,49,none,own,1,unskill_resident,2,none,yes,accepted
3,lt_0,42,existing_credits_paid,furniture_equipment,7882,lt_100,4_to_7,2,male_single,guarantor,...,building_soc_saving_life_insurance,45,none,A153,1,skilled_official,2,none,yes,accepted
4,lt_0,24,delay_in_past,car_new,4870,lt_100,1_to_4,3,male_single,none,...,unknown_none,53,none,A153,2,skilled_official,2,none,yes,dennied
