In [147]:
import pandas as pd
# Show every column when displaying wide frames.
# The fully qualified key is required: option names are matched as patterns, and a
# bare 'max_columns' also matches 'styler.render.max_columns' in pandas >= 1.4,
# which makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)

# Import data
def read_csv(file, dtype=None):
    """Read one semicolon-separated CSV file into a DataFrame.

    Parameters
    ----------
    file : str, path object or file-like
        Forwarded unchanged to ``pandas.read_csv``.
    dtype : dict or type, optional
        Column dtype overrides forwarded to ``pandas.read_csv``. ``None``
        (the default) leaves every column to pandas' dtype inference.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # ``None`` replaces the former mutable ``{}`` default; pandas treats both
    # as "no overrides", so existing calls behave the same.
    return pd.read_csv(file, sep=";", dtype=dtype)

# Tables that need no dtype override, read in the order listed.
account_df, card_df, client_df, disp_df, district_df, loan_df = map(
    read_csv,
    (
        "data/account.csv",
        "data/card_train.csv",
        "data/client.csv",
        "data/disp.csv",
        "data/district.csv",
        "data/loan_train.csv",
    ),
)

# 'bank' is forced to a string column instead of being left to dtype inference.
trans_df = read_csv("data/trans_train.csv", dtype={'bank': 'str'})

# Clean column names (some columns come with extra whitespace)
def rename(df):
    """Return a new frame whose column labels have surrounding whitespace removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Result of ``df.rename`` with every string label stripped. Non-string
        labels (e.g. integer column names) are passed through unchanged
        instead of raising ``AttributeError``.
    """
    return df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )
# Only the district table is passed through the column-name cleanup.
district_df = district_df.pipe(rename)

# Preview the first five transactions.
trans_df.head(5)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,


In [148]:
# Join data

def join(df1, df2, key1, key2, suff, t="inner"):
    """Merge ``df1`` with ``df2`` on key columns that may be named differently.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right frames of the merge.
    key1, key2 : label or list of labels
        Key column(s) in ``df1`` and ``df2`` respectively.
    suff : sequence of two str
        Suffixes appended to overlapping non-key column names
        (left suffix first, right suffix second).
    t : str, default "inner"
        Merge type, forwarded as ``how`` to ``DataFrame.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    merge_options = {
        "left_on": key1,
        "right_on": key2,
        "how": t,
        "suffixes": suff,
    }
    return df1.merge(df2, **merge_options)

# Build one wide frame by chaining joins, each using join()'s default how="inner":
#   loan -> account -> disposition -> card -> transaction -> client -> district
# NOTE(review): since every step is an inner join, loans whose disposition has no
# matching card row are dropped at the card step — confirm this is intended.
# NOTE(review): the district step matches on the unsuffixed 'district_id', which
# appears to be the account's district (the client's one ends up suffixed as
# 'district_id_client') — confirm this is the district that should be attached.
join_df = (
    loan_df
    .pipe(join, account_df, 'account_id', 'account_id', ['_loan', '_account'])
    .pipe(join, disp_df, 'account_id', 'account_id', ['', '_disp'])
    .pipe(join, card_df, 'disp_id', 'disp_id', ['', '_card'])
    .pipe(join, trans_df, 'account_id', 'account_id', ['', '_trans'])
    .pipe(join, client_df, 'client_id', 'client_id', ['', '_client'])
    .pipe(join, district_df, 'district_id', 'code', ['', '_district'])
)

# `df` is a second name bound to the same object as `join_df`, not a copy.
df = join_df
join_df.columns

Index(['loan_id', 'account_id', 'date_loan', 'amount', 'duration', 'payments',
       'status', 'district_id', 'frequency', 'date_account', 'disp_id',
       'client_id', 'type', 'card_id', 'type_card', 'issued', 'trans_id',
       'date', 'type_trans', 'operation', 'amount_trans', 'balance',
       'k_symbol', 'bank', 'account', 'birth_number', 'district_id_client',
       'code', 'name', 'region', 'no. of inhabitants',
       'no. of municipalities with inhabitants < 499',
       'no. of municipalities with inhabitants 500-1999',
       'no. of municipalities with inhabitants 2000-9999',
       'no. of municipalities with inhabitants >10000', 'no. of cities',
       'ratio of urban inhabitants', 'average salary', 'unemploymant rate '95',
       'unemploymant rate '96', 'no. of enterpreneurs per 1000 inhabitants',
       'no. of commited crimes '95', 'no. of commited crimes '96'],
      dtype='object')

In [149]:
# Display the raw loan table on its own, before any joins are applied to it.
# NOTE(review): the original remark here was "This already counts as cleaning data";
# it presumably refers to the join step above — confirm what it was meant to cover.
loan_df

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1
...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1
324,5625,3189,961215,222180,60,3703,-1
325,6805,8972,961221,45024,48,938,1
326,7233,10963,961225,115812,36,3217,1


In [150]:
# Display the joined frame: one row per transaction, with the loan-, account-,
# card-, client- and district-level columns repeated on every row.
df

Unnamed: 0,loan_id,account_id,date_loan,amount,duration,payments,status,district_id,frequency,date_account,disp_id,client_id,type,card_id,type_card,issued,trans_id,date,type_trans,operation,amount_trans,balance,k_symbol,bank,account,birth_number,district_id_client,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349697,930208,credit,credit in cash,600.0,600.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
1,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349709,930212,credit,credit in cash,19588.0,20188.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
2,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349705,930212,credit,credit in cash,27078.0,47266.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
3,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,3492040,930228,credit,,119.6,47385.6,interest credited,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
4,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2350078,930310,withdrawal,withdrawal in cash,12000.0,35385.6,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,7130,10411,961211,123408,24,5142,1,39,issuance after transaction,951123,12502,12810,OWNER,1176,gold,960703,3137169,961130,withdrawal,withdrawal in cash,100.0,23239.1,payment for statement,,,641014,39,39,Most,north Bohemia,119895,17,4,3,2,4,89.9,10446,7.34,9.40,90,4947,4743
1034,7130,10411,961211,123408,24,5142,1,39,issuance after transaction,951123,12502,12810,OWNER,1176,gold,960703,3136995,961205,withdrawal,remittance to another bank,2323.0,20916.1,insurrance payment,CD,33192354.0,641014,39,39,Most,north Bohemia,119895,17,4,3,2,4,89.9,10446,7.34,9.40,90,4947,4743
1035,7130,10411,961211,123408,24,5142,1,39,issuance after transaction,951123,12502,12810,OWNER,1176,gold,960703,3136899,961207,withdrawal,remittance to another bank,3020.0,17896.1,household,ST,41324371.0,641014,39,39,Most,north Bohemia,119895,17,4,3,2,4,89.9,10446,7.34,9.40,90,4947,4743
1036,7130,10411,961211,123408,24,5142,1,39,issuance after transaction,951123,12502,12810,OWNER,1176,gold,960703,3137043,961208,withdrawal,remittance to another bank,7732.0,10164.1,,ST,47737151.0,641014,39,39,Most,north Bohemia,119895,17,4,3,2,4,89.9,10446,7.34,9.40,90,4947,4743
