# Data Preparation

Load the `Model` from the dataset folder.

In [12]:
from model.model import Model

# Single Model instance; its get_* accessors are called in the loading cell
# below to obtain each table.
model = Model()

In [13]:
# Load Dataframes
# Tables shared by the train and test pipelines.
account_df = model.get_accounts()
# card_df = model.get_cards()
client_df = model.get_clients()
disp_df = model.get_disps()
district_df = model.get_districts()

# Split-specific tables; each pair is fetched in (test, train) order.
loan_test_df, loan_train_df = [
    model.get_loans(split) for split in ('test', 'train')
]
trans_test_df, trans_train_df = [
    model.get_transactions(split) for split in ('test', 'train')
]

In [17]:
# Merge all
def merge_all():
    """Build the base frame shared by the train and test pipelines.

    Reads the notebook-level ``disp_df``, ``client_df``, ``district_df`` and
    ``account_df`` frames. Only dispositions whose ``type`` is ``'OWNER'`` are
    kept; client columns are joined on ``client_id``, district columns on
    ``district_id``, and the account's ``date`` is attached by ``account_id``.

    Returns
    -------
    pandas.DataFrame
        One row per owner disposition that has a matching client and district.
        ``date`` is NaN for accounts that do not appear in ``account_df``.
    """
    owners = disp_df[disp_df['type'] == 'OWNER']
    df = owners.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')

    # Look the date up by account_id rather than assigning the column of a
    # second merge: a merge result is re-indexed from 0, so a plain column
    # assignment pairs rows by position and shifts every date as soon as one
    # account is missing from (or duplicated in) account_df.
    account_dates = (
        account_df.drop_duplicates('account_id')
        .set_index('account_id')['date']
    )
    df['date'] = df['account_id'].map(account_dates)
    return df

def merge_train(df):
    """Attach the training loans, then the training transactions, to ``df``.

    Both joins are default (inner) merges on ``account_id`` against the
    notebook-level ``loan_train_df`` and ``trans_train_df`` frames.
    """
    with_loans = df.merge(loan_train_df, on='account_id')
    return with_loans.merge(trans_train_df, on='account_id')

def merge_test(df):
    """Attach the test loans, then the test transactions, to ``df``.

    Both joins are default (inner) merges on ``account_id`` against the
    notebook-level ``loan_test_df`` and ``trans_test_df`` frames.
    """
    with_loans = df.merge(loan_test_df, on='account_id')
    return with_loans.merge(trans_test_df, on='account_id')

# Training frame: the shared owner/client/district merge, extended with the
# training loans and transactions (one row per transaction).
df = merge_train(merge_all())
df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302598,1993-04-20,credit,credit in cash,8897.0,8897.0
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302582,1993-04-20,credit,credit in cash,400.0,9297.0
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3526454,1993-04-30,credit,interest credited,13.6,9310.6
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302588,1993-05-03,credit,credit in cash,25724.0,35034.6
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302586,1993-05-15,credit,credit in cash,25060.0,60094.6


In [18]:
import numpy as np

def aggregate(df):
    """Collapse the per-transaction frame into one row per group of key columns.

    Rows are grouped on every column listed in ``keep_cols`` (loan, client,
    district and ``date`` columns) and the transaction columns are summarised:

    * ``trans_amount`` and ``balance``: mean, min, max, std and last value.
    * ``trans_type``: count, share and spread of ``'withdrawal'`` and
      ``'credit'`` rows.

    The two-level column index produced by ``agg`` is flattened to
    ``<column>_<statistic>`` (e.g. ``balance_last``,
    ``trans_type_count_credit``); grouping columns keep their original names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``keep_cols`` plus ``trans_amount``,
        ``balance`` and ``trans_type``. If ``trans_date`` is present the rows
        are ordered by it first so that the ``last`` statistics refer to the
        latest transaction.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of ``keep_cols`` values, including
        combinations in which a key (e.g. ``status``) is missing.
    """
    # NOTE: the names of these helpers become the suffixes of the output
    # columns (trans_type_<name>), so they must not be renamed.
    def count_withdrawal(trans_type):
        return (trans_type == 'withdrawal').sum()
    def count_credit(trans_type):
        return (trans_type == 'credit').sum()
    # Mean of the 0/1 indicator, i.e. the share of the group's transactions of
    # that type -- not the mean transaction amount.
    def mean_withdrawal(trans_type):
        return np.mean(trans_type == 'withdrawal')
    def mean_credit(trans_type):
        return np.mean(trans_type == 'credit')
    # Standard deviation of the same indicator with NumPy's default ddof=0,
    # whereas the pandas 'std' used for amounts/balances defaults to ddof=1.
    def std_withdrawal(trans_type):
        return np.std(trans_type == 'withdrawal')
    def std_credit(trans_type):
        return np.std(trans_type == 'credit')

    keep_cols = [
        'loan_id', 'account_id', 'loan_date', 'loan_amount',
        'duration', 'payments', 'status',
        'birth_number',  # 'disp_type_count',  # TODO: change to birth_date
        'district_id', 'gender',  # 'client_age',
        'no. of inhabitants',
        'no. of municipalities with inhabitants < 499 ',
        'no. of municipalities with inhabitants 500-1999',
        'no. of municipalities with inhabitants 2000-9999 ',
        'no. of municipalities with inhabitants >10000 ',
        'no. of cities ', 'ratio of urban inhabitants ',
        'average salary ', 'unemploymant rate \'95 ',
        'unemploymant rate \'96 ',
        'no. of enterpreneurs per 1000 inhabitants ',
        'no. of commited crimes \'95 ',
        'no. of commited crimes \'96 ', 'date',
    ]

    # 'last' picks the final row of each group, so make the order explicit.
    # A stable sort keeps the incoming order of same-day transactions.
    if 'trans_date' in df.columns:
        df = df.sort_values('trans_date', kind='mergesort')

    # dropna=False: by default groupby discards every row whose key columns
    # contain NaN, which would silently remove loans with a missing key value
    # (for instance an unlabeled `status`).
    grouped = df.groupby(keep_cols, as_index=False, group_keys=False, dropna=False)
    df = grouped.agg({
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_type': [count_withdrawal, count_credit,
                       mean_withdrawal, mean_credit,
                       std_withdrawal, std_credit],
    })

    # Flatten ('balance', 'last') -> 'balance_last'; key columns have an empty
    # second level and keep their plain name.
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    return df

# NOTE: this rebinds `df` from the per-transaction frame to the aggregated one,
# so the cell is not idempotent -- the aggregated frame no longer has the raw
# trans_amount / balance / trans_type columns. Re-run the merge cell before
# running this cell again.
df = aggregate(df)
df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,balance_min,balance_max,balance_std,balance_last,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit
0,4959,2,1994-01-05,80952,24,3373,1,1945-02-04,1,1,...,1100.0,67529.6,12061.705682,27855.2,29,22,0.537037,0.407407,0.498626,0.491352
1,4961,19,1996-04-29,30276,12,2523,-1,1939-04-23,21,0,...,715.0,58157.5,15039.248405,15854.0,32,46,0.4,0.575,0.489898,0.494343
2,4973,67,1996-05-02,165960,24,6915,1,1944-06-13,16,0,...,700.0,107069.6,20955.646998,23703.8,84,37,0.672,0.296,0.469485,0.456491
3,4996,132,1996-11-06,88440,12,7370,1,1945-07-03,40,0,...,200.0,103239.0,21638.25887,79007.6,14,16,0.451613,0.516129,0.497653,0.49974
4,5002,173,1994-05-31,104808,12,8734,1,1939-11-30,66,1,...,500.0,57865.3,11517.175248,28015.4,18,12,0.6,0.4,0.489898,0.489898


In [21]:
# (rows, columns) of the aggregated frame.
df.shape

(328, 40)