# Data Preparation

Instantiate the project's `Model`, which provides access to the tables in the dataset folder.

In [44]:
from model.model import Model

# Single Model instance; every table used below is loaded through its get_* methods.
model = Model()

In [45]:
# Load every table exposed by `model` into its own DataFrame.
# Split-independent tables first (loading of the card table is currently disabled).
account_df = model.get_accounts()
# card_df = model.get_cards()
client_df = model.get_clients()
disp_df = model.get_disps()
district_df = model.get_districts()

# Loans and transactions are requested per split, 'test' first and then
# 'train', in the same call order as before.
loan_test_df, loan_train_df = [model.get_loans(split) for split in ('test', 'train')]
trans_test_df, trans_train_df = [model.get_transactions(split) for split in ('test', 'train')]

In [46]:
# Merge all
def merge_all():
    """Build the split-independent frame of account owners.

    Reads the notebook-level ``disp_df``, ``client_df``, ``district_df`` and
    ``account_df`` frames and joins them as follows:

    1. keep only dispositions whose ``type`` is ``'OWNER'``;
    2. attach the client columns on ``client_id``;
    3. attach the district columns on ``district_id``;
    4. attach the account ``date`` column on ``account_id``.

    Returns
    -------
    pandas.DataFrame
        One row per owner disposition that matched a client and a district,
        with the account ``date`` as the last column (missing when the
        account is absent from ``account_df``).
    """
    owners = disp_df[disp_df['type'] == 'OWNER']
    df = owners.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')

    # Only the account's 'date' is wanted, so merge just that column instead
    # of assigning the result of a full merge back by row position: a keyed
    # left join keeps every row of `df` and cannot shift dates onto the wrong
    # account when an account is missing from account_df.
    # validate='many_to_one' raises if account_df holds duplicate account_ids
    # rather than silently multiplying rows.
    account_dates = account_df[['account_id', 'date']]
    df = df.merge(account_dates, on='account_id', how='left',
                  validate='many_to_one')
    return df

def merge_train(df):
    """Attach the training loans, then the training transactions, to ``df``.

    Both joins are keyed on ``account_id`` and use ``DataFrame.merge``'s
    default join type. The notebook-level ``loan_train_df`` and
    ``trans_train_df`` frames supply the right-hand sides.

    Returns the merged frame; ``df`` itself is not modified.
    """
    with_loans = df.merge(loan_train_df, on='account_id')
    with_transactions = with_loans.merge(trans_train_df, on='account_id')
    return with_transactions

def merge_test(df):
    """Attach the test loans, then the test transactions, to ``df``.

    Both joins are keyed on ``account_id`` and use ``DataFrame.merge``'s
    default join type. The notebook-level ``loan_test_df`` and
    ``trans_test_df`` frames supply the right-hand sides.

    Returns the merged frame; ``df`` itself is not modified.
    """
    with_loans = df.merge(loan_test_df, on='account_id')
    with_transactions = with_loans.merge(trans_test_df, on='account_id')
    return with_transactions

# Build the shared owner/client/district frame once, then attach the
# split-specific loans and transactions to it.
df = merge_all()
train_df = merge_train(df)
test_df = merge_test(df)
# Preview the merged training frame.
train_df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302598,1993-04-20,credit,credit in cash,8897.0,8897.0
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302582,1993-04-20,credit,credit in cash,400.0,9297.0
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3526454,1993-04-30,credit,interest credited,13.6,9310.6
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302588,1993-05-03,credit,credit in cash,25724.0,35034.6
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302586,1993-05-15,credit,credit in cash,25060.0,60094.6


In [47]:
# from sklearn.preprocessing import MinMaxScaler
# import numpy as np

# scaler = MinMaxScaler()
# scale_cols = ['loan_amount', 'duration', 'balance',
#        'payments', 'no. of inhabitants', 'no. of municipalities with inhabitants < 499 ',
#        'no. of municipalities with inhabitants 500-1999',
#        'no. of municipalities with inhabitants 2000-9999 ',
#        'no. of municipalities with inhabitants >10000 ', 'no. of cities ',
#        'ratio of urban inhabitants ', 'average salary ',
#        'unemploymant rate \'95 ', 'unemploymant rate \'96 ',
#        'no. of enterpreneurs per 1000 inhabitants ',
#        'no. of commited crimes \'95 ', 'no. of commited crimes \'96 ']

# train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
# test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
# train_df.head()


In [48]:
import numpy as np
from agg import *

def aggregate(df):
    """Collapse the per-transaction rows of ``df`` into one row per group.

    Rows are grouped by the loan / client / district columns listed in
    ``keep_cols``; the transaction columns are summarised per group and a few
    derived features are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in ``keep_cols`` plus ``trans_date``,
        ``trans_amount``, ``operation``, ``balance`` and ``trans_type``.
        ``loan_date``, ``birth_number`` and ``date`` must be datetime columns
        because they are subtracted from one another below.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of ``keep_cols``. Aggregated columns
        are named ``<column>_<aggregation>``. Added features:

        * ``last_balance_l`` / ``max_balance_l`` - last / maximum balance
          divided by ``loan_amount`` (infinite ratios are replaced by 0);
        * ``owner_age_at`` - whole years between ``birth_number`` and
          ``loan_date``;
        * ``account_age`` - days between ``date`` and ``loan_date``, divided
          by 30.
    """
    keep_cols = ['loan_id', 'account_id', 'loan_date', 'loan_amount',
                 'duration', 'payments', 'status',
                 'birth_number',  # 'disp_type_count', # TODO: change to birth_date
                 'district_id', 'gender',  # 'client_age',
                 'no. of inhabitants',
                 'no. of municipalities with inhabitants < 499 ',
                 'no. of municipalities with inhabitants 500-1999',
                 'no. of municipalities with inhabitants 2000-9999 ',
                 'no. of municipalities with inhabitants >10000 ',
                 'no. of cities ', 'ratio of urban inhabitants ',
                 'average salary ', 'unemploymant rate \'95 ',
                 'unemploymant rate \'96 ',
                 'no. of enterpreneurs per 1000 inhabitants ',
                 'no. of commited crimes \'95 ',
                 'no. of commited crimes \'96 ', 'date']

    # TODO: add more aggregations
    # NOTE(review): 'last' takes the final row of each group in the frame's
    # current row order - this assumes the transactions are already sorted
    # chronologically; confirm against the loader.
    # The count_*/mean_*/std_* callables come from the star import of `agg`.
    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        'trans_date': ['max', 'min'],
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'operation': ['count',
                      count_credit_op, count_collection_op, count_withdrawal_op,
                      count_remittance_op, count_ccw_op, count_interest_op,
                      mean_credit_op, mean_collection_op, mean_withdrawal_op,
                      mean_remittance_op, mean_ccw_op, mean_interest_op,
                      std_credit_op, std_collection_op, std_withdrawal_op,
                      std_remittance_op, std_ccw_op, std_interest_op],
        'balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_type': [count_withdrawal, count_credit, mean_withdrawal,
                       mean_credit, std_withdrawal, std_credit]
    })

    # Flatten the (column, aggregation) MultiIndex; grouping keys have an
    # empty second level and keep their plain name.
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    # Balance-to-loan ratios. A zero loan amount yields +inf or -inf depending
    # on the sign of the balance; both are mapped to 0 (previously only +inf).
    for ratio_col, balance_col in (('last_balance_l', 'balance_last'),
                                   ('max_balance_l', 'balance_max')):
        ratio = df[balance_col] / df['loan_amount']
        df[ratio_col] = ratio.replace([np.inf, -np.inf], 0)

    # Owner age in whole years at loan time. Casting a timedelta column to
    # '<m8[Y]' is rejected by pandas >= 2.0, so floor-divide the day count by
    # 365.2425, the year length numpy's 'Y' unit uses.
    days_per_year = 365.2425
    age_in_days = (df['loan_date'] - df['birth_number']).dt.days
    df['owner_age_at'] = (age_in_days // days_per_year).astype(int)

    # Account age at loan time, in 30-day units.
    df['account_age'] = ((df['loan_date'] - df['date']).dt.days) / 30

    return df


# Aggregate both splits. This rebinds train_df/test_df to the aggregated
# frames, so the cell is not idempotent: re-run the merge cell before
# running it again.
train_df = aggregate(train_df)
test_df = aggregate(test_df)
train_df.head()


Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit,last_balance_l,max_balance_l,owner_age_at,account_age
0,4959,2,1994-01-05,80952,24,3373,1,1945-02-04,1,1,...,32,22,0.592593,0.407407,0.491352,0.491352,0.344095,0.834193,48,10.433333
1,4961,19,1996-04-29,30276,12,2523,-1,1939-04-23,21,0,...,34,46,0.425,0.575,0.494343,0.494343,0.523649,1.920911,57,12.933333
2,4973,67,1996-05-02,165960,24,6915,1,1944-06-13,16,0,...,88,37,0.704,0.296,0.456491,0.456491,0.142828,0.645153,51,18.7
3,4996,132,1996-11-06,88440,12,7370,1,1945-07-03,40,0,...,15,16,0.483871,0.516129,0.49974,0.49974,0.893347,1.167334,51,5.966667
4,5002,173,1994-05-31,104808,12,8734,1,1939-11-30,66,1,...,18,12,0.6,0.4,0.489898,0.489898,0.267302,0.552108,54,6.2


In [49]:
# scaler = MinMaxScaler()
# scale_cols = [
#        # 'operation_count','operation_count_credit_op', 'operation_count_collection_op',
#        # 'operation_count_withdrawal_op', 'operation_count_remittance_op',
#        # 'operation_count_ccw_op', 'operation_count_interest_op',
#        # 'operation_mean_credit_op', 'operation_mean_collection_op',
#        # 'operation_mean_withdrawal_op', 'operation_mean_remittance_op',
#        # 'operation_mean_ccw_op', 'operation_mean_interest_op',
#        'trans_type_count_withdrawal', 'trans_type_count_credit',
#        'trans_type_mean_withdrawal', 'trans_type_mean_credit',]

# train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
# test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
# train_df.head()

In [50]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

def encode_df(df):
    """Return a copy of ``df`` with object and datetime columns label-encoded.

    Every column whose dtype is ``object`` or any datetime64 flavour is
    replaced by the integer codes produced by ``LabelEncoder.fit_transform``.
    All other columns are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; the encoding is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.

    Notes
    -----
    NOTE(review): the encoder is re-fitted for each column on each call, so
    the codes produced for two different frames (e.g. train and test) are not
    guaranteed to be comparable - confirm this is acceptable downstream.
    """
    encoded = df.copy()
    le = LabelEncoder()
    for col, col_type in encoded.dtypes.items():
        # Use dtype predicates instead of comparing against the literal
        # 'datetime64[ns]' so other datetime resolutions and tz-aware columns
        # are encoded too.
        needs_encoding = (pd.api.types.is_object_dtype(col_type)
                          or pd.api.types.is_datetime64_any_dtype(col_type))
        if needs_encoding:
            encoded[col] = le.fit_transform(encoded[col])
    return encoded

# Encode each split with its own encode_df call and rebind the names to the
# encoded frames.
train_df = encode_df(train_df)
test_df = encode_df(test_df)


In [51]:
# Inspect the 'average salary ' column (the column name ends with a space).
train_df['average salary ']

0      12541
1       9104
2       8427
3       9317
4       8512
       ...  
323     8754
324     9045
325    12541
326     8814
327     8110
Name: average salary , Length: 328, dtype: int64

In [52]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
import numpy as np

# Map every feature column to an approximately normal distribution.
# Alternative kept for reference (uniform output):
# scaler = QuantileTransformer(n_quantiles=100, random_state=0)
scaler = QuantileTransformer(n_quantiles=100, random_state=1, output_distribution='normal')

# Scale everything except the loan identifier and the target.
cols = [col for col in train_df.columns if col not in ('loan_id', 'status')]

# Learn the quantiles on the training split only, then apply that same
# mapping to the test split. Re-fitting on the test split would put the two
# frames on different scales and use test-set statistics in preprocessing.
# NOTE(review): columns that encode_df label-encoded separately per frame are
# not on a shared scale before this step - confirm they should be scaled with
# the train-fitted transformer.
train_df[cols] = scaler.fit_transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

# Keep the target as a plain integer label.
train_df['status'] = train_df['status'].astype(int)
test_df['status'] = test_df['status'].astype(int)

train_df.head()


Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit,last_balance_l,max_balance_l,owner_age_at,account_age
0,4959,-5.199338,-1.543098,-0.430727,-0.53022,-0.216904,1,-0.952023,-5.199338,5.199338,...,-0.229884,-0.403108,0.346951,-0.346951,0.063341,0.063341,0.11165,0.236579,0.816627,-0.269066
1,4961,-3.03425,0.604585,-1.335178,-5.199338,-0.622082,-1,-1.619856,-0.458679,-5.199338,...,-0.152506,0.908458,-0.984721,0.984721,0.321971,0.321971,0.544529,1.112094,1.807354,0.114185
2,4973,-2.603792,0.619855,0.356532,-0.53022,1.07555,1,-1.024053,-0.604585,-5.199338,...,1.230477,0.501298,1.54304,-1.54304,-1.119968,-1.119968,-0.935819,-0.112402,1.029957,1.029957
3,4996,-2.356668,1.544916,-0.301747,-5.199338,1.367558,1,-0.902159,0.01266,-5.199338,...,-0.870846,-0.870846,-0.758895,0.758895,1.399657,1.399657,0.925573,0.51689,1.029957,-1.021681
4,5002,-2.272299,-1.006949,-0.067424,-5.199338,2.301079,1,-1.496373,0.987837,5.199338,...,-0.731217,-1.304923,0.452475,-0.452475,-0.048867,-0.048867,-0.21349,-0.265617,1.399657,-0.979545


In [53]:
# Re-inspect 'average salary ' after the quantile transform has been applied.
train_df['average salary ']

0      5.199338
1      0.165327
2     -0.987837
3      0.244419
4     -0.781781
         ...   
323   -0.403108
324    0.088734
325    5.199338
326   -0.321971
327   -5.199338
Name: average salary , Length: 328, dtype: float64

In [54]:
# Write the prepared splits as CSV files to the current working directory,
# omitting the DataFrame index.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)