# Data Preparation

Load the `Model` helper from the dataset folder.

In [None]:
from model.model import Model

# Project helper object; its get_* methods supply every frame loaded in the next cell.
model = Model()

In [None]:
# Load the frames used by the merge functions below; all of them come from `model`.
account_df = model.get_accounts()
# card_df = model.get_cards()
client_df = model.get_clients()
disp_df = model.get_disps()
district_df = model.get_districts()
# Loans and transactions are requested separately for the 'test' and 'train' splits.
loan_test_df = model.get_loans('test')
loan_train_df = model.get_loans('train')
trans_test_df = model.get_transactions('test')
trans_train_df = model.get_transactions('train')

In [None]:
# Merge all
def merge_all():
    """Build the base table shared by the train and test splits.

    Reads the notebook-level ``disp_df``, ``client_df``, ``district_df`` and
    ``account_df`` frames. Only dispositions whose ``type`` is ``'OWNER'``
    are kept; client and district columns are joined on ``client_id`` and
    ``district_id``, and the account ``date`` column is attached last.

    Returns:
        A new DataFrame with one row per owner disposition that has a
        matching client and district, plus a trailing ``date`` column.
        ``date`` is missing (NaN/NaT) for an account that does not appear
        in ``account_df``.
    """
    owners = disp_df[disp_df['type'] == 'OWNER']
    df = owners.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')
    # Attach the date by key rather than by positional index alignment:
    # assigning the column of a separately merged frame silently shifts
    # dates onto the wrong rows as soon as one account has no match.
    # how='left' keeps every row, like the original column assignment did.
    df = df.merge(account_df[['account_id', 'date']],
                  on='account_id', how='left')
    return df

def merge_train(df):
    """Join the training loans, then the training transactions, onto ``df``.

    Both joins are keyed on ``account_id`` and read the notebook-level
    ``loan_train_df`` and ``trans_train_df`` frames. The input frame is
    not modified; the merged result is returned.
    """
    with_loans = df.merge(loan_train_df, on='account_id')
    return with_loans.merge(trans_train_df, on='account_id')

def merge_test(df):
    """Join the test loans, then the test transactions, onto ``df``.

    Both joins are keyed on ``account_id`` and read the notebook-level
    ``loan_test_df`` and ``trans_test_df`` frames. The input frame is
    not modified; the merged result is returned.
    """
    with_loans = df.merge(loan_test_df, on='account_id')
    return with_loans.merge(trans_test_df, on='account_id')

# Build the shared base table once, then derive both splits from it.
df = merge_all()
train_df = merge_train(df)
test_df = merge_test(df)
# Preview the merged training rows.
train_df.head()

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# import numpy as np

# scaler = MinMaxScaler()
# scale_cols = ['loan_amount', 'duration', 'balance',
#        'payments', 'no. of inhabitants', 'no. of municipalities with inhabitants < 499 ',
#        'no. of municipalities with inhabitants 500-1999',
#        'no. of municipalities with inhabitants 2000-9999 ',
#        'no. of municipalities with inhabitants >10000 ', 'no. of cities ',
#        'ratio of urban inhabitants ', 'average salary ',
#        'unemploymant rate \'95 ', 'unemploymant rate \'96 ',
#        'no. of enterpreneurs per 1000 inhabitants ',
#        'no. of commited crimes \'95 ', 'no. of commited crimes \'96 ']

# train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
# test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
# train_df.head()


In [None]:
import numpy as np
from agg import *

def aggregate(df):
    """Collapse the per-transaction rows of ``df`` into one row per group.

    Rows are grouped on every column in ``keep_cols`` (loan, client,
    district and account-date attributes). The transaction columns are
    summarised per group, the resulting two-level column names are
    flattened, and four derived features are added.

    Args:
        df: frame containing all ``keep_cols`` columns plus ``trans_date``,
            ``trans_amount``, ``operation``, ``balance`` and ``trans_type``.
            ``loan_date``, ``birth_number`` and ``date`` must be datetime
            columns, because they are subtracted from each other below.

    Returns:
        A new DataFrame whose aggregated columns are named
        ``<column>_<aggregation>``, plus ``last_balance_l``,
        ``max_balance_l``, ``owner_age_at`` and ``account_age``.
    """
    keep_cols = ['loan_id', 'account_id', 'loan_date', 'loan_amount',
                 'duration', 'payments', 'status',
                 'birth_number',  # 'disp_type_count', # change to birth_date
                 'district_id', 'gender',  # 'client_age',
                 'no. of inhabitants',
                 'no. of municipalities with inhabitants < 499 ',
                 'no. of municipalities with inhabitants 500-1999',
                 'no. of municipalities with inhabitants 2000-9999 ',
                 'no. of municipalities with inhabitants >10000 ',
                 'no. of cities ', 'ratio of urban inhabitants ',
                 'average salary ', 'unemploymant rate \'95 ',
                 'unemploymant rate \'96 ',
                 'no. of enterpreneurs per 1000 inhabitants ',
                 'no. of commited crimes \'95 ',
                 'no. of commited crimes \'96 ', 'date']

    # TODO: add more aggregations
    # NOTE(review): the 'last' aggregations take the last row of each group
    # in the incoming row order; this is only the most recent transaction if
    # the rows arrive sorted by trans_date -- confirm against the loader.
    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        'trans_date': ['max', 'min'],
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'operation': ['count',
                        count_credit_op, count_collection_op, count_withdrawal_op, count_remittance_op, count_ccw_op, count_interest_op,
                        mean_credit_op, mean_collection_op, mean_withdrawal_op, mean_remittance_op, mean_ccw_op, mean_interest_op,
                        std_credit_op, std_collection_op, std_withdrawal_op, std_remittance_op, std_ccw_op, std_interest_op],
        'balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_type': [count_withdrawal, count_credit, mean_withdrawal, mean_credit, std_withdrawal, std_credit]
    })

    # Flatten ('balance', 'last') -> 'balance_last'; the grouping keys have an
    # empty second level and keep their plain name.
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    # Balance-to-loan ratios. A zero loan amount yields +inf or -inf
    # (depending on the balance sign); both are mapped to 0.
    infinities = [np.inf, -np.inf]
    df['last_balance_l'] = (df['balance_last'] / df['loan_amount']).replace(infinities, 0)
    df['max_balance_l'] = (df['balance_max'] / df['loan_amount']).replace(infinities, 0)

    # Owner age at loan time in whole years. Casting a timedelta Series to
    # '<m8[Y]' is rejected by pandas >= 2.0, so the same truncated value is
    # computed explicitly with numpy's year length (365.2425 days).
    seconds_per_year = 31556952
    age_seconds = (df['loan_date'] - df['birth_number']).dt.total_seconds()
    df['owner_age_at'] = np.trunc(age_seconds / seconds_per_year).astype(int)

    # Account age at loan time, expressed in 30-day units.
    df['account_age'] = ((df['loan_date'] - df['date']).dt.days) / 30

    return df


# NOTE: train_df / test_df are rebound to their aggregated form here, so this
# cell cannot be re-run on its own; re-run the merge cell first.
train_df = aggregate(train_df)
test_df = aggregate(test_df)
train_df.head()


In [None]:
# scaler = MinMaxScaler()
# scale_cols = [
#        # 'operation_count','operation_count_credit_op', 'operation_count_collection_op',
#        # 'operation_count_withdrawal_op', 'operation_count_remittance_op',
#        # 'operation_count_ccw_op', 'operation_count_interest_op',
#        # 'operation_mean_credit_op', 'operation_mean_collection_op',
#        # 'operation_mean_withdrawal_op', 'operation_mean_remittance_op',
#        # 'operation_mean_ccw_op', 'operation_mean_interest_op',
#        'trans_type_count_withdrawal', 'trans_type_count_credit',
#        'trans_type_mean_withdrawal', 'trans_type_mean_credit',]

# train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
# test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
# train_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def encode_df(df):
    """Label-encode the object and ``datetime64[ns]`` columns of ``df``.

    Each matching column is overwritten in place with the integer codes
    produced by ``LabelEncoder.fit_transform``; the encoder is refit for
    every column. The same (mutated) frame is returned.
    """
    encoder = LabelEncoder()
    targets = [name for name, dtype in df.dtypes.items()
               if dtype == 'object' or dtype == 'datetime64[ns]']
    for name in targets:
        df[name] = encoder.fit_transform(df[name])
    return df

# NOTE(review): each call fits its own encoders, so the same raw value can
# receive a different integer code in train_df and in test_df.
train_df = encode_df(train_df)
test_df = encode_df(test_df)


In [None]:
# Preview the training frame after label encoding.
train_df.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max scale every column except the loan identifier.
scaler = MinMaxScaler()

cols = [col for col in train_df.columns if col != 'loan_id']

# Fit the scaler on the training split only and reuse its min/max for the
# test split. Refitting on test would scale the two splits differently and
# leak test statistics into the preprocessing. Test values outside the
# training range therefore fall outside [0, 1].
train_df[cols] = scaler.fit_transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

# 'status' is part of `cols`, so it was scaled too; cast it back to int.
# NOTE(review): test 'status' now goes through the training mapping -- confirm
# the test split carries values for which that is meaningful.
train_df['status'] = train_df['status'].astype(int)
test_df['status'] = test_df['status'].astype(int)

train_df.head()


In [None]:
# Write both prepared splits to CSV in the working directory, without the index column.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)