# Data Preparation

Load the `Model` helper, which reads the tables from the dataset folder.

In [None]:
from model.model import Model

# Data-access object; its get_* methods return the tables used in this notebook.
model = Model()

In [None]:
# Load Dataframes
# Tables that are shared by the train and test splits.
account_df = model.get_accounts()
client_df = model.get_clients()
# Card tables are currently not used:
# card_test_df = model.get_cards('test')
# card_train_df = model.get_cards('train')
disp_df = model.get_disps()
district_df = model.get_districts()

# Split-specific tables, fetched in the order test -> train.
loan_test_df, loan_train_df = (
    model.get_loans(split) for split in ('test', 'train')
)
trans_test_df, trans_train_df = (
    model.get_transactions(split) for split in ('test', 'train')
)

In [None]:
def merge_all():
    """Build the split-independent base table.

    Starts from the module-level ``disp_df``, joins ``client_df`` on
    ``client_id`` and ``district_df`` on ``district_id``, then attaches the
    ``date`` column of ``account_df`` by ``account_id``.

    Returns:
        A new DataFrame with ``date`` as its last column. Rows whose
        ``account_id`` has no entry in ``account_df`` are kept, with a
        missing ``date``.
    """
    df = disp_df.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')

    # Join the date by key. Assigning the column of a separately merged frame
    # aligns on the positional row index, which silently pairs dates with the
    # wrong accounts whenever that merge drops or reorders rows.
    # Keeping one row per account_id guarantees the left join cannot
    # multiply the rows of df.
    account_dates = account_df[['account_id', 'date']].drop_duplicates(subset='account_id')
    df = df.merge(account_dates, on='account_id', how='left')
    return df

def merge_train(df):
    """Join the training loans and training transactions onto ``df``.

    Both joins are inner merges on ``account_id`` against the module-level
    ``loan_train_df`` and ``trans_train_df``; ``df`` itself is not modified.

    Args:
        df: Base table containing an ``account_id`` column.

    Returns:
        A new DataFrame holding the merged rows.
    """
    with_loans = df.merge(loan_train_df, on='account_id')
    return with_loans.merge(trans_train_df, on='account_id')

def merge_test(df):
    """Join the test loans and test transactions onto ``df``.

    Both joins are inner merges on ``account_id`` against the module-level
    ``loan_test_df`` and ``trans_test_df``; ``df`` itself is not modified.

    Args:
        df: Base table containing an ``account_id`` column.

    Returns:
        A new DataFrame holding the merged rows.
    """
    with_loans = df.merge(loan_test_df, on='account_id')
    return with_loans.merge(trans_test_df, on='account_id')

# Build the shared base table once, then derive the train and test tables
# from it. (The merge sequence used to be pasted twice in this cell, which
# only repeated the same work.)
df = merge_all()
train_df = merge_train(df)
test_df = merge_test(df)
train_df.head()

In [None]:
# Sanity check: (rows, columns) of the merged test table.
test_df.shape

In [None]:
import numpy as np
from agg import *

def agg_features(df):
    """Collapse per-transaction rows into one feature row per group.

    Rows are grouped by the loan, client, district and account-date columns
    listed in ``agg_columns``; the transaction columns (``trans_date``,
    ``trans_amount``, ``operation``, ``balance``, ``trans_type``) are
    aggregated, and a few ratio / age features are derived from the result.

    The custom aggregators (``days``, ``count_*``, ``mean_*``, ``std_*``,
    ``bal_range``, ``bal_min``) are not defined in this notebook; presumably
    they come from the ``agg`` star import.

    Args:
        df: Merged table with one row per transaction. ``loan_date``,
            ``birth_number``, ``date`` and ``trans_date`` must be datetime
            columns, because they are subtracted from each other.

    Returns:
        A new DataFrame with flat column names of the form
        ``<column>_<aggregation>`` plus the derived feature columns.
    """
    agg_columns = ['loan_id', 'account_id', 'loan_date', 'loan_amount',
                 'duration', 'payments', 'status',
                 'birth_number',
                 'district_id', 'gender',
                 'no. of inhabitants',
                 'small_munis_rate', 'medium_munis_rate',
                 'large_munis_rate', 'larger_munis_rate',
                 'inhabitant_rate', 'no. of cities ', 
                 'ratio of urban inhabitants ',
                 'average salary ', 'unemploymant rate \'95 ',
                 'unemploymant rate \'96 ',
                 'no. of enterpreneurs per 1000 inhabitants ',
                 'crime_rate \'95',
                 'crime_rate \'96', 'date']

    # NOTE(review): 'last' depends on the row order inside each group;
    # presumably the transactions arrive sorted by date - confirm.
    df = df.groupby(agg_columns, as_index=False, group_keys=False).agg({
        'trans_date': ['max', 'min', days],
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'operation': ['count', 
                    count_credit_op, count_collection_op, count_withdrawal_op, count_remittance_op, count_ccw_op, count_interest_op,
                    mean_credit_op, mean_collection_op, mean_withdrawal_op, mean_remittance_op, mean_ccw_op, mean_interest_op,
                    std_credit_op, std_collection_op, std_withdrawal_op, std_remittance_op, std_ccw_op, std_interest_op],
        'balance': ['mean', 'min', 'max', 'std', 'last', bal_range, bal_min],
        'trans_type': [count_withdrawal, count_credit, mean_withdrawal, mean_credit, std_withdrawal, std_credit]
    })

    # Flatten the (column, aggregation) MultiIndex; the grouping columns have
    # an empty second level and keep their plain name.
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    df['days_last_trans'] = (df['loan_date'] - df['trans_date_max']).dt.days

    # Balance-to-loan ratios. A zero loan amount yields +inf or -inf
    # (depending on the sign of the balance); both are reset to 0.
    infinities = [np.inf, -np.inf]
    df['last_balance_l'] = (df['balance_last'] / df['loan_amount']).replace(infinities, 0)
    df['max_balance_l'] = (df['balance_max'] / df['loan_amount']).replace(infinities, 0)

    # Dividing by 30 approximates a day count as months.
    df['age_months'] = df['trans_date_days'] / 30
    df['bal_per_month'] = df['balance_bal_range'] / df['age_months']
    df['trans_per_month'] = df['operation_count'] / df['age_months']

    # Age in whole years. Casting timedelta64[ns] to '<m8[Y]' is rejected by
    # pandas >= 2.0, so floor-divide by one numpy year (365.2425 days =
    # 31 556 952 s), which matches the unit numpy uses for 'Y'.
    one_year = np.timedelta64(31556952, 's')
    df['owner_age_at'] = (df['loan_date'] - df['birth_number']) // one_year
    df['owner_age_at'] = df['owner_age_at'].astype(int)
    df['account_age'] = ((df['loan_date'] - df['date']).dt.days) / 30

    return df


# Replace the per-transaction tables by their aggregated feature tables
# (train first, then test) and preview the training features.
train_df, test_df = agg_features(train_df), agg_features(test_df)
train_df.head()


In [None]:
# (rows, columns) of the training table after aggregation.
train_df.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def encode_df(df):
    """Label-encode every object, string or datetime column of ``df``.

    Each qualifying column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``. The encoder is refitted for every column
    and every call, so the codes depend only on the values present in this
    frame and are not comparable between frames encoded separately.

    Args:
        df: Frame to encode. It is modified in place.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        # Use dtype predicates instead of comparing against the literal
        # 'object' / 'datetime64[ns]' strings, so that pandas string dtypes
        # and datetimes of any resolution or timezone are encoded as well.
        is_text = (pd.api.types.is_object_dtype(col_type)
                   or isinstance(col_type, pd.StringDtype))
        if is_text or pd.api.types.is_datetime64_any_dtype(col_type):
            df[col] = le.fit_transform(df[col])
    return df

# Encode the training table first, then the test table.
# NOTE(review): encode_df refits its encoder on every call, so the integer
# codes of the two tables are derived independently of each other.
train_df, test_df = (encode_df(frame) for frame in (train_df, test_df))

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
import numpy as np

# Quantile-transform every feature column with a normal output distribution.
scaler = QuantileTransformer(n_quantiles=100, random_state=1, output_distribution='normal')

# The identifier and the target are left untouched.
cols = [col for col in train_df.columns if col not in ('loan_id', 'status')]

# Fit the quantile mapping on the training data only and reuse it for the
# test data. Refitting on the test set would give the two tables different
# mappings, so equal raw values would no longer produce equal features.
train_df[cols] = scaler.fit_transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

train_df['status'] = train_df['status'].astype(int)
test_df['status'] = test_df['status'].astype(int)

train_df.head()

In [None]:
# Persist both prepared tables as CSV files in the working directory,
# without writing the row index as a column.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)