# Data Preparation

Load Model from dataset folder

In [250]:
from model.model import Model

model = Model()

In [251]:
# Load Dataframes
account_df = model.get_accounts()
# card_df = model.get_cards()
client_df = model.get_clients()
disp_df = model.get_disps()
district_df = model.get_districts()
loan_test_df = model.get_loans('test')
loan_train_df = model.get_loans('train')
trans_test_df = model.get_transactions('test')
trans_train_df = model.get_transactions('train')

In [252]:
# Merge all
def merge_all():
    _disp_df = disp_df[disp_df['type'] == 'OWNER']
    df = _disp_df.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')
    df['date'] = df.merge(account_df, on='account_id')['date']
    return df

def merge_train(df):
    df = df.merge(loan_train_df, on='account_id')
    df = df.merge(trans_train_df, on='account_id')
    return df

def merge_test(df):
    df = df.merge(loan_test_df, on='account_id')
    df = df.merge(trans_test_df, on='account_id')
    return df 

df = merge_all()
train_df = merge_train(df)
test_df = merge_test(df)
train_df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302598,1993-04-20,credit,credit in cash,8897.0,8897.0
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302582,1993-04-20,credit,credit in cash,400.0,9297.0
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3526454,1993-04-30,credit,interest credited,13.6,9310.6
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302588,1993-05-03,credit,credit in cash,25724.0,35034.6
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302586,1993-05-15,credit,credit in cash,25060.0,60094.6


In [253]:
train_df.loc[train_df['unemploymant rate \'95 '] == '?', 'unemploymant rate \'95 '] = '0'
train_df.loc[train_df['no. of commited crimes \'95 '] == '?', 'no. of commited crimes \'95 '] = '0'
train_df['unemploymant rate \'95 '] = train_df['unemploymant rate \'95 '].astype('float64')
train_df['no. of commited crimes \'95 '] = train_df['no. of commited crimes \'95 '].astype('float64')
test_df.loc[test_df['unemploymant rate \'95 '] == '?', 'unemploymant rate \'95 '] = '0'
test_df.loc[test_df['no. of commited crimes \'95 '] == '?', 'no. of commited crimes \'95 '] = '0'
test_df['unemploymant rate \'95 '] = test_df['unemploymant rate \'95 '].astype('float64')
test_df['no. of commited crimes \'95 '] = test_df['no. of commited crimes \'95 '].astype('float64')

In [254]:
import numpy as np

train_df.loc[train_df['unemploymant rate \'95 '] == 0, 'unemploymant rate \'95 '] = np.average(train_df.loc[train_df['unemploymant rate \'95 '] != 0, 'unemploymant rate \'95 '])
train_df.loc[train_df['no. of commited crimes \'95 '] == 0, 'no. of commited crimes \'95 '] = np.average(train_df.loc[train_df['no. of commited crimes \'95 '] != 0, 'unemploymant rate \'95 '])
test_df.loc[test_df['unemploymant rate \'95 '] == 0, 'unemploymant rate \'95 '] = np.average(test_df.loc[test_df['unemploymant rate \'95 '] != 0, 'unemploymant rate \'95 '])
test_df.loc[test_df['no. of commited crimes \'95 '] == 0, 'no. of commited crimes \'95 '] = np.average(test_df.loc[test_df['no. of commited crimes \'95 '] != 0, 'unemploymant rate \'95 '])

In [255]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scale_cols = ['loan_amount', 'duration', 'balance',
       'payments', 'no. of inhabitants', 'no. of municipalities with inhabitants < 499 ',
       'no. of municipalities with inhabitants 500-1999',
       'no. of municipalities with inhabitants 2000-9999 ',
       'no. of municipalities with inhabitants >10000 ', 'no. of cities ',
       'ratio of urban inhabitants ', 'average salary ',
       'unemploymant rate \'95 ', 'unemploymant rate \'96 ',
       'no. of enterpreneurs per 1000 inhabitants ',
       'no. of commited crimes \'95 ', 'no. of commited crimes \'96 ']

train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
train_df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302598,1993-04-20,credit,credit in cash,8897.0,0.06244
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302582,1993-04-20,credit,credit in cash,400.0,0.064467
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3526454,1993-04-30,credit,interest credited,13.6,0.064536
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302588,1993-05-03,credit,credit in cash,25724.0,0.194893
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302586,1993-05-15,credit,credit in cash,25060.0,0.321886


In [256]:
import numpy as np
from agg import *

def aggregate(df):
    keep_cols = ['loan_id', 'account_id', 'loan_date', 'loan_amount',
                 'duration', 'payments', 'status',
                 'birth_number',  # 'disp_type_count', # mudar para birth_date
                 'district_id', 'gender',  # 'client_age',
                 'no. of inhabitants',
                 'no. of municipalities with inhabitants < 499 ',
                 'no. of municipalities with inhabitants 500-1999',
                 'no. of municipalities with inhabitants 2000-9999 ',
                 'no. of municipalities with inhabitants >10000 ',
                 'no. of cities ', 'ratio of urban inhabitants ',
                 'average salary ', 'unemploymant rate \'95 ',
                 'unemploymant rate \'96 ',
                 'no. of enterpreneurs per 1000 inhabitants ',
                 'no. of commited crimes \'95 ',
                 'no. of commited crimes \'96 ', 'date']

    # TODO: add more aggregations
    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        'trans_date': ['max', 'min'],
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'operation': ['count', 
                        count_credit_op, count_collection_op, count_withdrawal_op, count_remittance_op, count_ccw_op, count_interest_op,
                        mean_credit_op, mean_collection_op, mean_withdrawal_op, mean_remittance_op, mean_ccw_op, mean_interest_op,
                        std_credit_op, std_collection_op, std_withdrawal_op, std_remittance_op, std_ccw_op, std_interest_op],
        'balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_type': [count_withdrawal, count_credit, mean_withdrawal, mean_credit, std_withdrawal, std_credit]
    })

    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    df['last_balance_l'] = df['balance_last'] / df['loan_amount']
    df.loc[df['last_balance_l'] == np.inf, 'last_balance_l'] = 0

    df['max_balance_l'] = df['balance_max'] / df['loan_amount']
    df.loc[df['max_balance_l'] == np.inf, 'max_balance_l'] = 0

    df['owner_age_at'] = (df['loan_date'] - df['birth_number']).astype('<m8[Y]')
    df['owner_age_at'] = df['owner_age_at'].astype(int)

    df['account_age'] = ((df['loan_date'] - df['date']).dt.days) / 30

    return df


train_df = aggregate(train_df)
test_df = aggregate(test_df)
train_df.head()


Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit,last_balance_l,max_balance_l,owner_age_at,account_age
0,4959,2,1994-01-05,0.142398,0.25,0.325934,1,1945-02-04,1,1,...,32,22,0.592593,0.407407,0.491352,0.491352,1.113161,2.525063,48,10.433333
1,4961,19,1996-04-29,0.047413,0.0,0.235219,-1,1939-04-23,21,0,...,34,46,0.425,0.575,0.494343,0.494343,2.060494,6.581886,57,12.933333
2,4973,67,1996-05-02,0.301732,0.25,0.703949,1,1944-06-13,16,0,...,88,37,0.704,0.296,0.456491,0.456491,0.455617,1.855732,51,18.7
3,4996,132,1996-11-06,0.156433,0.0,0.752508,1,1945-07-03,40,0,...,15,16,0.483871,0.516129,0.49974,0.49974,2.670338,3.455298,51,5.966667
4,5002,173,1994-05-31,0.187112,0.0,0.898079,1,1939-11-30,66,1,...,18,12,0.6,0.4,0.489898,0.489898,0.851487,1.659909,54,6.2


In [257]:
# scaler = MinMaxScaler()
# scale_cols = [
#        # 'operation_count','operation_count_credit_op', 'operation_count_collection_op',
#        # 'operation_count_withdrawal_op', 'operation_count_remittance_op',
#        # 'operation_count_ccw_op', 'operation_count_interest_op',
#        # 'operation_mean_credit_op', 'operation_mean_collection_op',
#        # 'operation_mean_withdrawal_op', 'operation_mean_remittance_op',
#        # 'operation_mean_ccw_op', 'operation_mean_interest_op',
#        'trans_type_count_withdrawal', 'trans_type_count_credit',
#        'trans_type_mean_withdrawal', 'trans_type_mean_credit',]

# train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
# test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
# train_df.head()

In [258]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        if col_type == 'object' or col_type == 'datetime64[ns]':
            df[col] = le.fit_transform(df[col])
    return df

train_df = encode_df(train_df)
test_df = encode_df(test_df)

In [259]:
train_df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit,last_balance_l,max_balance_l,owner_age_at,account_age
0,4959,2,19,0.142398,0.25,0.325934,1,52,1,1,...,32,22,0.592593,0.407407,0.491352,0.491352,1.113161,2.525063,48,10.433333
1,4961,19,209,0.047413,0.0,0.235219,-1,17,21,0,...,34,46,0.425,0.575,0.494343,0.494343,2.060494,6.581886,57,12.933333
2,4973,67,210,0.301732,0.25,0.703949,1,48,16,0,...,88,37,0.704,0.296,0.456491,0.456491,0.455617,1.855732,51,18.7
3,4996,132,265,0.156433,0.0,0.752508,1,56,40,0,...,15,16,0.483871,0.516129,0.49974,0.49974,2.670338,3.455298,51,5.966667
4,5002,173,48,0.187112,0.0,0.898079,1,21,66,1,...,18,12,0.6,0.4,0.489898,0.489898,0.851487,1.659909,54,6.2


In [260]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier

# clf = SVC(probability=True)
clf = LogisticRegression(max_iter=10000)
# clf = KNeighborsClassifier()
# clf = MLPClassifier(random_state=1, activation='relu', max_iter=10000, solver='sgd')
x = train_df.drop(['status', 'loan_id', 'account_id', 'district_id'], axis=1)
y = train_df['status']
clf.fit(x, y)

LogisticRegression(max_iter=10000)

In [269]:
import pandas as pd
df = pd.DataFrame(data={'Id': test_df.loan_id,
                  'Predicted': clf.predict_proba(test_df.drop(['status', 'loan_id', 'account_id', 'district_id'], axis=1))[:, 0]})
df.to_csv('submission.csv', index=False)

In [270]:
from sklearn.metrics import roc_auc_score, confusion_matrix
roc_auc_score(y, clf.predict_proba(x)[:, 0])

0.1952667283379587

In [271]:
tn, fp, fn, tp = confusion_matrix(y, clf.predict(x)).ravel()
print(f'True positives: {tp}')
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')

True positives: 278
True negatives: 6
False positives: 40
False negatives: 4
