# Data Preparation

Load Model from dataset folder

In [137]:
from model.model import Model

model = Model()

In [138]:
# Load Dataframes
account_df = model.get_accounts()
# card_df = model.get_cards()
client_df = model.get_clients()
disp_df = model.get_disps()
district_df = model.get_districts()
loan_test_df = model.get_loans('test')
loan_train_df = model.get_loans('train')
trans_test_df = model.get_transactions('test')
trans_train_df = model.get_transactions('train')

In [139]:
# Merge all
def merge_all():
    _disp_df = disp_df[disp_df['type'] == 'OWNER']
    df = _disp_df.merge(client_df, on='client_id')
    df = df.merge(district_df, on='district_id')
    df['date'] = df.merge(account_df, on='account_id')['date']
    return df

def merge_train(df):
    df = df.merge(loan_train_df, on='account_id')
    df = df.merge(trans_train_df, on='account_id')
    return df

def merge_test(df):
    df = df.merge(loan_test_df, on='account_id')
    df = df.merge(trans_test_df, on='account_id')
    return df 

df = merge_all()
train_df = merge_train(df)
test_df = merge_test(df)
train_df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302598,1993-04-20,credit,credit in cash,8897.0,8897.0
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302582,1993-04-20,credit,credit in cash,400.0,9297.0
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3526454,1993-04-30,credit,interest credited,13.6,9310.6
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302588,1993-05-03,credit,credit in cash,25724.0,35034.6
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,70699,...,154416,48,3217,1,3302586,1993-05-15,credit,credit in cash,25060.0,60094.6


In [140]:
train_df.loc[train_df['unemploymant rate \'95 '] == '?', 'unemploymant rate \'95 '] = '0'
train_df.loc[train_df['no. of commited crimes \'95 '] == '?', 'no. of commited crimes \'95 '] = '0'
train_df['unemploymant rate \'95 '] = train_df['unemploymant rate \'95 '].astype('float64')
train_df['no. of commited crimes \'95 '] = train_df['no. of commited crimes \'95 '].astype('float64')
test_df.loc[test_df['unemploymant rate \'95 '] == '?', 'unemploymant rate \'95 '] = '0'
test_df.loc[test_df['no. of commited crimes \'95 '] == '?', 'no. of commited crimes \'95 '] = '0'
test_df['unemploymant rate \'95 '] = test_df['unemploymant rate \'95 '].astype('float64')
test_df['no. of commited crimes \'95 '] = test_df['no. of commited crimes \'95 '].astype('float64')

In [141]:
train_df.loc[train_df['unemploymant rate \'95 '] == 0, 'unemploymant rate \'95 '] = np.average(train_df.loc[train_df['unemploymant rate \'95 '] != 0, 'unemploymant rate \'95 '])
train_df.loc[train_df['no. of commited crimes \'95 '] == 0, 'no. of commited crimes \'95 '] = np.average(train_df.loc[train_df['no. of commited crimes \'95 '] != 0, 'unemploymant rate \'95 '])
test_df.loc[test_df['unemploymant rate \'95 '] == 0, 'unemploymant rate \'95 '] = np.average(test_df.loc[test_df['unemploymant rate \'95 '] != 0, 'unemploymant rate \'95 '])
test_df.loc[test_df['no. of commited crimes \'95 '] == 0, 'no. of commited crimes \'95 '] = np.average(test_df.loc[test_df['no. of commited crimes \'95 '] != 0, 'unemploymant rate \'95 '])

In [142]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scale_cols = ['loan_amount', 'duration', 'balance',
       'payments', 'no. of inhabitants', 'no. of municipalities with inhabitants < 499 ',
       'no. of municipalities with inhabitants 500-1999',
       'no. of municipalities with inhabitants 2000-9999 ',
       'no. of municipalities with inhabitants >10000 ', 'no. of cities ',
       'ratio of urban inhabitants ', 'average salary ',
       'unemploymant rate \'95 ', 'unemploymant rate \'96 ',
       'no. of enterpreneurs per 1000 inhabitants ',
       'no. of commited crimes \'95 ', 'no. of commited crimes \'96 ']

train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
test_df[scale_cols] = scaler.fit_transform(test_df[scale_cols])
train_df.head()

Unnamed: 0,client_id,account_id,type,birth_number,district_id,gender,age,district_name,region,no. of inhabitants,...,loan_amount,duration,payments,status,trans_id,trans_date,trans_type,operation,trans_amount,balance
0,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302598,1993-04-20,credit,credit in cash,8897.0,0.06244
1,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302582,1993-04-20,credit,credit in cash,400.0,0.064467
2,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3526454,1993-04-30,credit,interest credited,13.6,0.064536
3,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302588,1993-05-03,credit,credit in cash,25724.0,0.194893
4,13490,10973,OWNER,1969-05-25,18,0,52,Pisek,south Bohemia,0.023989,...,0.280094,0.75,0.309285,1,3302586,1993-05-15,credit,credit in cash,25060.0,0.321886


In [143]:
import numpy as np

def aggregate(df):
    def count_withdrawal(trans_type):
        return sum(trans_type == 'withdrawal')
    def count_credit(trans_type):
        return sum(trans_type == 'credit')
    def mean_withdrawal(trans_type):
        return np.mean(trans_type == 'withdrawal')
    def mean_credit(trans_type):
        return np.mean(trans_type == 'credit')
    def std_withdrawal(trans_type):
        return np.std(trans_type == 'withdrawal')
    def std_credit(trans_type):
        return np.std(trans_type == 'credit')
    keep_cols = ['loan_id', 'account_id', 'loan_date', 'loan_amount',
                'duration', 'payments', 'status',
                'birth_number', # 'disp_type_count', # mudar para birth_date
                'district_id', 'gender', #'client_age',
                'no. of inhabitants',
                'no. of municipalities with inhabitants < 499 ',
                'no. of municipalities with inhabitants 500-1999',
                'no. of municipalities with inhabitants 2000-9999 ',
                'no. of municipalities with inhabitants >10000 ',
                'no. of cities ', 'ratio of urban inhabitants ',
                'average salary ', 'unemploymant rate \'95 ',
                'unemploymant rate \'96 ',
                'no. of enterpreneurs per 1000 inhabitants ',
                'no. of commited crimes \'95 ',
                'no. of commited crimes \'96 ', 'date']

    # TODO: add more aggregations
    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        'trans_amount': ['mean', 'min', 'max', 'std', 'last'],
        'balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_type': [count_withdrawal, count_credit, mean_withdrawal, mean_credit, std_withdrawal, std_credit]
    })

    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    return df

train_df = aggregate(train_df)
test_df = aggregate(test_df)
train_df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,balance_min,balance_max,balance_std,balance_last,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit
0,4959,2,1994-01-05,0.142398,0.25,0.325934,1,1945-02-04,1,1,...,0.022929,0.359563,0.061123,0.158512,29,22,0.537037,0.407407,0.498626,0.491352
1,4961,19,1996-04-29,0.047413,0.0,0.235219,-1,1939-04-23,21,0,...,0.020978,0.31207,0.076212,0.097695,32,46,0.4,0.575,0.489898,0.494343
2,4973,67,1996-05-02,0.301732,0.25,0.703949,1,1944-06-13,16,0,...,0.020902,0.559934,0.106194,0.137474,84,37,0.672,0.296,0.469485,0.456491
3,4996,132,1996-11-06,0.156433,0.0,0.752508,1,1945-07-03,40,0,...,0.018368,0.540522,0.109653,0.417728,14,16,0.451613,0.516129,0.497653,0.49974
4,5002,173,1994-05-31,0.187112,0.0,0.898079,1,1939-11-30,66,1,...,0.019888,0.310589,0.058364,0.159323,18,12,0.6,0.4,0.489898,0.489898


In [144]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        if col_type == 'object' or col_type == 'datetime64[ns]':
            df[col] = le.fit_transform(df[col])
    return df

train_df = encode_df(train_df)
test_df = encode_df(test_df)

In [145]:
train_df.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,birth_number,district_id,gender,...,balance_min,balance_max,balance_std,balance_last,trans_type_count_withdrawal,trans_type_count_credit,trans_type_mean_withdrawal,trans_type_mean_credit,trans_type_std_withdrawal,trans_type_std_credit
0,4959,2,19,0.142398,0.25,0.325934,1,52,1,1,...,0.022929,0.359563,0.061123,0.158512,29,22,0.537037,0.407407,0.498626,0.491352
1,4961,19,209,0.047413,0.0,0.235219,-1,17,21,0,...,0.020978,0.31207,0.076212,0.097695,32,46,0.4,0.575,0.489898,0.494343
2,4973,67,210,0.301732,0.25,0.703949,1,48,16,0,...,0.020902,0.559934,0.106194,0.137474,84,37,0.672,0.296,0.469485,0.456491
3,4996,132,265,0.156433,0.0,0.752508,1,56,40,0,...,0.018368,0.540522,0.109653,0.417728,14,16,0.451613,0.516129,0.497653,0.49974
4,5002,173,48,0.187112,0.0,0.898079,1,21,66,1,...,0.019888,0.310589,0.058364,0.159323,18,12,0.6,0.4,0.489898,0.489898


In [153]:
# from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier

# clf = SVC(probability=True)
clf = LogisticRegression(max_iter=10000)
# clf = DecisionTreeClassifier()
x = train_df.drop(['status'], axis=1)
y = train_df['status']
clf.fit(x, y)

LogisticRegression(max_iter=10000)

In [154]:
clf.predict_proba(test_df.drop(['status'], axis=1))[:, 0]

array([0.04292512, 0.08501369, 0.36555032, 0.05336762, 0.62597533,
       0.42604701, 0.12242745, 0.36187672, 0.26934435, 0.15894958,
       0.05798482, 0.07276976, 0.28167627, 0.04580894, 0.07318776,
       0.10215817, 0.30784879, 0.03281269, 0.15011571, 0.54234325,
       0.10437669, 0.49664187, 0.01837153, 0.23244318, 0.2840185 ,
       0.35294819, 0.2550009 , 0.23596367, 0.59905819, 0.34588847,
       0.05629717, 0.12193905, 0.04457139, 0.12000767, 0.37432264,
       0.3229964 , 0.10238782, 0.257211  , 0.25440618, 0.35629743,
       0.21650669, 0.23569076, 0.37604192, 0.19066999, 0.56407741,
       0.11996498, 0.60823254, 0.14452899, 0.13441632, 0.3462253 ,
       0.05711369, 0.13764668, 0.21444654, 0.25163288, 0.3761132 ,
       0.03521504, 0.20892098, 0.27797188, 0.11427906, 0.19609781,
       0.07721263, 0.32866393, 0.00445443, 0.34516247, 0.15100638,
       0.39904712, 0.16675611, 0.14592725, 0.31463312, 0.35966293,
       0.28162262, 0.47745816, 0.1075125 , 0.28272426, 0.10190

In [155]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y, clf.predict_proba(x)[:, 0])

0.24691643539932162