In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [9]:
from model.model import Model

model = Model()
model.load()

<model.model.Model at 0x11a6518e0>

In [10]:
def encode_df(df):
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        if col_type == 'object':
            df[col] = le.fit_transform(df[col])
    return df


def is_female(number):
    return str(number)[2] == '5' or str(number)[2] == '6'


def get_client_df():
    df = open_csv('client')
    df['gender'] = df.apply(lambda row: 0 if is_female(row['birth_number']) else 1, axis=1)
    df['birth_number'] = df.apply(lambda row: row['birth_number'] - 5000 if is_female(row['birth_number']) else row['birth_number'], axis=1)
    return df


In [11]:
def merge_all(model, op='test'):
    def modify_trans(df, op):
        df.loc[df[f'trans_{op}_type'] == 'withdrawal', f'trans_{op}_amount'] *= -1
        df = df.drop([f'trans_{op}_k_symbol'], axis=1)
        df = df.drop([f'trans_{op}_bank', f'trans_{op}_account'], axis=1)
        return df

    def modify_client(df):
        df['client_gender'] = df.apply(lambda row: 0 if is_female(row['client_birth_number']) else 1, axis=1)
        df['client_age'] = df.apply(lambda row: 2021 - ((row['client_birth_number'] // 10000) + 1900), axis=1)
        df['client_birth_number'] = df.apply(lambda row: row['client_birth_number'] - 5000 if is_female(row['client_birth_number']) else row['client_birth_number'], axis=1)
        return df

    account_df = model.datasets['account']
    
    client_df = model.datasets['client']
    client_df = modify_client(client_df)
    
    disp_df = model.datasets['disp']
    district_df = model.datasets['district'].rename(columns={'district_code': 'district_id'})
    card_df = model.datasets[op][f'card_{op}']
    loan_df = model.datasets[op][f'loan_{op}']
    
    trans_df = model.datasets[op][f'trans_{op}']
    trans_df = modify_trans(trans_df, op)

    ownership_df = disp_df.groupby('account_id', as_index=False, group_keys=False).agg({'disp_type': ['count']})
    ownership_df.columns = [f'{col_name}_{agg_name}' if 'id' not in col_name else col_name for col_name, agg_name in ownership_df.columns]
    disp_df = disp_df.merge(ownership_df, on='account_id')
    disp_df = disp_df[disp_df.disp_type == 'OWNER'].drop(['disp_type'], axis=1)

    df = pd.merge(loan_df, disp_df, on='account_id')
    df = pd.merge(df, client_df, on='client_id')
    df = pd.merge(df, district_df, on='district_id')
    df['account_date'] = pd.merge(df, account_df, on='account_id')['account_date']
    df = pd.merge(df, trans_df, on='account_id')
    return df.sort_values(by=[f'trans_{op}_date'])


train_df = merge_all(model, 'train')
test_df = merge_all(model, 'train')
# train_df.head()
# test_df = merge_all('test')
# print(get_trans_train_df()['trans_amount'])


In [12]:
train_df = encode_df(train_df)
test_df = encode_df(test_df)
train_df.head()

Unnamed: 0,loan_id,account_id,loan_train_date,loan_train_amount,loan_train_duration,loan_train_payments,loan_train_status,disp_id,client_id,disp_type_count,...,district_no._of_enterpreneurs_per_1000_inhabitants,district_no._of_commited_crimes_'95,district_no._of_commited_crimes_'96,account_date,trans_id,trans_train_date,trans_train_type,trans_train_operation,trans_train_amount,trans_train_balance
1988,6077,5270,931122,79608,24,3317,1,6367,6367,1,...,97,22,2325,930113,1548749,930113,0,2,800.0,800.0
1030,7284,11265,930915,52788,12,4399,1,13537,13845,1,...,124,11,1879,930114,3393738,930114,0,2,1000.0,1000.0
1989,6077,5270,931122,79608,24,3317,1,6367,6367,1,...,97,22,2325,930113,1548750,930114,0,0,44749.0,45549.0
1574,7121,10364,931110,21924,36,609,1,12446,12754,2,...,111,41,3894,930117,3122924,930117,0,2,1100.0,1100.0
6977,5754,3834,940928,23052,12,1921,1,4620,4620,2,...,140,13,18696,930119,1121963,930119,0,2,700.0,700.0


In [13]:
train_df.columns

Index(['loan_id', 'account_id', 'loan_train_date', 'loan_train_amount',
       'loan_train_duration', 'loan_train_payments', 'loan_train_status',
       'disp_id', 'client_id', 'disp_type_count', 'client_birth_number',
       'district_id', 'client_gender', 'client_age', 'district_name',
       'district_region', 'district_no._of_inhabitants',
       'district_no._of_municipalities_with_inhabitants_<_499',
       'district_no._of_municipalities_with_inhabitants_500-1999',
       'district_no._of_municipalities_with_inhabitants_2000-9999',
       'district_no._of_municipalities_with_inhabitants_>10000',
       'district_no._of_cities', 'district_ratio_of_urban_inhabitants',
       'district_average_salary', 'district_unemploymant_rate_'95',
       'district_unemploymant_rate_'96',
       'district_no._of_enterpreneurs_per_1000_inhabitants',
       'district_no._of_commited_crimes_'95',
       'district_no._of_commited_crimes_'96', 'account_date', 'trans_id',
       'trans_train_date', '

In [14]:
from agg import *
import numpy as np

def create_stuff(df):
    keep_cols = ['loan_id', 'account_id', 'loan_train_date', 'loan_train_amount',
                 'loan_train_duration', 'loan_train_payments', 'loan_train_status',
                 'disp_id', 'client_id', 'disp_type_count', 'client_birth_number',
                 'district_id', 'client_gender', 'client_age', 'district_name',
                 'district_region', 'district_no._of_inhabitants',
                 'district_no._of_municipalities_with_inhabitants_<_499',
                 'district_no._of_municipalities_with_inhabitants_500-1999',
                 'district_no._of_municipalities_with_inhabitants_2000-9999',
                 'district_no._of_municipalities_with_inhabitants_>10000',
                 'district_no._of_cities', 'district_ratio_of_urban_inhabitants',
                 'district_average_salary', 'district_unemploymant_rate_\'95',
                 'district_unemploymant_rate_\'96',
                 'district_no._of_enterpreneurs_per_1000_inhabitants',
                 'district_no._of_commited_crimes_\'95',
                 'district_no._of_commited_crimes_\'96', 'account_date', 'trans_id',
                 'trans_train_date', 'trans_train_type', 'trans_train_operation',
                 'trans_train_amount', 'trans_train_balance']
    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        'trans_train_amount': ['mean', 'min', 'max', 'std', 'last'],
        'trans_train_balance': ['mean', 'min', 'max', 'std', 'last'],
        'trans_train_type': [count_withdrawal, count_credit, mean_withdrawal, mean_credit, std_withdrawal, std_credit]
    })
    # df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    return df

train_df = create_stuff(train_df)
test_df = create_stuff(test_df)

train_df.head(5)
# train_df.dtypes


Unnamed: 0_level_0,loan_id,account_id,loan_train_date,loan_train_amount,loan_train_duration,loan_train_payments,loan_train_status,disp_id,client_id,disp_type_count,...,trans_train_balance,trans_train_balance,trans_train_balance,trans_train_balance,trans_train_type,trans_train_type,trans_train_type,trans_train_type,trans_train_type,trans_train_type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,min,max,std,last,count_withdrawal,count_credit,mean_withdrawal,mean_credit,std_withdrawal,std_credit
0,4959,2,940105,80952,24,3373,1,2,2,2,...,1100.0,1100.0,,1100.0,0,0,0.0,0.0,0.0,0.0
1,4959,2,940105,80952,24,3373,1,2,2,2,...,21336.0,21336.0,,21336.0,0,0,0.0,0.0,0.0,0.0
2,4959,2,940105,80952,24,3373,1,2,2,2,...,45285.5,45285.5,,45285.5,0,0,0.0,0.0,0.0,0.0
3,4959,2,940105,80952,24,3373,1,2,2,2,...,54630.9,54630.9,,54630.9,0,0,0.0,0.0,0.0,0.0
4,4959,2,940105,80952,24,3373,1,2,2,2,...,67529.6,67529.6,,67529.6,0,0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.svm import SVC

# test_df.columns = [col_name.replace('test', 'train') for col_name in test_df.columns]
clf = SVC(probability=True)
x = train_df.drop(['loan_train_status'], axis=1).fillna(0)
y = train_df.loan_train_status
clf.fit(x,y)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


SVC(probability=True)

In [18]:
df = pd.DataFrame(data={'Id': test_df.loan_id,
                  'Predicted': clf.predict_proba(test_df.drop(['loan_train_status'], axis=1).fillna(0))[:, -1]})
df.to_csv('submission.csv', index=False)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [19]:
df = df.groupby('Id').max()
df.to_csv('submission.csv', index=True)

In [None]:
train_df.to_csv('data/train.csv', index=False, sep=';')