In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
from model.model import Model

model = Model()
model.load()

<model.model.Model at 0x1ee64e90b50>

In [3]:
def encode_df(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``; all other columns are passed through
    unchanged. The input frame itself is not modified, so re-running a cell
    that calls this function always starts from the same data.

    Note that the codes are fitted per call and per column: two frames
    encoded by separate calls are not guaranteed to map the same label to
    the same integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    encoded = df.copy()
    for col, col_type in encoded.dtypes.items():
        if col_type == 'object':
            # fit_transform refits from scratch, so a fresh encoder per
            # column behaves the same as reusing a single instance.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded


def is_female(number):
    """Tell whether a YYMMDD-style birth number is flagged as female.

    The flag is the tens digit of the month part: a 5 or a 6 there marks a
    female client (callers subtract 5000 to recover the plain number). The
    digit is read arithmetically rather than by string position, so integer
    birth numbers that lost a leading zero (years 00-09) are still handled.

    Parameters
    ----------
    number : int or value convertible with ``int()``
        Birth number to inspect.

    Returns
    -------
    bool
        True when the month tens digit is 5 or 6. Values that cannot be
        converted to an integer (e.g. NaN or None) yield False.
    """
    try:
        value = int(number)
    except (TypeError, ValueError, OverflowError):
        # Unparseable input carries no female marker.
        return False
    # value // 1000 drops the day and the month units; % 10 keeps the
    # tens digit of the month.
    return (value // 1000) % 10 in (5, 6)


def get_client_df():
    """Load the client table, add a ``gender`` column and normalise birth numbers.

    ``gender`` is 0 for rows whose birth number is flagged by ``is_female``
    and 1 otherwise. Flagged birth numbers are then reduced by 5000.

    NOTE(review): ``open_csv`` is not defined in the visible notebook cells;
    confirm it is still available before relying on this helper.
    """
    def gender_flag(row):
        # 0 marks a female client, 1 everyone else.
        if is_female(row['birth_number']):
            return 0
        return 1

    def plain_birth_number(row):
        # Remove the 5000 offset from flagged birth numbers.
        number = row['birth_number']
        if is_female(number):
            return number - 5000
        return number

    clients = open_csv('client')
    # Gender must be derived before the birth number is normalised.
    clients['gender'] = clients.apply(gender_flag, axis=1)
    clients['birth_number'] = clients.apply(plain_birth_number, axis=1)
    return clients


In [23]:
def merge_all(model, op='test', reference_year=2021):
    """Join the tables of one split into a single transaction-level frame.

    Loans are joined with the owning disposition, the owner's client record,
    the client's district, the account opening date and finally every
    transaction of the account.

    Parameters
    ----------
    model : object
        Must expose ``datasets``: a mapping with the 'account', 'client',
        'disp' and 'district' frames plus one nested mapping per split that
        holds the ``loan_{op}`` and ``trans_{op}`` frames.
    op : str, default 'test'
        Split name; used both to look up the split's frames and as the
        infix of the split-specific column names (``trans_{op}_amount`` ...).
    reference_year : int, default 2021
        Year against which ``client_age`` is computed.

    Returns
    -------
    pandas.DataFrame
        One row per transaction, sorted by ``trans_{op}_date``.

    Notes
    -----
    The frames stored in ``model.datasets`` are never modified. All derived
    columns are built on copies, so calling this function several times (or
    once per split) always yields the same result.
    """
    def modify_trans(df, op):
        """Return a copy with signed amounts and the unused columns dropped."""
        # Copy first: negating in place would flip the sign back on every
        # repeated call against the same source frame.
        df = df.copy()
        df.loc[df[f'trans_{op}_type'] == 'withdrawal', f'trans_{op}_amount'] *= -1
        return df.drop(
            columns=[f'trans_{op}_k_symbol', f'trans_{op}_bank', f'trans_{op}_account']
        )

    def modify_client(df):
        """Return a copy with gender, age and normalised birth numbers."""
        # Copy first: once the shared client frame has been normalised, a
        # second call could no longer recognise female birth numbers.
        df = df.copy()
        birth = df['client_birth_number']
        # Tens digit of the month part; 5 or 6 marks a female client.
        female = ((birth // 1000) % 10).isin([5, 6])
        df['client_gender'] = (~female).astype('int64')
        df['client_age'] = reference_year - ((birth // 10000) + 1900)
        df['client_birth_number'] = birth.where(~female, birth - 5000)
        return df

    account_df = model.datasets['account']
    client_df = modify_client(model.datasets['client'])
    disp_df = model.datasets['disp']
    district_df = model.datasets['district'].rename(columns={'district_code': 'district_id'})
    loan_df = model.datasets[op][f'loan_{op}']
    trans_df = modify_trans(model.datasets[op][f'trans_{op}'], op)

    # Number of dispositions per account, attached to the owner's row.
    ownership_df = disp_df.groupby('account_id', as_index=False).agg(
        disp_type_count=('disp_type', 'count')
    )
    owner_df = disp_df.merge(ownership_df, on='account_id')
    owner_df = owner_df[owner_df['disp_type'] == 'OWNER'].drop(columns=['disp_type'])

    df = pd.merge(loan_df, owner_df, on='account_id')
    df = pd.merge(df, client_df, on='client_id')
    df = pd.merge(df, district_df, on='district_id')
    # Join on the key instead of relying on positional index alignment, and
    # keep every loan row even if its account is missing from account_df.
    df = pd.merge(df, account_df[['account_id', 'account_date']], on='account_id', how='left')
    df = pd.merge(df, trans_df, on='account_id')
    return df.sort_values(by=[f'trans_{op}_date'])


# Build one merged, transaction-level frame per split.
train_df = merge_all(model, op='train')
test_df = merge_all(model, op='test')


   client_id  client_birth_number  district_id  client_gender  client_age
0          1               701213           18              1          51
1          2               450204            1              1          76
2          3               401009            1              1          81
3          4               561201            5              1          65
4          5               600703            5              1          61
   client_id  client_birth_number  district_id  client_gender  client_age
0          1               701213           18              1          51
1          2               450204            1              1          76
2          3               401009            1              1          81
3          4               561201            5              1          65
4          5               600703            5              1          61


In [9]:
# Label-encode the object columns of both splits.
# NOTE(review): each call fits its own encoders, so a label may receive a
# different code in train_df than in test_df - confirm this is acceptable.
train_df, test_df = encode_df(train_df), encode_df(test_df)
train_df.head()

Unnamed: 0,loan_id,account_id,loan_train_date,loan_train_amount,loan_train_duration,loan_train_payments,loan_train_status,disp_id,client_id,disp_type_count,...,district_no._of_enterpreneurs_per_1000_inhabitants,district_no._of_commited_crimes_'95,district_no._of_commited_crimes_'96,account_date,trans_id,trans_train_date,trans_train_type,trans_train_operation,trans_train_amount,trans_train_balance
1988,6077,5270,931122,79608,24,3317,1,6367,6367,1,...,97,22,2325,930113,1548749,930113,0,2,800.0,800.0
1030,7284,11265,930915,52788,12,4399,1,13537,13845,1,...,124,11,1879,930114,3393738,930114,0,2,1000.0,1000.0
1989,6077,5270,931122,79608,24,3317,1,6367,6367,1,...,97,22,2325,930113,1548750,930114,0,0,44749.0,45549.0
1574,7121,10364,931110,21924,36,609,1,12446,12754,2,...,111,41,3894,930117,3122924,930117,0,2,1100.0,1100.0
6977,5754,3834,940928,23052,12,1921,1,4620,4620,2,...,140,13,18696,930119,1121963,930119,0,2,700.0,700.0


In [6]:
from sklearn.svm import SVC

# Give the test columns the same names as the training columns so the
# fitted model can be applied to test_df unchanged.
test_df.columns = [col_name.replace('test', 'train') for col_name in test_df.columns]

# probability=True calibrates on shuffled internal folds; a fixed
# random_state makes the predicted probabilities reproducible between runs.
clf = SVC(probability=True, random_state=42)
x = train_df.drop(['loan_train_status'], axis=1)
y = train_df.loan_train_status
clf.fit(x, y)

SVC(probability=True)

In [14]:
# Score every test row and keep the probability of the last class
# listed by the classifier.
test_features = test_df.drop(columns=['loan_train_status'])
last_class_proba = clf.predict_proba(test_features)[:, -1]

df = pd.DataFrame({'Id': test_df.loan_id, 'Predicted': last_class_proba})
df.to_csv('submission.csv', index=False)

In [27]:
# Collapse the per-row predictions to a single value per Id (the maximum)
# and overwrite the submission file; Id is written as the index column.
df = df.groupby(by='Id').max()
df.to_csv('submission.csv')

In [16]:
from agg import *
import numpy as np

def create_stuff(df):
    """Aggregate transaction rows into one row per combination of key columns.

    The frame is grouped by the loan, client and district columns listed
    below. Within each group ``trans_amount`` and ``balance`` are summarised
    (mean, min, max, std, last) and ``trans_type`` is passed to the
    withdrawal/credit aggregators brought in by ``from agg import *``.

    The result keeps the two-level (column, aggregate) column index produced
    by ``agg``; it is not flattened here.

    NOTE(review): every key column must exist in ``df`` under exactly the
    name given here. The recorded run of this notebook fails with
    ``KeyError: 'status'`` because the merged frame uses prefixed names such
    as ``loan_train_status`` - confirm which schema this helper targets.
    """
    loan_cols = ['loan_id', 'account_id', 'status', 'loan_date', 'account_date',
                 'loan_amount', 'duration', 'payments']
    client_cols = ['gender', 'birth_number', 'disp_type_count', 'district_id']
    district_cols = [
        'no. of inhabitants',
        'no. of municipalities with inhabitants < 499 ',
        'no. of municipalities with inhabitants 500-1999',
        'no. of municipalities with inhabitants 2000-9999 ',
        'no. of municipalities with inhabitants >10000 ',
        'no. of cities ',
        'ratio of urban inhabitants ',
        'average salary ',
        'unemploymant rate \'95 ',
        'unemploymant rate \'96 ',
        'no. of enterpreneurs per 1000 inhabitants ',
        'no. of commited crimes \'95 ',
        'no. of commited crimes \'96 ',
    ]
    keep_cols = loan_cols + client_cols + district_cols

    summary_stats = ['mean', 'min', 'max', 'std', 'last']
    type_stats = [
        count_withdrawal, count_credit,
        mean_withdrawal, mean_credit,
        std_withdrawal, std_credit,
        cov_withdrawal, cov_credit,
    ]
    aggregations = {
        'trans_amount': list(summary_stats),
        'balance': list(summary_stats),
        'trans_type': type_stats,
    }

    grouped = df.groupby(keep_cols, as_index=False, group_keys=False)
    return grouped.agg(aggregations)

# Replace the transaction-level frame with its aggregated form.
train_df = create_stuff(df=train_df)

# Preview the first five aggregated rows.
train_df.head()

KeyError: 'status'

In [None]:
# Persist the training frame as a semicolon-separated file without the index.
train_df.to_csv('data/train.csv', sep=';', index=False)