In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [27]:
from model.model import Model

# Instantiate the project's Model wrapper. The merge step further down reads
# its tables from `model.datasets`.
# NOTE(review): `load()` presumably populates `model.datasets` — confirm in model/model.py.
model = Model()
model.load()

<model.model.Model at 0x2553d4339a0>

In [28]:
def encode_df(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone. Columns of any other dtype
    are carried over unchanged.

    Args:
        df: DataFrame to encode. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame keeps its original values and the
    # function can be re-run on the same input with the same result.
    encoded = df.copy()
    for col, col_type in df.dtypes.items():
        if col_type == 'object':
            # A fresh encoder per column keeps every fit independent.
            encoded[col] = LabelEncoder().fit_transform(df[col])
    return encoded


def is_female(number):
    """Tell whether a birth number is flagged as female.

    The number is read as ``YYMMDD``; it counts as female when the tens digit
    of ``MM`` is 5 or 6. Callers in this file subtract 5000 from numbers this
    function flags, which moves the month back into its plain range.

    The digit is extracted arithmetically rather than by string position, so
    the answer stays correct when an integer has lost leading zeros (years
    00-09 become 5-digit or shorter integers).

    Args:
        number: birth number as an int, an integral float, or a digit string.

    Returns:
        True if the tens digit of the month part is 5 or 6, else False.

    Raises:
        ValueError, TypeError: if ``number`` cannot be converted to an int.
    """
    month_tens_digit = (int(number) // 1000) % 10
    return month_tens_digit in (5, 6)


def get_client_df():
    """Load the client table, add ``gender`` and normalise ``birth_number``.

    ``gender`` is 0 for rows whose birth number ``is_female`` flags and 1
    otherwise. Flagged birth numbers then have 5000 subtracted.

    NOTE(review): ``open_csv`` is not defined in the visible part of the
    notebook; it is assumed to return a DataFrame with a ``birth_number``
    column — confirm.

    Returns:
        The loaded DataFrame with the extra ``gender`` column and the adjusted
        ``birth_number`` column.
    """
    df = open_csv('client')
    birth_number = df['birth_number']
    # Column-wise map instead of a row-wise apply: rows of a mixed-dtype frame
    # are upcast before the callback sees them, which can turn integer birth
    # numbers into floats.
    female = birth_number.map(is_female).astype(bool)
    df['gender'] = (~female).astype('int64')
    df['birth_number'] = birth_number.where(~female, birth_number - 5000)
    return df


In [29]:
def merge_all(model, op='test'):
    """Join loans, owners, clients, districts, accounts and transactions.

    The result has one row per transaction of every account that has a loan
    in the ``op`` split, carrying the loan, account-owner, client, district
    and account-date columns alongside the transaction columns.

    The frames stored in ``model.datasets`` are never modified, so calling
    this function repeatedly with the same arguments returns the same result.

    Args:
        model: object exposing ``datasets``, a mapping that holds the
            ``'account'``, ``'client'``, ``'disp'`` and ``'district'`` frames
            and, under ``datasets[op]``, the ``loan_{op}`` and ``trans_{op}``
            frames.
        op: split name used both in the dataset keys and in the column
            prefixes (``trans_{op}_*``).

    Returns:
        The merged DataFrame sorted by ``trans_{op}_date``.
    """
    reference_year = 2021  # year against which client_age is computed

    def modify_trans(df, op):
        # Copy before editing: negating in place would flip the sign of the
        # stored withdrawals again on every later call.
        df = df.copy()
        df.loc[df[f'trans_{op}_type'] == 'withdrawal', f'trans_{op}_amount'] *= -1
        return df.drop(
            [f'trans_{op}_k_symbol', f'trans_{op}_bank', f'trans_{op}_account'],
            axis=1,
        )

    def modify_client(df):
        # Copy before editing: once the stored birth numbers are normalised,
        # a second pass would no longer flag anyone as female.
        df = df.copy()
        birth_number = df['client_birth_number']
        female = birth_number.map(is_female).astype(bool)
        df['client_gender'] = (~female).astype('int64')
        # The year part is unaffected by the +5000 month shift, so the raw
        # number can be used here.
        df['client_age'] = reference_year - ((birth_number // 10000) + 1900)
        df['client_birth_number'] = birth_number.where(~female, birth_number - 5000)
        return df

    account_df = model.datasets['account']
    client_df = modify_client(model.datasets['client'])
    disp_df = model.datasets['disp']
    district_df = model.datasets['district'].rename(columns={'district_code': 'district_id'})
    loan_df = model.datasets[op][f'loan_{op}']
    trans_df = modify_trans(model.datasets[op][f'trans_{op}'], op)

    # Number of dispositions (owner + others) attached to each account.
    ownership_df = (
        disp_df.groupby('account_id')['disp_type']
        .count()
        .rename('disp_type_count')
        .reset_index()
    )
    disp_df = disp_df.merge(ownership_df, on='account_id')
    # Keep only the owner row of each account.
    disp_df = disp_df[disp_df.disp_type == 'OWNER'].drop(['disp_type'], axis=1)

    df = pd.merge(loan_df, disp_df, on='account_id')
    df = pd.merge(df, client_df, on='client_id')
    df = pd.merge(df, district_df, on='district_id')
    # Bring in only account_date, matched by key. A left join keeps every row
    # and leaves a missing value where an account is unknown, instead of
    # relying on two frames happening to share row positions.
    df = pd.merge(df, account_df[['account_id', 'account_date']], on='account_id', how='left')
    df = pd.merge(df, trans_df, on='account_id')
    return df.sort_values(by=[f'trans_{op}_date'])


# Build the merged per-transaction frames used by the cells below.
train_df = merge_all(model, 'train')
# NOTE(review): test_df is built from the 'train' split as well, so every
# "test" prediction further down is made on the training rows. The later
# aggregation cell only knows the `*_train_*` column names, so switching this
# call to 'test' needs matching changes there — confirm what is intended.
test_df = merge_all(model, 'train')


In [30]:
# (rows, columns) of the merged frame, before aggregation.
train_df.shape

(24494, 36)

In [31]:
from agg import *
import numpy as np

def create_stuff(df, op='train'):
    """Collapse the per-transaction frame into one row per group of key columns.

    Rows are grouped by the loan, account, client and district columns listed
    in ``keep_cols``. The transaction amount and balance are summarised with
    mean/min/max/std/last, and the transaction type with the six
    ``count_*`` / ``mean_*`` / ``std_*`` helpers. Columns that are neither a
    grouping key nor aggregated do not appear in the result.

    ``last`` takes the final row of each group in the order of ``df``.

    NOTE(review): ``count_withdrawal`` and the other helpers are defined
    outside this function, presumably by ``from agg import *`` — confirm.

    Args:
        df: merged per-transaction DataFrame.
        op: split name embedded in the ``loan_{op}_*`` and ``trans_{op}_*``
            column names. Defaults to ``'train'``.

    Returns:
        Aggregated DataFrame with flat column names of the form
        ``<column>_<statistic>`` for aggregated columns and the unchanged
        name for grouping keys.
    """
    keep_cols = ['loan_id', 'account_id', f'loan_{op}_date', f'loan_{op}_amount',
                 f'loan_{op}_duration', f'loan_{op}_payments', f'loan_{op}_status',
                 'disp_type_count', 'client_birth_number',  # TODO: rename to birth_date
                 'district_id', 'client_gender',  # client_age is currently not a grouping key
                 'district_no._of_inhabitants',
                 'district_no._of_municipalities_with_inhabitants_<_499',
                 'district_no._of_municipalities_with_inhabitants_500-1999',
                 'district_no._of_municipalities_with_inhabitants_2000-9999',
                 'district_no._of_municipalities_with_inhabitants_>10000',
                 'district_no._of_cities', 'district_ratio_of_urban_inhabitants',
                 'district_average_salary', 'district_unemploymant_rate_\'95',
                 'district_unemploymant_rate_\'96',
                 'district_no._of_enterpreneurs_per_1000_inhabitants',
                 'district_no._of_commited_crimes_\'95',
                 'district_no._of_commited_crimes_\'96', 'account_date']
    numeric_stats = ['mean', 'min', 'max', 'std', 'last']

    df = df.groupby(keep_cols, as_index=False, group_keys=False).agg({
        f'trans_{op}_amount': list(numeric_stats),
        f'trans_{op}_balance': list(numeric_stats),
        f'trans_{op}_type': [count_withdrawal, count_credit, mean_withdrawal,
                             mean_credit, std_withdrawal, std_credit],
    })
    # The aggregation yields two-level column labels; grouping keys have an
    # empty second level and keep their plain name.
    df.columns = [f'{name}_{stat}' if stat else name for name, stat in df.columns]

    return df

# Aggregate the per-transaction frames with create_stuff.
# NOTE(review): both names are overwritten with their own transform, so this
# cell is not idempotent — re-running it without first re-running the merge
# cell feeds already-aggregated frames back into create_stuff.
train_df = create_stuff(train_df)
test_df = create_stuff(test_df)

# Label-encode any remaining object columns. The two frames are encoded
# independently of each other.
train_df = encode_df(train_df)
test_df = encode_df(test_df)
train_df.head(5)


Unnamed: 0,loan_id,account_id,loan_train_date,loan_train_amount,loan_train_duration,loan_train_payments,loan_train_status,disp_type_count,client_birth_number,district_id,...,trans_train_balance_min,trans_train_balance_max,trans_train_balance_std,trans_train_balance_last,trans_train_type_count_withdrawal,trans_train_type_count_credit,trans_train_type_mean_withdrawal,trans_train_type_mean_credit,trans_train_type_std_withdrawal,trans_train_type_std_credit
0,4959,2,940105,80952,24,3373,1,2,450204,1,...,1100.0,67529.6,12061.705682,27855.2,29,22,0.537037,0.407407,0.498626,0.491352
1,4961,19,960429,30276,12,2523,-1,1,390423,21,...,715.0,58157.5,15039.248405,15854.0,32,46,0.4,0.575,0.489898,0.494343
2,4973,67,960502,165960,24,6915,1,1,440613,16,...,700.0,107069.6,20955.646998,23703.8,84,37,0.672,0.296,0.469485,0.456491
3,4996,132,961106,88440,12,7370,1,2,450703,40,...,200.0,103239.0,21638.25887,78907.6,14,16,0.451613,0.516129,0.497653,0.49974
4,5002,173,940531,104808,12,8734,1,2,391130,66,...,500.0,57865.3,11517.175248,28015.4,18,12,0.6,0.4,0.489898,0.489898


In [32]:
# (rows, columns) of the aggregated frame.
train_df.shape

(328, 41)

In [33]:
from sklearn.svm import SVC

# Fit a support-vector classifier on every aggregated column except the label.
# probability=True is what makes predict_proba available; scikit-learn fits
# those probabilities with an internal cross-validation that shuffles the
# data, so random_state is pinned to keep the predictions reproducible.
# NOTE(review): loan_id and account_id are still part of the feature matrix —
# confirm that identifiers are meant to be used as predictors.
clf = SVC(probability=True, random_state=42)
x = train_df.drop(['loan_train_status'], axis=1)
y = train_df.loan_train_status
clf.fit(x,y)

SVC(probability=True)

In [34]:
# Build the submission table: one row per row of test_df, keyed by loan_id.
# `[:, -1]` takes the last column of predict_proba, i.e. the probability of
# the last entry of clf.classes_.
# NOTE(review): confirm that this is the class the submission is scored on.
df = pd.DataFrame(data={'Id': test_df.loan_id,
                  'Predicted': clf.predict_proba(test_df.drop(['loan_train_status'], axis=1))[:, -1]})
# Written without the index so the file holds exactly the Id and Predicted columns.
df.to_csv('submission.csv', index=False)

In [40]:
# Sanity check: number of distinct loan ids in the submission table.
# NOTE(review): this cell relies on `df` still being the submission frame
# built in the previous cell; run the notebook top to bottom.
print(len(df['Id'].unique()))
# Keep index=False so the file has the same Id/Predicted layout as the first
# write; index=True would prepend an unnamed index column.
df.to_csv('submission.csv', index=False)

KeyError: 'Id'

In [36]:
# Number of distinct loans in the aggregated frame; compare with the row
# count shown by train_df.shape to see whether each loan has exactly one row.
len(train_df['loan_id'].unique())

328