In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [20]:
def encode_df(df):
    """Label-encode every object-dtype column of ``df``.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame whose ``object`` columns should be replaced by the
            codes produced by ``LabelEncoder.fit_transform``.

    Returns:
        The same ``df`` object, with its object-dtype columns overwritten.
    """
    encoder = LabelEncoder()
    # Decide which columns to encode before touching the frame.
    object_columns = [name for name, kind in df.dtypes.items() if kind == 'object']
    for name in object_columns:
        # The single encoder is re-fitted on each column in turn.
        df[name] = encoder.fit_transform(df[name])
    return df


def is_female(number):
    """Return True when a birth number carries the female marker.

    The marker is a ``5`` or ``6`` in the third position of the six-character
    birth number (the tens digit of what appears to be the month field of a
    ``YYMMDD`` value -- get_client_df subtracts 5000 from such numbers).

    Args:
        number: Birth number as an int or string. Values shorter than six
            characters are left-padded with zeros, so a number that lost its
            leading zero when parsed as an integer is still read correctly.

    Returns:
        bool: True if the third character of the padded number is '5' or '6'.
    """
    # zfill restores leading zeros dropped by integer parsing (e.g. 55231 ->
    # '055231'); without it the wrong digit would be inspected and very short
    # inputs would raise IndexError.
    digits = str(number).zfill(6)
    return digits[2] in ('5', '6')

def open_csv(filename):
    """Read ``data/<filename>.csv`` as a semicolon-separated table.

    Args:
        filename: Base name of the CSV file, without directory or extension.

    Returns:
        The DataFrame produced by ``pd.read_csv``; encode_df is not applied.
    """
    csv_path = 'data/%s.csv' % filename
    table = pd.read_csv(csv_path, sep=';', low_memory=False)
    return table

def get_account_df():
    """Load the ``account`` table with its ``date`` column renamed to ``account_date``."""
    accounts = open_csv('account')
    column_names = {'date': 'account_date'}
    return accounts.rename(columns=column_names)

def get_card_test_df():
    """Load the ``card_test`` table with its ``type`` column renamed to ``card_type``."""
    cards = open_csv('card_test')
    column_names = {'type': 'card_type'}
    return cards.rename(columns=column_names)

def get_card_train_df():
    """Load the ``card_train`` table with its ``type`` column renamed to ``card_type``."""
    cards = open_csv('card_train')
    column_names = {'type': 'card_type'}
    return cards.rename(columns=column_names)

def get_client_df():
    """Load the ``client`` table and derive gender from the birth number.

    Adds a ``gender`` column (0 where is_female() is true for the row's
    ``birth_number``, 1 otherwise) and subtracts 5000 from the birth number
    of every row flagged as female, removing the marker that is_female()
    detects.

    Returns:
        The client DataFrame with the extra ``gender`` column appended and
        ``birth_number`` adjusted in place.
    """
    df = open_csv('client')

    # Evaluate the marker once per client on the column itself instead of two
    # row-wise DataFrame.apply passes; this also keeps empty frames working.
    female = df['birth_number'].map(is_female).astype(bool)

    df['gender'] = (~female).astype('int64')
    df.loc[female, 'birth_number'] -= 5000
    return df

def get_disp_df():
    """Load the ``disp`` table without ``disp_id`` and with ``type`` renamed to ``disp_type``."""
    dispositions = open_csv('disp')
    without_id = dispositions.drop(columns=['disp_id'])
    return without_id.rename(columns={'type': 'disp_type'})

def get_district_df():
    """Load the ``district`` table with its ``'code '`` column renamed to ``district_id``."""
    districts = open_csv('district')
    # The source column name is matched with its trailing space.
    column_names = {'code ': 'district_id'}
    return districts.rename(columns=column_names)

def get_loan_test_df():
    """Load the ``loan_test`` table with loan-prefixed column names.

    ``date`` becomes ``loan_date`` and ``amount`` becomes ``loan_amount``;
    every missing value in the frame is then replaced by an empty string.
    """
    loans = open_csv('loan_test')
    column_names = {'date': 'loan_date', 'amount': 'loan_amount'}
    renamed = loans.rename(columns=column_names)
    return renamed.fillna('')

def get_loan_train_df():
    """Load the ``loan_train`` table with loan-prefixed column names.

    ``date`` becomes ``loan_date`` and ``amount`` becomes ``loan_amount``.
    """
    loans = open_csv('loan_train')
    column_names = {'date': 'loan_date', 'amount': 'loan_amount'}
    return loans.rename(columns=column_names)

def _load_trans_df(name):
    """Load a transaction CSV and normalise it for merging.

    Steps, in order:
      * missing ``operation`` values are filled from ``k_symbol``;
      * ``amount`` is negated on rows whose ``type`` is exactly
        ``"withdrawal"`` (other type values are left untouched);
      * the ``k_symbol``, ``bank`` and ``account`` columns are dropped;
      * ``type``, ``date`` and ``amount`` get a ``trans_`` prefix.

    Args:
        name: Base name of the CSV file passed to open_csv().

    Returns:
        The prepared transaction DataFrame.
    """
    df = open_csv(name)
    df['operation'] = df['operation'].fillna(df['k_symbol'])
    df.loc[df['type'] == 'withdrawal', 'amount'] *= -1
    df = df.drop(columns=['k_symbol', 'bank', 'account'])
    return df.rename(columns={'type': 'trans_type', 'date': 'trans_date', 'amount': 'trans_amount'})


def get_trans_test_df():
    """Return the prepared ``trans_test`` table (see _load_trans_df)."""
    return _load_trans_df('trans_test')


def get_trans_train_df():
    """Return the prepared ``trans_train`` table (see _load_trans_df)."""
    return _load_trans_df('trans_train')


In [21]:
def merge_all(op='test'):
    """Join loans with owner, client, district, account and transaction data.

    Args:
        op: ``'test'`` loads the test loan and transaction tables; any other
            value loads the train tables.

    Returns:
        A DataFrame with one row per transaction on each loan's account,
        carrying the loan, owner-client, district and account-date columns
        plus ``disp_type_count`` (number of dispositions on the account),
        sorted by ``trans_date``.
    """
    use_test = op == 'test'
    account_df = get_account_df()
    client_df = get_client_df()
    disp_df = get_disp_df()
    district_df = get_district_df()
    loan_df = get_loan_test_df() if use_test else get_loan_train_df()
    trans_df = get_trans_test_df() if use_test else get_trans_train_df()
    # The card tables are not loaded: nothing in this merge uses them.

    # Count dispositions per account before narrowing down to the owner row.
    ownership_df = disp_df.groupby('account_id', as_index=False).agg(
        disp_type_count=('disp_type', 'count')
    )
    owner_df = pd.merge(disp_df, ownership_df, on='account_id')
    owner_df = owner_df[owner_df.disp_type == 'OWNER'].drop(['disp_type'], axis=1)

    df = pd.merge(loan_df, owner_df, on='account_id')
    df = pd.merge(df, client_df, on='client_id')
    df = pd.merge(df, district_df, on='district_id')

    # Attach account_date by key. Copying the column out of a separately
    # merged frame relies on both frames having identical row order and
    # length, so a single missing or duplicated account would silently shift
    # dates onto the wrong loans. Only the two needed columns are merged so
    # the account table's other columns cannot collide with existing ones.
    df = pd.merge(df, account_df[['account_id', 'account_date']], on='account_id', how='left')

    df = pd.merge(df, trans_df, on='account_id')
    # Stable sort: create_stuff takes "last" aggregates, which depend on row
    # order, so same-date transactions should keep their merged order.
    return df.sort_values(by=['trans_date'], kind='mergesort')


# Build the merged training frame: merge_all joins the loan tables with the
# transaction table, so each loan appears once per transaction on its account.
train_df = merge_all('train')
# test_df = merge_all('test')
# print(get_trans_train_df()['trans_amount'])


In [22]:
from agg import *
import numpy as np

def create_stuff(df):
    """Collapse the per-transaction frame into one row per loan.

    Rows are grouped by the loan, client and district columns listed below,
    and the transaction columns are summarised per group. The result keeps
    the two-level column index that ``agg`` produces; it is not flattened.

    Args:
        df: Merged frame containing the grouping columns plus
            ``trans_amount``, ``balance`` and ``trans_type``.

    Returns:
        The aggregated DataFrame with the grouping columns kept as columns.
    """
    loan_columns = [
        "loan_id", "account_id", "status", "loan_date", "account_date",
        "loan_amount", "duration", "payments",
    ]
    client_columns = ["gender", "birth_number", "disp_type_count"]
    # District column names are copied verbatim, including trailing spaces.
    district_columns = [
        "district_id",
        "no. of inhabitants",
        "no. of municipalities with inhabitants < 499 ",
        "no. of municipalities with inhabitants 500-1999",
        "no. of municipalities with inhabitants 2000-9999 ",
        "no. of municipalities with inhabitants >10000 ",
        "no. of cities ",
        "ratio of urban inhabitants ",
        "average salary ",
        "unemploymant rate '95 ",
        "unemploymant rate '96 ",
        "no. of enterpreneurs per 1000 inhabitants ",
        "no. of commited crimes '95 ",
        "no. of commited crimes '96 ",
    ]
    key_columns = loan_columns + client_columns + district_columns

    numeric_summary = ["mean", "min", "max", "std", "last"]
    type_summary = [
        count_withdrawal, count_credit,
        mean_withdrawal, mean_credit,
        std_withdrawal, std_credit,
        cov_withdrawal, cov_credit,
    ]
    aggregations = {
        "trans_amount": list(numeric_summary),
        "balance": list(numeric_summary),
        "trans_type": type_summary,
    }

    grouped = df.groupby(key_columns, as_index=False, group_keys=False)
    return grouped.agg(aggregations)

# NOTE(review): this rebinds train_df from the merged per-transaction frame to
# the aggregated one, so the cell is not idempotent -- re-running it without
# first re-running the merge_all cell passes already-aggregated data back
# into create_stuff.
train_df = create_stuff(train_df)


# Preview the first aggregated rows.
train_df.head(5)
# train_df.dtypes

Unnamed: 0_level_0,loan_id,account_id,status,loan_date,account_date,loan_amount,duration,payments,gender,birth_number,...,balance,balance,trans_type,trans_type,trans_type,trans_type,trans_type,trans_type,trans_type,trans_type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,std,last,count_withdrawal,count_credit,mean_withdrawal,mean_credit,std_withdrawal,std_credit,cov_withdrawal,cov_credit
0,4959,2,1,940105,930226,80952,24,3373,1,450204,...,12061.705682,27855.2,29,22,0.537037,0.407407,0.498626,0.491352,0.253319357092942,0.2459818308874912
1,4961,19,-1,960429,950407,30276,12,2523,0,390423,...,15039.248405,15854.0,32,46,0.4,0.575,0.489898,0.494343,0.2430379746835443,0.2474683544303797
2,4973,67,1,960502,941019,165960,24,6915,0,440613,...,20955.646998,23703.8,84,37,0.672,0.296,0.469485,0.456491,0.2221935483870967,0.2100645161290322
3,4996,132,1,961106,960511,88440,12,7370,0,450703,...,21638.25887,78907.6,14,16,0.451613,0.516129,0.497653,0.49974,0.2559139784946235,0.2580645161290323
4,5002,173,1,940531,931126,104808,12,8734,1,391130,...,11517.175248,28015.4,18,12,0.6,0.4,0.489898,0.489898,0.2482758620689656,0.2482758620689656


In [23]:
# Write the aggregated training frame to data/train.csv, semicolon-separated
# and without the row index.
# NOTE(review): create_stuff leaves a two-level column index, so the file
# presumably gets two header rows -- confirm that downstream readers expect it.
train_df.to_csv('data/train.csv', index=False, sep=';')