In [77]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [78]:
def encode_df(df):
    """Label-encode every object-dtype column of ``df`` in place.

    One ``LabelEncoder`` instance is shared and re-fitted for each column,
    so the encoding of one column does not depend on any other column.

    Args:
        df: DataFrame whose object-dtype columns are overwritten with their
            encoded labels.

    Returns:
        The same ``df`` object that was passed in (it is mutated, not copied).
    """
    encoder = LabelEncoder()
    # The dtypes are captured before any column is overwritten.
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
    for name in object_columns:
        df[name] = encoder.fit_transform(df[name])
    return df

def open_csv(filename, data_dir='data'):
    """Read ``<data_dir>/<filename>.csv`` as a semicolon-separated DataFrame.

    Args:
        filename: Base name of the CSV file, without directory or extension.
        data_dir: Directory that contains the CSV files. Defaults to
            ``'data'``, the location that used to be hard-coded.

    Returns:
        The DataFrame produced by ``pandas.read_csv``. Columns are returned
        as read; ``encode_df`` is not applied.
    """
    csv_path = '%s/%s.csv' % (data_dir, filename)
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk.
    return pd.read_csv(csv_path, sep=';', low_memory=False)

def get_account_df():
    """Load the ``account`` table and prefix its ``date``/``district_id`` columns.

    Returns:
        The frame from ``open_csv('account')`` with ``date`` renamed to
        ``account_date`` and ``district_id`` to ``account_district_id``.
    """
    column_names = {'date': 'account_date', 'district_id': 'account_district_id'}
    accounts = open_csv('account')
    return accounts.rename(columns=column_names)

def get_card_test_df():
    """Load the ``card_test`` table with ``type`` renamed to ``card_type``."""
    column_names = {'type': 'card_type'}
    cards = open_csv('card_test')
    return cards.rename(columns=column_names)

def get_card_train_df():
    """Load the ``card_train`` table with ``type`` renamed to ``card_type``."""
    column_names = {'type': 'card_type'}
    cards = open_csv('card_train')
    return cards.rename(columns=column_names)

def get_client_df():
    """Load the ``client`` table with ``district_id`` renamed to ``client_district_id``."""
    column_names = {'district_id': 'client_district_id'}
    clients = open_csv('client')
    return clients.rename(columns=column_names)

def get_disp_df():
    """Load the ``disp`` table without ``disp_id`` and with ``type`` renamed.

    Returns:
        The frame from ``open_csv('disp')`` after dropping the ``disp_id``
        column and renaming ``type`` to ``disp_type``.
    """
    dispositions = open_csv('disp')
    without_id = dispositions.drop(columns=['disp_id'])
    return without_id.rename(columns={'type': 'disp_type'})

def get_district_df():
    """Load the ``district`` table exactly as ``open_csv`` returns it."""
    districts = open_csv('district')
    return districts

def get_loan_test_df():
    """Load the ``loan_test`` table with ``date``/``amount`` prefixed by ``loan_``."""
    column_names = {'date': 'loan_date', 'amount': 'loan_amount'}
    loans = open_csv('loan_test')
    return loans.rename(columns=column_names)

def get_loan_train_df():
    """Load the ``loan_train`` table with ``date``/``amount`` prefixed by ``loan_``."""
    column_names = {'date': 'loan_date', 'amount': 'loan_amount'}
    loans = open_csv('loan_train')
    return loans.rename(columns=column_names)

def get_trans_test_df():
    """Load the ``trans_test`` table with ``type``/``date``/``amount`` prefixed by ``trans_``."""
    column_names = {
        'type': 'trans_type',
        'date': 'trans_date',
        'amount': 'trans_amount',
    }
    transactions = open_csv('trans_test')
    return transactions.rename(columns=column_names)

def get_trans_train_df():
    """Load the ``trans_train`` table with ``type``/``date``/``amount`` prefixed by ``trans_``."""
    column_names = {
        'type': 'trans_type',
        'date': 'trans_date',
        'amount': 'trans_amount',
    }
    transactions = open_csv('trans_train')
    return transactions.rename(columns=column_names)


In [79]:
def merge_all(op='test'):
    """Print the ``OWNER`` dispositions with each account's disposition count.

    The account, card, client, district, loan and transaction tables are
    loaded too, but nothing in this function uses them; only the disposition
    table contributes to the printed frame.

    Args:
        op: ``'test'`` loads the ``*_test`` card/loan/transaction tables; any
            other value loads the ``*_train`` tables.

    Returns:
        None. The resulting frame is printed, not returned.
    """
    if op == 'test':
        load_card, load_loan, load_trans = get_card_test_df, get_loan_test_df, get_trans_test_df
    else:
        load_card, load_loan, load_trans = get_card_train_df, get_loan_train_df, get_trans_train_df

    # Tables are loaded in this order; only disp_df is used further down.
    account_df = get_account_df()
    card_df = load_card()
    client_df = get_client_df()
    disp_df = get_disp_df()
    district_df = get_district_df()
    loan_df = load_loan()
    trans_df = load_trans()

    # Count the dispositions attached to each account.
    per_account = disp_df.groupby("account_id", as_index=False, group_keys=False).agg({"disp_type": ["count"]})
    # Flatten the two-level column labels: ('disp_type', 'count') -> 'disp_type_count',
    # while labels with an empty second level keep just the first level.
    flat_labels = []
    for top, sub in per_account.columns:
        suffix = '_%s' % sub if sub else ''
        flat_labels.append('%s%s' % (top, suffix))
    per_account.columns = flat_labels

    # Attach the count to every disposition, then keep only the owner rows.
    counted = pd.merge(disp_df, per_account, on='account_id')
    is_owner = counted['disp_type'] == 'OWNER'
    owners = counted[is_owner].drop(columns=['disp_type'])
    print(owners)


# Run on the test split; merge_all prints the resulting frame itself.
merge_all(op='test')

      client_id  account_id disp_type  disp_type_count
0             1           1     OWNER                1
1             2           2     OWNER                2
3             4           3     OWNER                2
5             6           4     OWNER                1
6             7           5     OWNER                1
...         ...         ...       ...              ...
5363      13931       11333     OWNER                1
5364      13955       11349     OWNER                2
5366      13968       11359     OWNER                1
5367      13971       11362     OWNER                1
5368      13998       11382     OWNER                1

[4500 rows x 4 columns]
