# Data Mining Project - Data Preparation

### Imports

In [1]:
import pandas as pd
import utils.clean_utils as cu

### Assemble Train DF

In [2]:
# Load the raw, semicolon-delimited tables used to build the training set.
# The six tables below need no special parsing options, so they are read in
# one pass, in the same order as the names on the left-hand side.
account, disp, client, district, card_dev, loan_dev = (
    pd.read_csv(csv_path, delimiter=';')
    for csv_path in (
        '../data/account.csv',
        '../data/disp.csv',
        '../data/client.csv',
        '../data/district.csv',
        '../data/card_dev.csv',
        '../data/loan_dev.csv',
    )
)
# The transactions table forces the 'bank' column to be parsed as strings.
trans_dev = pd.read_csv(
    '../data/trans_dev.csv',
    delimiter=';',
    dtype={'bank': str},
)

In [3]:
# Run every raw training table through its cleaning helper from clean_utils.
account = cu.clean_accounts(account)
disp = cu.clean_disp(disp)
# The remaining helpers are called in the original order; clean_cards
# receives the disposition table that was already cleaned just above.
client, district, card_dev, loan_dev, trans_dev = (
    cu.clean_clients(client),
    cu.clean_districts(district),
    cu.clean_cards(card_dev, disp),
    cu.clean_loans(loan_dev),
    cu.clean_transactions(trans_dev, op=False, k_symbol=False),
)

# Cleaned tables, in the order they are handed to cu.merge_dfs.
dfs = [account, disp, client, district, card_dev, loan_dev, trans_dev]

In [4]:
# Build the training frame. Read inside-out: merge the cleaned tables,
# extract the additional features, clean up the columns, and finally
# transform the loan status column.
df = cu.transform_status(
    cu.clean_columns(
        cu.extract_other_features(
            cu.merge_dfs(dfs)
        )
    )
)

In [5]:
# Persist the assembled training frame. The index is written as the first
# column (pandas' default), stated explicitly so readers of the CSV know to
# expect it.
df.to_csv('../data/clean/df-train.csv', index=True)

### Assemble Other Training DFs

In [6]:
# Variant of the training frame with outliers on the 'amount' column removed
# by the clean_utils helper.
amount_outliers_df = cu.drop_outliers(df, 'amount')

# Variant restricted to loans taken by adults. The comparison is inclusive so
# that borrowers who were exactly 18 at loan time are kept: this subset is
# saved as 'df-adult_loans.csv', and a strict '> 18' would silently drop them.
# NOTE(review): if 'age_at_loan' is fractional this change is effectively a
# no-op; confirm the intended cutoff against extract_other_features.
age_loan_outliers_df = df[df['age_at_loan'] >= 18]

In [7]:
# Persist both derived training subsets next to the main training CSV.
# The index is written as the first column (pandas' default), stated
# explicitly here.
amount_outliers_df.to_csv(
    '../data/clean/df-amount_outliers.csv',
    index=True,
)
age_loan_outliers_df.to_csv(
    '../data/clean/df-adult_loans.csv',
    index=True,
)

### Assemble Competition DF

In [8]:
# Reload the shared raw tables from disk (the earlier variables now hold
# cleaned data) together with the competition-specific card and loan tables.
# All are semicolon-delimited and read in the order of the names on the left.
account, disp, client, district, card_comp, loan_comp = (
    pd.read_csv(csv_path, delimiter=';')
    for csv_path in (
        '../data/account.csv',
        '../data/disp.csv',
        '../data/client.csv',
        '../data/district.csv',
        '../data/competition/card_comp.csv',
        '../data/competition/loan_comp.csv',
    )
)
# The transactions table forces the 'bank' column to be parsed as strings.
trans_comp = pd.read_csv(
    '../data/competition/trans_comp.csv',
    delimiter=';',
    dtype={'bank': str},
)

In [9]:
# Run every raw competition table through its cleaning helper from
# clean_utils, mirroring the training pipeline step for step.
account = cu.clean_accounts(account)
disp = cu.clean_disp(disp)
client = cu.clean_clients(client)
district = cu.clean_districts(district)
# clean_cards receives the disposition table that was already cleaned above.
card_comp = cu.clean_cards(card_comp, disp)
loan_comp = cu.clean_loans(loan_comp)
# Pass the same flags the training transactions are cleaned with
# (op=False, k_symbol=False). Relying on the helper's defaults here could
# clean the competition transactions differently from the training ones and
# leave the two output CSVs with mismatched features.
# NOTE(review): confirm against clean_utils.clean_transactions defaults.
trans_comp = cu.clean_transactions(trans_comp, op=False, k_symbol=False)

# Cleaned tables, in the order they are handed to cu.merge_dfs.
dfs = [
    account,
    disp,
    client,
    district,
    card_comp,
    loan_comp,
    trans_comp,
]

In [10]:
# Build the competition frame. Read inside-out: merge the cleaned tables,
# extract the additional features, then clean up the columns. Unlike the
# training frame, cu.transform_status is not applied here.
df = cu.clean_columns(
    cu.extract_other_features(
        cu.merge_dfs(dfs)
    )
)

In [11]:
# Persist the assembled competition frame. The index is written as the first
# column (pandas' default), stated explicitly so readers of the CSV know to
# expect it.
df.to_csv('../data/clean/df-comp.csv', index=True)