# Data Mining Project - Data Preparation

### Imports

In [1]:
import pandas as pd
import data_preparation as dp

### Assemble Train DF

In [2]:
# Load the raw tables needed to assemble the training frame.
# All source files are ';'-separated.
# The competition files are intentionally not read here: nothing in the
# training section uses them, and the competition section reads them itself.
account = pd.read_csv('../data/account.csv', delimiter=';')
disp = pd.read_csv('../data/disp.csv', delimiter=';')
card_dev = pd.read_csv('../data/card_dev.csv', delimiter=';')
client = pd.read_csv('../data/client.csv', delimiter=';')
district = pd.read_csv('../data/district.csv', delimiter=';')
loan_dev = pd.read_csv('../data/loan_dev.csv', delimiter=';')
# 'bank' is read as a string column instead of letting pandas infer its dtype.
trans_dev = pd.read_csv('../data/trans_dev.csv', delimiter=';', dtype={'bank': str})

In [3]:
# Run each raw table through its cleaning helper from data_preparation.
# The order is kept as-is: clean_cards receives the already-cleaned disp.
account = dp.clean_accounts(account)
disp = dp.clean_disp(disp)
card_dev = dp.clean_cards(card_dev, disp)
client = dp.clean_clients(client)
district = dp.clean_districts(district)
loan_dev = dp.clean_loans(loan_dev)
trans_dev = dp.clean_transactions(trans_dev)

# Cleaned tables, in the order they are handed to dp.merge_dfs.
dfs = [account, disp, card_dev, client, district, loan_dev, trans_dev]

In [4]:
# Merge the cleaned tables, derive the extra features, then tidy the columns.
# The helpers are applied innermost-first: merge -> features -> column cleanup.
df = dp.clean_columns(
    dp.extract_other_features(
        dp.merge_dfs(dfs)
    )
)

In [5]:
# Persist the assembled training frame as CSV; index=False leaves the row index out of the file.
df.to_csv('../data/clean/df-train.csv', index=False)

### Assemble Competition DF

In [None]:
# Re-read the shared tables from disk so this section starts from raw data.
account = pd.read_csv('../data/account.csv', sep=';')
disp = pd.read_csv('../data/disp.csv', sep=';')
client = pd.read_csv('../data/client.csv', sep=';')
district = pd.read_csv('../data/district.csv', sep=';')

# Competition-specific tables; 'bank' is forced to a string column.
card_comp = pd.read_csv('../data/competition/card_comp.csv', sep=';')
loan_comp = pd.read_csv('../data/competition/loan_comp.csv', sep=';')
trans_comp = pd.read_csv('../data/competition/trans_comp.csv', sep=';', dtype={'bank': str})

In [None]:
# Apply the data_preparation cleaning helpers to the competition inputs.
# disp is cleaned before the cards because clean_cards takes it as an argument.
account = dp.clean_accounts(account)
disp = dp.clean_disp(disp)
card_comp = dp.clean_cards(card_comp, disp)
client = dp.clean_clients(client)
district = dp.clean_districts(district)
loan_comp = dp.clean_loans(loan_comp)
trans_comp = dp.clean_transactions(trans_comp)

# Cleaned tables, in the order they are handed to dp.merge_dfs.
dfs = [account, disp, card_comp, client, district, loan_comp, trans_comp]

In [None]:
# Build the competition frame under its own name instead of rebinding `df`.
# Later cells read `df`; reusing that name here would make their input depend
# on whether this section was executed, i.e. on cell execution order.
comp_df = dp.merge_dfs(dfs)
comp_df = dp.extract_other_features(comp_df)
comp_df = dp.clean_columns(comp_df)

# Persist the competition frame as CSV, without the row index.
comp_df.to_csv('../data/clean/df-comp.csv', index=False)

### Assemble Testing DFs

In [None]:
# Variant of `df` produced by dp.drop_outliers for the 'amount' column
# (the helper lives in data_preparation and is not shown in this notebook).
# NOTE(review): this reads the notebook-level `df` — confirm which dataset it holds when this cell runs.
amount_outliers_df = dp.drop_outliers(df, 'amount')


In [None]:
# Variant of `df` produced by dp.drop_demographic_columns_from_transactions_df
# (helper defined in data_preparation; presumably removes demographic columns — confirm there).
demographic_df = dp.drop_demographic_columns_from_transactions_df(df)

In [None]:
# Keep only the rows whose 'age_loan' value is strictly greater than 18.
age_loan_outliers_df = df.loc[df['age_loan'].gt(18)]

In [None]:
# Variant of `df` produced by dp.drop_irrelevant_columns, called with the column list ['duration']
# (helper defined in data_preparation).
irrelevant_columns_df = dp.drop_irrelevant_columns(df, ['duration'])

In [None]:
# Write each test variant to ../test_ml/ as CSV, without the row index.
# NOTE(review): demographic_df (built in an earlier cell) is not exported here — confirm that is intended.
for frame, csv_path in (
    (amount_outliers_df, '../test_ml/amount_outliers_df.csv'),
    (age_loan_outliers_df, '../test_ml/age_loan_outliers_df.csv'),
    (irrelevant_columns_df, '../test_ml/irrelevant_columns_df.csv'),
):
    frame.to_csv(csv_path, index=False)

- assessment of dimensions of data quality
- (cleaning): redundancy
- (cleaning): missing data
- (cleaning): outliers
- data transformation for compatibility with algorithms
- feature engineering from tabular data
- sampling for domain-specific purposes
- sampling for development
- imbalanced data
- feature selection

##### **Redundancy**

explain what has been done regarding redundant data

##### **Missing Data**

explain what has been done regarding missing data

##### **Outliers**

explain what has been done regarding outliers

##### **Other data preparation operations**

explain what has been done additionally