# Data Mining Project - Data Preparation

### Imports and dataset cleaning

In [1]:
import pandas as pd
import data_preparation as dp
from toolz.functoolz import pipe
import numpy as np

In [2]:
# Load the raw tables; every source file is semicolon-separated.
account = pd.read_csv('../data/account.csv', sep=';')
card_dev = pd.read_csv('../data/card_dev.csv', sep=';')
client = pd.read_csv('../data/client.csv', sep=';')
disp = pd.read_csv('../data/disp.csv', sep=';')
district = pd.read_csv('../data/district.csv', sep=';')
loan_dev = pd.read_csv('../data/loan_dev.csv', sep=';')
# The `bank` column is read as str explicitly instead of relying on dtype inference.
trans_dev = pd.read_csv('../data/trans_dev.csv', sep=';', dtype={'bank': str})

In [3]:
# Run the project cleaning helpers; each name is rebound to whatever its helper returns.
# NOTE(review): re-running this cell feeds the already-cleaned frames back into the
# helpers — confirm they are idempotent, or re-run the loading cell first.
district = dp.clean_district(district)
trans_dev = dp.clean_transactions(trans_dev)
disp = dp.clean_disp(disp)

In [4]:
# Display the disposition table as returned by dp.clean_disp.
disp

Unnamed: 0,disp_id,client_id,account_id,has_disponent
0,1,1,1,0
1,2,2,2,1
2,4,4,3,1
3,6,6,4,0
4,7,7,5,0
...,...,...,...,...
4495,13623,13931,11333,0
4496,13647,13955,11349,1
4497,13660,13968,11359,0
4498,13663,13971,11362,0


**ASSEMBLE DF**

In [9]:
# The right join on loan_dev makes the loans drive the row set; the per-account
# transaction features, then dispositions, then client details are attached to them.
df = (
    trans_dev
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
    .merge(client, on='client_id', suffixes=('', '_client'), how='left')
)
df

Unnamed: 0,account_id,avg_amount_credit,avg_amount_withdrawal,avg_amount_total,min_amount,max_amount,credit_ratio,num_trans,avg_balance,min_balance,...,date,amount,duration,payments,status,disp_id,client_id,type,birth_number,district_id
0,1787,5025.000000,0.000000,5025.000000,1100.0,9900.0,1.000000,4,12250.000000,1100.0,...,930705,96396,12,8033,-1,2166,2166,OWNER,475722,30
1,1801,13523.158824,-8884.240000,1411.051351,-54300.0,36574.0,0.459459,37,52083.859459,700.0,...,930711,165960,36,4610,1,2181,2181,OWNER,680722,46
2,9188,5009.733333,-6097.000000,844.708333,-14800.0,19065.0,0.625000,24,30060.954167,800.0,...,930728,127080,60,2118,1,11006,11314,OWNER,360602,45
3,1843,9254.600000,-7168.100000,1371.704000,-15600.0,26448.0,0.520000,25,41297.480000,1000.0,...,930803,105804,36,2939,1,2235,2235,OWNER,405420,14
4,11013,21255.930769,-16801.000000,1522.707407,-36700.0,63366.0,0.481481,27,57188.211111,600.0,...,930906,274740,60,4579,1,13231,13539,OWNER,780907,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,9030,6448.582857,-3830.458824,352.872093,-36960.0,26388.0,0.406977,172,44197.509884,200.0,...,961212,155616,48,3242,1,10814,11122,DISPONENT,790610,72
399,3189,13417.557143,-10197.180645,1009.813559,-52600.0,44352.0,0.474576,59,55230.444068,800.0,...,961215,222180,60,3703,-1,3855,3855,OWNER,390320,29
400,8972,8544.930000,-6974.431579,984.215385,-22100.0,31636.5,0.512821,39,41994.907692,800.0,...,961221,45024,48,938,1,10742,11050,OWNER,575504,70
401,10963,16554.986275,-10992.139726,337.726613,-50800.0,49887.0,0.411290,124,56646.516129,1100.0,...,961225,115812,36,3217,1,13172,13480,OWNER,530601,16


In [10]:
# Pass every row through dp.calculate_age_loan (adds the age_loan and gender columns).
df = df.apply(dp.calculate_age_loan, axis=1)

# Attach the district demographics: district_id on the left matches `code` in district.
df = df.merge(district, how='left', left_on='district_id', right_on='code')
df

Unnamed: 0,account_id,avg_amount_credit,avg_amount_withdrawal,avg_amount_total,min_amount,max_amount,credit_ratio,num_trans,avg_balance,min_balance,...,no._of_municipalities_with_inhabitants_2000-9999,no._of_municipalities_with_inhabitants_>10000,no._of_cities,ratio_of_urban_inhabitants,average_salary,unemployment_rate,avg_commited_crimes,ratio_entrepreneurs,criminality_growth,unemployment_growth
0,1787,5025.000000,0.000000,5025.000000,1100.0,9900.0,1.000000,4,12250.000000,1100.0,...,8,2,10,0.818,9650,3.525,0.030529,0.100,-0.001909,0.29
1,1801,13523.158824,-8884.240000,1411.051351,-54300.0,36574.0,0.459459,37,52083.859459,700.0,...,7,3,10,0.735,8369,2.050,0.024275,0.117,-0.002094,0.52
2,9188,5009.733333,-6097.000000,844.708333,-14800.0,19065.0,0.625000,24,30060.954167,800.0,...,6,1,5,0.535,8390,2.585,0.026965,0.132,0.000539,0.61
3,1843,9254.600000,-7168.100000,1371.704000,-15600.0,26448.0,0.520000,25,41297.480000,1000.0,...,10,1,9,0.748,10045,1.565,0.036297,0.135,-0.001739,0.29
4,11013,21255.930769,-16801.000000,1522.707407,-36700.0,63366.0,0.481481,27,57188.211111,600.0,...,5,1,5,0.505,8288,4.155,0.017466,0.110,-0.001179,0.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,9030,6448.582857,-3830.458824,352.872093,-36960.0,26388.0,0.406977,172,44197.509884,200.0,...,7,4,4,0.626,8994,4.295,0.041747,0.110,-0.002052,0.99
399,3189,13417.557143,-10197.180645,1009.813559,-52600.0,44352.0,0.474576,59,55230.444068,800.0,...,5,1,6,0.556,8843,3.210,0.018659,0.113,0.001531,0.78
400,8972,8544.930000,-6974.431579,984.215385,-22100.0,31636.5,0.512821,39,41994.907692,800.0,...,8,5,7,0.899,10177,7.190,0.035016,0.081,0.000806,1.12
401,10963,16554.986275,-10992.139726,337.726613,-50800.0,49887.0,0.411290,124,56646.516129,1100.0,...,10,1,8,0.569,8427,1.330,0.020158,0.107,0.000415,0.42


In [11]:
# Apply the dp post-processing helpers to df in the listed order; toolz `pipe`
# feeds the result of each helper into the next one.
# NOTE(review): the output recorded for this cell is `KeyError: 'average salary'`,
# while the merged frame displayed by the previous cell exposes the column as
# `average_salary`. Presumably one of these dp helpers still looks up the
# space-separated name — confirm and align it with the cleaned district columns.
df = pipe(df,
        dp.calculate_number_of_disponents,
        dp.calculate_diff_salary_loan,
        dp.drop_duplicated_accounts,
        dp.drop_irrelevant_columns_from_df,
        dp.drop_demographic_columns_from_df)
df

KeyError: 'average salary'

In [10]:
# Convert the non-numerical columns to numerical values, one column at a time.
for categorical_column in ('frequency', 'type', 'gender'):
    df = dp.convert_n_numerical_to_numerical(df, categorical_column)

In [5]:
# Write the district and transaction frames to ../data/clean without the index column.
for cleaned_frame, destination in (
    (district, '../data/clean/district.csv'),
    (trans_dev, '../data/clean/trans_dev.csv'),
):
    cleaned_frame.to_csv(destination, index=False)

**ASSEMBLE MAIN_DF**

In [None]:
# Join account, loan, disposition and client. The right join on loan_dev keeps
# exactly the accounts that have a loan.
main_df = (
    account
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
    .merge(client, on='client_id', suffixes=('', '_client'), how='left')
    # The client merge leaves two district columns: the left side's `district_id`
    # (presumably the account's — confirm) and the client's `district_id_client`.
    # Only the client's is used below, so drop the other one. A non-mutating
    # drop keeps the cell safe to re-run.
    .drop(columns='district_id')
)

# Create the age_loan and gender columns (row-wise helper).
main_df = main_df.apply(dp.calculate_age_loan, axis=1)

# Join the district demographics on the client's district.
main_df = main_df.merge(district, left_on='district_id_client', right_on='code', how='left')

# Join the credit card data through the disposition id.
main_df = main_df.merge(card_dev, on='disp_id', suffixes=('', '_card'), how='left')

In [None]:
# Ordered post-processing steps for main_df; `pipe` threads the frame through
# them from first to last.
main_df_steps = (
    dp.calculate_number_of_disponents,
    dp.calculate_diff_salary_loan,
    dp.drop_duplicated_accounts,
    dp.drop_irrelevant_columns_from_main_df,
    dp.rename_main_df_columns,
)
main_df = pipe(main_df, *main_df_steps)

In [None]:
# Convert the non-numerical columns of main_df to numerical values.
for categorical_column in ('frequency', 'type'):
    main_df = dp.convert_n_numerical_to_numerical(main_df, categorical_column)

**ASSEMBLE TRANSACTIONS_DF**

In [None]:
# Restrict the accounts to those with a loan (right join), then attach every
# transaction of each of those accounts (left join).
transactions_df = (
    account
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(trans_dev, on='account_id', suffixes=('', '_transaction'), how='left')
)

In [None]:
# Thread transactions_df through the dp helpers in order, then convert its
# non-numerical columns to numerical values.
transactions_df = pipe(
    transactions_df,
    dp.calculate_transaction_count,
    dp.calculate_credit_debit_ratio,
    dp.drop_irrelevant_columns_from_transactions_df,
    dp.rename_transactions_df_columns,
)
for categorical_column in ('frequency', 'operation', 'type', 'bank'):
    transactions_df = dp.convert_n_numerical_to_numerical(transactions_df, categorical_column)

**ASSEMBLE TESTING DFS**

In [None]:
# Test variant of main_df produced by dp.drop_outliers for the 'amount' column.
# NOTE(review): the outlier rule lives in dp and is not visible here — document it once confirmed.
amount_outliers_df = dp.drop_outliers(main_df, 'amount')


In [None]:
# Test variant of main_df with the demographic columns removed.
# NOTE(review): the helper name refers to transactions_df but it is applied to main_df,
# and the df pipeline uses dp.drop_demographic_columns_from_df instead — confirm this
# helper exists and is the intended one.
demographic_df = dp.drop_demographic_columns_from_transactions_df(main_df)

In [None]:
# Test variant of main_df keeping only rows whose age_loan is strictly greater than 18.
# NOTE(review): rows with age_loan exactly 18 are excluded — confirm that is intended.
age_loan_outliers_df = main_df.loc[main_df['age_loan'].gt(18)]

In [None]:
# Test variant of main_df produced by dp.drop_irrelevant_columns for the 'duration' column.
irrelevant_columns_df = dp.drop_irrelevant_columns(main_df, ['duration'])

In [None]:
# Export every assembled frame to ../test_ml as <name>.csv, without the index column.
# demographic_df is built in the testing section like the other variants but was
# never written out; it is exported here so that no variant is silently lost.
frames_to_export = {
    'main_df': main_df,
    'transactions_df': transactions_df,
    'amount_outliers_df': amount_outliers_df,
    'demographic_df': demographic_df,
    'age_loan_outliers_df': age_loan_outliers_df,
    'irrelevant_columns_df': irrelevant_columns_df,
}
for frame_name, frame in frames_to_export.items():
    frame.to_csv(f'../test_ml/{frame_name}.csv', index=False)

- assessment of dimensions of data quality
- (cleaning): redundancy
- (cleaning): missing data
- (cleaning): outliers
- data transformation for compatibility with algorithms
- feature engineering from tabular data
- sampling for domain-specific purposes
- sampling for development
- imbalanced data
- feature selection

##### **Redundancy**

**TODO:** Explain what has been done regarding redundant data.

##### **Missing Data**

**TODO:** Explain what has been done regarding missing data.

##### **Outliers**

**TODO:** Explain what has been done regarding outliers.

##### **Other data preparation operations**

**TODO:** Explain which additional data preparation operations have been performed.