# Data Mining Project - Data Preparation

### Imports and Datasets

In [None]:
import pandas as pd
import data_preparation as dp
from toolz.functoolz import pipe

In [105]:
# Load the raw tables; every source file uses ';' as its field separator.
account = pd.read_csv('../data/account.csv', sep=';')
card_dev = pd.read_csv('../data/card_dev.csv', sep=';')
client = pd.read_csv('../data/client.csv', sep=';')
disp = pd.read_csv('../data/disp.csv', sep=';')
district = pd.read_csv('../data/district.csv', sep=';')
loan_dev = pd.read_csv('../data/loan_dev.csv', sep=';')
# The 'bank' column is forced to text instead of letting pandas infer its dtype.
trans_dev = pd.read_csv(
    '../data/trans_dev.csv',
    sep=';',
    dtype={'bank': str},
)

In [106]:
# Thread the district table through both data_preparation helpers, in order.
# Going by their names they derive average unemployment-rate and
# committed-crime figures; see data_preparation for the exact columns.
district = pipe(
    district,
    dp.calculate_average_unemployment_rate,
    dp.calculate_average_commited_crimes,
)

**ASSEMBLE DF**

In [120]:
# Fold the 'withdrawal in cash' transaction type into plain 'withdrawal'
# so both are counted and averaged together below.
trans_dev['type'] = trans_dev['type'].replace({'withdrawal in cash': 'withdrawal'})

In [121]:
# Number of transactions per (account, transaction type) pair.
nr_credits_per_account = (
    trans_dev
    .groupby(['account_id', 'type'])
    .size()
    .reset_index(name='count')
)

# Split the counts into the two transaction types of interest.
credit = nr_credits_per_account.loc[lambda counts: counts['type'] == 'credit']
withdrawal = nr_credits_per_account.loc[lambda counts: counts['type'] == 'withdrawal']

# Default (inner) merges: an account is kept only if it has at least one
# credit row and at least one withdrawal row.
df = (
    account
    .merge(credit[['account_id', 'count']], on='account_id')
    .rename(columns={'count': 'credit'})
    .merge(withdrawal[['account_id', 'count']], on='account_id')
    .rename(columns={'count': 'withdrawal'})
)

In [122]:
# Mean transaction amount per (account, transaction type) pair.
amounts = (
    trans_dev
    .groupby(['account_id', 'type'], as_index=False)['amount']
    .mean()
)

# Split the means into the two transaction types of interest.
credit_amount_mean = amounts.loc[lambda means: means['type'] == 'credit']
withdrawal_amount_mean = amounts.loc[lambda means: means['type'] == 'withdrawal']

# Default (inner) merges, each followed by a rename of the generic 'amount'
# column to a type-specific name.
df = (
    df
    .merge(credit_amount_mean[['account_id', 'amount']], on='account_id')
    .rename(columns={'amount': 'credit_amount_avg'})
    .merge(withdrawal_amount_mean[['account_id', 'amount']], on='account_id')
    .rename(columns={'amount': 'withdrawal_amount_avg'})
)


In [123]:
# Show the per-account frame built so far: the account columns plus the
# credit/withdrawal counts and average amounts.
df

Unnamed: 0,account_id,district_id,frequency,date,credit,withdrawal,credit_amount_avg,withdrawal_amount_avg
0,576,55,monthly issuance,930101,97,132,3126.310309,2005.684848
1,3818,74,monthly issuance,930101,101,265,9136.695050,3297.292075
2,704,55,monthly issuance,930101,98,222,9388.220408,3960.704505
3,2378,16,monthly issuance,930101,132,175,16213.851515,11707.206857
4,2632,24,monthly issuance,930102,98,270,7700.421429,2696.123704
...,...,...,...,...,...,...,...,...
3089,3799,74,monthly issuance,961129,3,1,8335.333333,300.000000
3090,1435,1,monthly issuance,961201,3,1,11358.966667,6200.000000
3091,3507,9,monthly issuance,961201,6,2,23283.800000,32850.000000
3092,712,72,monthly issuance,961201,4,1,22363.150000,34000.000000


In [125]:
# Attach loan data to the accounts already in df. Left join: accounts without
# a loan are kept and get NaN in the loan columns.
df = df.merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='left')

# Zero-fill all four per-account transaction aggregates together. Previously
# only three were filled and withdrawal_amount_avg was left out, so a missing
# value there would have stayed NaN while its sibling columns became 0.
df = df.fillna({
    'withdrawal': 0,
    'credit': 0,
    'credit_amount_avg': 0,
    'withdrawal_amount_avg': 0,
})

# Attach dispositions (by account) and then the clients behind them.
df = df.merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
df = df.merge(client, on='client_id', suffixes=('', '_client'), how='left')
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['withdrawal'].fillna(0,inplace=True)


Unnamed: 0,account_id,district_id,frequency,date,credit,withdrawal,credit_amount_avg,withdrawal_amount_avg,loan_id,date_loan,...,date_loan.1,amount_loan,duration_loan,payments_loan,status_loan,disp_id,client_id,type,birth_number,district_id_client
0,1787,,,,,,,,5314,930705,...,930705,96396,12,8033,-1,2166,2166,OWNER,475722,30
1,1801,46.0,monthly issuance,930213.0,17.0,20.0,13523.158824,8884.240000,5316,930711,...,930711,165960,36,4610,1,2181,2181,OWNER,680722,46
2,9188,45.0,monthly issuance,930208.0,15.0,9.0,5009.733333,6097.000000,6863,930728,...,930728,127080,60,2118,1,11006,11314,OWNER,360602,45
3,1843,12.0,monthly issuance,930130.0,13.0,12.0,9254.600000,7168.100000,5325,930803,...,930803,105804,36,2939,1,2235,2235,OWNER,405420,14
4,11013,1.0,weekly issuance,930214.0,13.0,14.0,21255.930769,16801.000000,7240,930906,...,930906,274740,60,4579,1,13231,13539,OWNER,780907,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,9030,72.0,monthly issuance,950121.0,70.0,102.0,6448.582857,3830.458824,6818,961212,...,961212,155616,48,3242,1,10814,11122,DISPONENT,790610,72
399,3189,29.0,monthly issuance,951129.0,28.0,31.0,13417.557143,10197.180645,5625,961215,...,961215,222180,60,3703,-1,3855,3855,OWNER,390320,29
400,8972,70.0,monthly issuance,960521.0,20.0,19.0,8544.930000,6974.431579,6805,961221,...,961221,45024,48,938,1,10742,11050,OWNER,575504,70
401,10963,16.0,monthly issuance,950520.0,51.0,73.0,16554.986275,10992.139726,7233,961225,...,961225,115812,36,3217,1,13172,13480,OWNER,530601,16


In [None]:
# Display df once more; nothing has changed it since the previous cell.
df

**ASSEMBLE MAIN_DF**

In [None]:
# Join account, loan, disposition and client. The loan table drives the row
# set (right join on account_id), so every loan row is kept.
main_df = (
    account
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
    .merge(client, on='client_id', suffixes=('', '_client'), how='left')
    # Drop the account's district_id; the district join below uses
    # district_id_client instead.
    .drop(columns='district_id')
    # Create age_at_loan and gender column (row-wise data_preparation helper).
    .apply(dp.calculate_age_loan, axis=1)
)

# Join demographic data: the district key is read with a trailing space in its
# header ('code '), so normalise the name before merging on it.
district = district.rename(columns={'code ': 'code'})
main_df = main_df.merge(
    district,
    left_on='district_id_client',
    right_on='code',
    how='left',
)

# Join credit card data through the disposition id.
main_df = main_df.merge(card_dev, on='disp_id', suffixes=('', '_card'), how='left')

In [None]:
# data_preparation steps for main_df, applied left to right; each one takes
# the frame returned by the previous step.
main_df_steps = (
    dp.calculate_number_of_disponents,
    dp.calculate_diff_salary_loan,
    dp.drop_duplicated_accounts,
    dp.drop_irrelevant_columns_from_main_df,
    dp.rename_main_df_columns,
)
main_df = pipe(main_df, *main_df_steps)

In [None]:
# Convert the non-numerical columns of main_df to numerical ones, one column
# at a time and in this order.
for categorical_column in ('frequency', 'type'):
    main_df = dp.convert_n_numerical_to_numerical(main_df, categorical_column)

**ASSEMBLE TRANSACTIONS_DF**

In [None]:
# Accounts joined to their loans (right join: every loan row is kept), then
# to every transaction recorded for the loan's account (left join).
transactions_df = (
    account
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(trans_dev, on='account_id', suffixes=('', '_transaction'), how='left')
)

In [None]:
# data_preparation steps for transactions_df, applied left to right.
transactions_df_steps = (
    dp.calculate_transaction_count,
    dp.calculate_credit_debit_ratio,
    dp.drop_irrelevant_columns_from_transactions_df,
    dp.rename_transactions_df_columns,
)
transactions_df = pipe(transactions_df, *transactions_df_steps)

# Convert the non-numerical columns to numerical ones, one column at a time
# and in this order.
for categorical_column in ('frequency', 'operation', 'type', 'bank'):
    transactions_df = dp.convert_n_numerical_to_numerical(transactions_df, categorical_column)

**ASSEMBLE TESTING DFS**

In [None]:
# Test variant of main_df produced by dp.drop_outliers for the 'amount' column.
# NOTE(review): the outlier criterion lives in data_preparation — confirm there.
amount_outliers_df = dp.drop_outliers(main_df, 'amount')


In [None]:
# Test variant of main_df with the demographic columns removed.
# NOTE(review): the helper's name refers to transactions_df but it is given
# main_df here — confirm this is the intended input.
demographic_df = dp.drop_demographic_columns_from_transactions_df(main_df)

In [None]:
# Test variant of main_df keeping only rows whose age_loan is strictly above 18.
age_loan_outliers_df = main_df.loc[lambda frame: frame['age_loan'] > 18]

In [None]:
# Test variant of main_df produced by dp.drop_irrelevant_columns with
# ['duration'] as the column list.
irrelevant_columns_df = dp.drop_irrelevant_columns(main_df, ['duration'])

In [None]:
# Persist every assembled frame for the downstream ML experiments. The row
# index is not written out.
main_df.to_csv('../test_ml/main_df.csv', index=False)
transactions_df.to_csv('../test_ml/transactions_df.csv', index=False)
amount_outliers_df.to_csv('../test_ml/amount_outliers_df.csv', index=False)
# demographic_df is built with the other test variants but was the only one
# never written out; export it alongside them.
demographic_df.to_csv('../test_ml/demographic_df.csv', index=False)
age_loan_outliers_df.to_csv('../test_ml/age_loan_outliers_df.csv', index=False)
irrelevant_columns_df.to_csv('../test_ml/irrelevant_columns_df.csv', index=False)

- assessment of dimensions of data quality
- (cleaning): redundancy
- (cleaning): missing data
- (cleaning): outliers
- data transformation for compatibility with algorithms
- feature engineering from tabular data
- sampling for domain-specific purposes
- sampling for development
- imbalanced data
- feature selection

##### **Redundancy**

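**TODO:** Explain what has been done regarding redundant data (e.g. the `dp.drop_duplicated_accounts` and `dp.drop_irrelevant_columns_from_main_df` steps applied to `main_df`).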

##### **Missing Data**

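**TODO:** Explain what has been done regarding missing data (e.g. the zero-filling of the per-account transaction aggregates and the left joins that leave missing values for unmatched rows).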

##### **Outliers**

**TODO:** Explain what has been done regarding outliers (e.g. `dp.drop_outliers` on `amount` and the `age_loan > 18` filter).

##### **Other data preparation operations**

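**TODO:** Explain which additional data preparation operations were carried out (e.g. the conversion of non-numerical columns to numerical ones).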