# Data Mining Project - Data Preparation

### Imports and Datasets

In [1]:
import pandas as pd
import data_preparation as dp
from toolz.functoolz import pipe

In [2]:
# Every source table is a semicolon-separated CSV, so the reader options are shared.
CSV_OPTIONS = {'delimiter': ';'}

account = pd.read_csv('../data/account.csv', **CSV_OPTIONS)
card_dev = pd.read_csv('../data/card_dev.csv', **CSV_OPTIONS)
client = pd.read_csv('../data/client.csv', **CSV_OPTIONS)
disp = pd.read_csv('../data/disp.csv', **CSV_OPTIONS)
district = pd.read_csv('../data/district.csv', **CSV_OPTIONS)
loan_dev = pd.read_csv('../data/loan_dev.csv', **CSV_OPTIONS)
# The `bank` column is forced to text instead of letting pandas infer its dtype.
trans_dev = pd.read_csv('../data/trans_dev.csv', dtype={'bank': str}, **CSV_OPTIONS)

In [3]:
# Strip leading/trailing whitespace from the district column names, then let the
# data_preparation helpers derive the averaged unemployment-rate and
# committed-crimes columns.
district = pipe(
    district.rename(columns=str.strip),
    dp.calculate_average_unemployment_rate,
    dp.calculate_average_commited_crimes,
)

# Relabel 'withdrawal in cash' transactions as plain 'withdrawal' so both are
# treated as the same transaction type in the aggregations below.
trans_dev['type'] = trans_dev['type'].replace({'withdrawal in cash': 'withdrawal'})

**ASSEMBLE DF**

In [4]:
# Number of transactions per (account, transaction type) pair.
transaction_counts = (
    trans_dev
    .groupby(['account_id', 'type'])
    .size()
    .reset_index(name='count')
)

credit = transaction_counts[transaction_counts['type'] == 'credit']
withdrawal = transaction_counts[transaction_counts['type'] == 'withdrawal']

# Attach both counts to the account table:
#  - the right join keeps every account, even those without credit transactions
#  - the left join (sort=True) adds the withdrawal count and orders rows by account_id
df = (
    credit[['account_id', 'count']]
    .merge(account, on='account_id', how='right')
    .rename(columns={'count': 'credit'})
    .merge(withdrawal[['account_id', 'count']], on='account_id', how='left', sort=True)
    .rename(columns={'count': 'withdrawal'})
)

In [5]:
# Mean transaction amount per (account, transaction type) pair.
amounts = (
    trans_dev
    .groupby(['account_id', 'type'], as_index=False)['amount']
    .mean()
)

credit_amount_mean = amounts[amounts['type'] == 'credit']
withdrawal_amount_mean = amounts[amounts['type'] == 'withdrawal']

# Left joins keep every account already in df; accounts without transactions of
# a given type end up with NaN in the corresponding average column.
df = (
    df
    .merge(credit_amount_mean[['account_id', 'amount']], on='account_id', how='left')
    .rename(columns={'amount': 'credit_amount_avg'})
    .merge(withdrawal_amount_mean[['account_id', 'amount']], on='account_id', how='left')
    .rename(columns={'amount': 'withdrawal_amount_avg'})
)

In [6]:
# Accounts with no matching transactions came out of the joins above as NaN;
# record them as 0 for both the counts and the average amounts.
transaction_feature_cols = ['credit', 'withdrawal', 'credit_amount_avg', 'withdrawal_amount_avg']
df[transaction_feature_cols] = df[transaction_feature_cols].fillna(0)
df

Unnamed: 0,account_id,credit,district_id,frequency,date,withdrawal,credit_amount_avg,withdrawal_amount_avg
0,1,50.0,18,monthly issuance,950324,63.0,1992.602000,1380.479365
1,2,22.0,1,monthly issuance,930226,32.0,10394.786364,6276.393750
2,3,0.0,5,monthly issuance,970707,0.0,0.000000,0.000000
3,4,17.0,12,monthly issuance,960221,22.0,3348.264706,1453.527273
4,5,0.0,15,monthly issuance,970530,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...
4495,11333,101.0,8,monthly issuance,940526,107.0,8962.814851,8275.402804
4496,11349,11.0,1,weekly issuance,950526,7.0,21108.227273,29640.857143
4497,11359,45.0,61,monthly issuance,941001,102.0,14509.608889,6149.507843
4498,11362,29.0,67,monthly issuance,951014,78.0,8811.758621,2965.915385


In [7]:
# Combine the per-account features with loan, disposition and client data.
#  - right join on loan_dev: the result has one row per loan, not per account
#  - left join on disp: an account can have several dispositions, so rows may repeat
#  - left join on client: adds the client attached to each disposition
# Clashing column names from the right-hand table get the given suffix.
df = (
    df
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
    .merge(client, on='client_id', suffixes=('', '_client'), how='left')
)
df

Unnamed: 0,account_id,credit,district_id,frequency,date,withdrawal,credit_amount_avg,withdrawal_amount_avg,loan_id,date_loan,amount,duration,payments,status,disp_id,client_id,type,birth_number,district_id_client
0,1787,4.0,30,weekly issuance,930322,0.0,5025.000000,0.000000,5314,930705,96396,12,8033,-1,2166,2166,OWNER,475722,30
1,1801,17.0,46,monthly issuance,930213,20.0,13523.158824,8884.240000,5316,930711,165960,36,4610,1,2181,2181,OWNER,680722,46
2,9188,15.0,45,monthly issuance,930208,9.0,5009.733333,6097.000000,6863,930728,127080,60,2118,1,11006,11314,OWNER,360602,45
3,1843,13.0,12,monthly issuance,930130,12.0,9254.600000,7168.100000,5325,930803,105804,36,2939,1,2235,2235,OWNER,405420,14
4,11013,13.0,1,weekly issuance,930214,14.0,21255.930769,16801.000000,7240,930906,274740,60,4579,1,13231,13539,OWNER,780907,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,9030,70.0,72,monthly issuance,950121,102.0,6448.582857,3830.458824,6818,961212,155616,48,3242,1,10814,11122,DISPONENT,790610,72
399,3189,28.0,29,monthly issuance,951129,31.0,13417.557143,10197.180645,5625,961215,222180,60,3703,-1,3855,3855,OWNER,390320,29
400,8972,20.0,70,monthly issuance,960521,19.0,8544.930000,6974.431579,6805,961221,45024,48,938,1,10742,11050,OWNER,575504,70
401,10963,51.0,16,monthly issuance,950520,73.0,16554.986275,10992.139726,7233,961225,115812,36,3217,1,13172,13480,OWNER,530601,16


In [8]:
# Row-wise helper from data_preparation that derives the age_loan and gender columns.
df = df.apply(dp.calculate_age_loan, axis=1)

# Attach the demographic (district) data of the client's district.
df = df.merge(district, left_on='district_id_client', right_on='code', how='left')
df

Unnamed: 0,account_id,credit,district_id,frequency,date,withdrawal,credit_amount_avg,withdrawal_amount_avg,loan_id,date_loan,...,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,unemployment_rate,commited_crimes
0,1787,4.0,30,weekly issuance,930322,0.0,5025.000000,0.000000,5314,930705,...,10,81.8,9650,3.38,3.67,100,2985.0,2804,3.525,2894.5
1,1801,17.0,46,monthly issuance,930213,20.0,13523.158824,8884.240000,5316,930711,...,10,73.5,8369,1.79,2.31,117,2854.0,2618,2.050,2736.0
2,9188,15.0,45,monthly issuance,930208,9.0,5009.733333,6097.000000,6863,930728,...,5,53.5,8390,2.28,2.89,132,2080.0,2122,2.585,2101.0
3,1843,13.0,12,monthly issuance,930130,12.0,9254.600000,7168.100000,5325,930803,...,9,74.8,10045,1.42,1.71,135,6604.0,6295,1.565,6449.5
4,11013,13.0,1,weekly issuance,930214,14.0,21255.930769,16801.000000,7240,930906,...,5,50.5,8288,3.79,4.52,110,1562.0,1460,4.155,1511.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,9030,70.0,72,monthly issuance,950121,102.0,6448.582857,3830.458824,6818,961212,...,4,62.6,8994,3.80,4.79,110,9672.0,9208,4.295,9440.0
399,3189,28.0,29,monthly issuance,951129,31.0,13417.557143,10197.180645,5625,961215,...,6,55.6,8843,2.82,3.60,113,818.0,888,3.210,853.0
400,8972,20.0,70,monthly issuance,960521,19.0,8544.930000,6974.431579,6805,961221,...,7,89.9,10177,6.63,7.75,81,9878.0,10108,7.190,9993.0
401,10963,51.0,16,monthly issuance,950520,73.0,16554.986275,10992.139726,7233,961225,...,8,56.9,8427,1.12,1.54,107,1874.0,1913,1.330,1893.5


In [9]:
# Feature-engineering and clean-up steps, applied left to right by `pipe`.
df_steps = (
    dp.calculate_number_of_disponents,       # adds disp_id_count
    dp.calculate_diff_salary_loan,           # adds diff_salary_loan
    dp.drop_duplicated_accounts,             # the joins above can repeat an account
    dp.drop_irrelevant_columns_from_df,
    dp.drop_demographic_columns_from_df,
)
df = pipe(df, *df_steps)
df

Unnamed: 0,credit,frequency,withdrawal,credit_amount_avg,withdrawal_amount_avg,loan_id,amount,duration,payments,status,type,gender,age_loan,unemployment_rate,commited_crimes,disp_id_count,diff_salary_loan
0,4.0,weekly issuance,0.0,5025.000000,0.000000,5314,96396,12,8033,-1,OWNER,female,45,3.525,2894.5,1,1617
1,17.0,monthly issuance,20.0,13523.158824,8884.240000,5316,165960,36,4610,1,OWNER,male,24,2.050,2736.0,1,3759
2,15.0,monthly issuance,9.0,5009.733333,6097.000000,6863,127080,60,2118,1,OWNER,male,57,2.585,2101.0,1,6272
3,13.0,monthly issuance,12.0,9254.600000,7168.100000,5325,105804,36,2939,1,OWNER,female,53,1.565,6449.5,1,7106
4,13.0,weekly issuance,14.0,21255.930769,16801.000000,7240,274740,60,4579,1,OWNER,male,14,4.155,1511.0,1,3709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,70.0,monthly issuance,102.0,6448.582857,3830.458824,6818,155616,48,3242,1,OWNER,female,25,4.295,9440.0,2,5752
324,28.0,monthly issuance,31.0,13417.557143,10197.180645,5625,222180,60,3703,-1,OWNER,male,57,3.210,853.0,1,5140
325,20.0,monthly issuance,19.0,8544.930000,6974.431579,6805,45024,48,938,1,OWNER,female,39,7.190,9993.0,1,9239
326,51.0,monthly issuance,73.0,16554.986275,10992.139726,7233,115812,36,3217,1,OWNER,male,43,1.330,1893.5,1,5210


In [10]:
# Encode the remaining non-numerical columns as numerical values, one at a time.
for column in ('frequency', 'type', 'gender'):
    df = dp.convert_n_numerical_to_numerical(df, column)

In [12]:
# Persist the prepared frame for the modelling notebooks; index=False keeps the
# row index out of the CSV file.
df.to_csv('../test_ml/df.csv', index=False)

**ASSEMBLE MAIN_DF**

In [None]:
# Build main_df in one chain:
#  1. account + loan (right join keeps every loan row)
#  2. + disposition and client (left joins)
#  3. drop the account's own district_id; district_id_client is kept for the district join
#  4. derive the age_loan and gender columns row by row
#  5. + demographic (district) data for the client's district
#  6. + credit card data for the disposition
main_df = (
    account
    .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
    .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
    .merge(client, on='client_id', suffixes=('', '_client'), how='left')
    .drop(columns='district_id')
    .apply(dp.calculate_age_loan, axis=1)
    .merge(district, left_on='district_id_client', right_on='code', how='left')
    .merge(card_dev, on='disp_id', suffixes=('', '_card'), how='left')
)

In [None]:
# Feature-engineering and clean-up steps for main_df, applied left to right by `pipe`.
main_df_steps = (
    dp.calculate_number_of_disponents,
    dp.calculate_diff_salary_loan,
    dp.drop_duplicated_accounts,
    dp.drop_irrelevant_columns_from_main_df,
    dp.rename_main_df_columns,
)
main_df = pipe(main_df, *main_df_steps)

In [None]:
# Encode the non-numerical columns of main_df as numerical values.
for column in ('frequency', 'type'):
    main_df = dp.convert_n_numerical_to_numerical(main_df, column)

**ASSEMBLE TRANSACTIONS_DF**

In [None]:
# Accounts that have a loan (right join keeps every loan row) ...
loan_accounts = account.merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
# ... expanded to one row per transaction of that account; clashing transaction
# columns receive the '_transaction' suffix.
transactions_df = loan_accounts.merge(trans_dev, on='account_id', suffixes=('', '_transaction'), how='left')

In [None]:
# Feature-engineering and clean-up steps for transactions_df, applied left to right.
transactions_df_steps = (
    dp.calculate_transaction_count,
    dp.calculate_credit_debit_ratio,
    dp.drop_irrelevant_columns_from_transactions_df,
    dp.rename_transactions_df_columns,
)
transactions_df = pipe(transactions_df, *transactions_df_steps)

# Encode the non-numerical columns as numerical values, one at a time.
for column in ('frequency', 'operation', 'type', 'bank'):
    transactions_df = dp.convert_n_numerical_to_numerical(transactions_df, column)

**ASSEMBLE TESTING DFS**

In [None]:
# Variant of main_df passed through dp.drop_outliers for the 'amount' column.
# NOTE(review): the outlier criterion is defined in data_preparation and is not
# visible in this notebook.
amount_outliers_df = dp.drop_outliers(main_df, 'amount')


In [None]:
# Variant of main_df with the demographic columns removed.
# NOTE(review): the helper is named for transactions_df but is applied to main_df
# here - confirm it targets the right column set. Also, demographic_df is not
# written out by the export cell at the end of this section.
demographic_df = dp.drop_demographic_columns_from_transactions_df(main_df)

In [None]:
# Keep only the rows whose age_loan is strictly greater than 18.
# NOTE(review): 18 is hard-coded and the comparison excludes exactly 18 - confirm
# that both are intended.
older_than_18 = main_df['age_loan'] > 18
age_loan_outliers_df = main_df.loc[older_than_18]

In [None]:
# Variant of main_df passed through dp.drop_irrelevant_columns with ['duration'],
# presumably dropping that column - the helper lives in data_preparation.
irrelevant_columns_df = dp.drop_irrelevant_columns(main_df, ['duration'])

In [None]:
# Output path -> frame, written in this order; index=False keeps the row index
# out of the CSV files.
# NOTE(review): demographic_df is built in this section but is not exported here -
# confirm whether that is intentional.
frames_by_path = {
    '../test_ml/main_df.csv': main_df,
    '../test_ml/transactions_df.csv': transactions_df,
    '../test_ml/amount_outliers_df.csv': amount_outliers_df,
    '../test_ml/age_loan_outliers_df.csv': age_loan_outliers_df,
    '../test_ml/irrelevant_columns_df.csv': irrelevant_columns_df,
}
for csv_path, frame in frames_by_path.items():
    frame.to_csv(csv_path, index=False)

- assessment of dimensions of data quality
- (cleaning): redundancy
- (cleaning): missing data
- (cleaning): outliers
- data transformation for compatibility with algorithms
- feature engineering from tabular data
- sampling for domain-specific purposes
- sampling for development
- imbalanced data
- feature selection

##### **Redundancy**

**TODO:** Explain what has been done regarding redundant data (e.g. the `dp.drop_duplicated_accounts` step in the pipelines above).

##### **Missing Data**

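**TODO:** Explain what has been done regarding missing data (e.g. the `fillna(0)` applied to the per-account credit/withdrawal counts and average amounts).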

##### **Outliers**

**TODO:** Explain what has been done regarding outliers (e.g. the `amount_outliers_df` and `age_loan_outliers_df` variants built above).

##### **Other data preparation operations**

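**TODO:** Explain which additional data preparation operations were performed (e.g. converting `frequency`, `type` and `gender` to numerical values).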