# Featuretools with Dask

In [1]:
import pandas as pd
import numpy as np
import featuretools as ft
import time
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [None]:
# Folder holding the raw Home Credit CSV files (trailing slash is part of the prefix)
DATA_PATH = 'Home_credit_data/'

# Load each raw table into its own DataFrame
app_train = pd.read_csv(f'{DATA_PATH}application_train.csv')
# app_test = pd.read_csv(f'{DATA_PATH}application_test.csv')
bureau = pd.read_csv(f'{DATA_PATH}bureau.csv')
bureau_balance = pd.read_csv(f'{DATA_PATH}bureau_balance.csv')
cash = pd.read_csv(f'{DATA_PATH}POS_CASH_balance.csv')
credit = pd.read_csv(f'{DATA_PATH}credit_card_balance.csv')
previous = pd.read_csv(f'{DATA_PATH}previous_application.csv')
installments = pd.read_csv(f'{DATA_PATH}installments_payments.csv')

In [None]:
# quick cleaning: unrealistic days value (365243 days is > 1000 years).
# Only DAYS* columns are touched: a table-wide replace would also blank any
# identifier or amount that happens to equal 365243.
# NOTE(review): assumes every affected column is named DAYS* - confirm against the schema
for table in (app_train, bureau, bureau_balance, cash, credit, previous, installments):
    day_columns = [c for c in table.columns if str(c).startswith('DAYS')]
    # some tables have no day columns at all; skip them rather than assign nothing
    if day_columns:
        table[day_columns] = table[day_columns].replace(365243, np.nan)
# app_test needs the same treatment if it is loaded

### Changing types to save memory

In [None]:
def change_variable_types(df):
    """Downcast the columns of ``df`` to smaller dtypes to save memory.

    Each column is handled by the first rule that matches:

    * name contains ``'SK_ID'``: missing values become 0, then int32
    * object dtype with fewer distinct values than rows: category
    * values are exactly {0, 1}: bool
    * float64: float32
    * int64: int32

    The frame is modified in place and also returned, so both
    ``df = change_variable_types(df)`` and a bare call work.
    """
    for col in df:
        # identifiers: missing IDs become 0 so the column can be an integer
        if 'SK_ID' in col:
            df[col] = df[col].fillna(0).astype(np.int32)
        # objects to category
        elif (df[col].dtype == 'object') and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype('category')
        # 0/1 flags to bool; this must run before the numeric downcasts below,
        # otherwise every int64/float64 flag is claimed by those branches first
        elif set(df[col].unique()) == {0, 1}:
            df[col] = df[col].astype(bool)
        # float64 to float32
        elif df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float32)
        # int64 to int32 (explicit width: the builtin int maps to different
        # numpy widths depending on platform)
        elif df[col].dtype == np.int64:
            df[col] = df[col].astype(np.int32)
    return df

In [None]:
# run the memory-saving conversion over every loaded table, in loading order
(app_train, bureau, bureau_balance, cash,
 credit, previous, installments) = map(
    change_variable_types,
    (app_train, bureau, bureau_balance, cash, credit, previous, installments),
)

### Partitioning

In [None]:
# bureau_balance only carries SK_ID_BUREAU; look up the matching SK_ID_CURR
# in bureau so the balance rows can be tied to a client
bureau_balance = pd.merge(
    bureau_balance,
    bureau.loc[:, ['SK_ID_CURR', 'SK_ID_BUREAU']],
    how='left',
    on='SK_ID_BUREAU',
)

# a left merge leaves NaN (and therefore float64) in SK_ID_CURR for rows with
# no match in bureau, so the type conversion is applied once more
bureau_balance = change_variable_types(bureau_balance)

In [None]:
def partition_datasets(clients_list, num_partition):
    """Write one CSV per table holding only the rows of the given clients.

    Rows are selected by index label (``index.isin(clients_list)``), so every
    module-level table must be indexed by the same identifiers that appear in
    ``clients_list`` before this is called.

    Parameters
    ----------
    clients_list : list-like
        Index labels of the clients that belong to this partition.
    num_partition : int
        Partition number, appended to each file name
        (``partitions/<table><num_partition>.csv``).

    Returns
    -------
    None
        The function only writes files.
    """
    import os

    # to_csv does not create missing folders
    os.makedirs('partitions', exist_ok=True)

    # (file stem, source table, keep the index as a column when resetting)
    tables = (
        ('app_train', app_train, True),
        ('bureau', bureau, True),
        ('bureau_balance', bureau_balance, False),
        ('cash', cash, False),
        ('credit', credit, False),
        ('previous', previous, False),
        ('installments', installments, False),
    )

    for stem, table, keep_index in tables:
        # boolean indexing already yields a new frame, independent of the source
        subset = table[table.index.isin(clients_list)]
        subset = subset.reset_index(drop=not keep_index)
        subset.to_csv('partitions/' + stem + str(num_partition) + '.csv', index=False)

In [None]:
# partition_datasets picks rows by index label, so every table has to be
# indexed by the client identifier first; with the default RangeIndex the
# lookup would match row positions instead of clients
for table in (app_train, bureau, bureau_balance, cash, credit, previous, installments):
    # the guard keeps this cell re-runnable once the index is already set
    if 'SK_ID_CURR' in table.columns:
        table.set_index('SK_ID_CURR', inplace=True)

# aim for about 60 partitions; never let the batch size fall to 0, which
# would hand range() a zero step
batch_size = max(1, app_train.shape[0] // 60)
clients_ids_list = [list(app_train.index[i : i+batch_size]) for i in range(0, app_train.shape[0], batch_size)]

start = time.time()
for n, clients_ids in enumerate(clients_ids_list):
    partition_datasets(clients_ids, n)
end = time.time()
print((end - start) / 60, 'minutes')