In [1]:
import pandas as pd
import numpy as np

import featuretools as ft
import dask
from dask.distributed import Client
import dask.dataframe as dd



In [2]:
#!pip install --upgrade numpy pandas matplotlib seaborn woodwork featuretools scikit-learn pyarrow

In [3]:
#!pip install --upgrade dask distributed nodejs dask-labextension

In [4]:
# Read every raw CSV from the working directory into its own DataFrame.
# Application tables first, then bureau tables, then the remaining history tables.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./application_train.csv', './application_test.csv')
)
bureau, bureau_balance = (
    pd.read_csv(csv_path)
    for csv_path in ('./bureau.csv', './bureau_balance.csv')
)
cash, credit, previous, installments = (
    pd.read_csv(csv_path)
    for csv_path in (
        './POS_CASH_balance.csv',
        './credit_card_balance.csv',
        './previous_application.csv',
        './installments_payments.csv',
    )
)

In [5]:
# Add an all-NaN TARGET column to the test frame before stacking it under train
app_test['TARGET'] = np.nan

# Stack train then test into one frame with a fresh 0..n-1 index;
# sort=True is passed so pandas sorts the columns while aligning the two frames
app = pd.concat((app_train, app_test), sort=True, ignore_index=True)

# Total number of rows (train + test) in the combined frame
number_clients = len(app)

In [6]:
import sys

def return_size(df):
    """Report the size of `df` as seen by `sys.getsizeof`, in gigabytes.

    The byte count is divided by 1e9 and rounded to two decimal places.
    """
    size_in_bytes = sys.getsizeof(df)
    size_in_gigabytes = size_in_bytes / 1e9
    return round(size_in_gigabytes, 2)

# Representable range of a 32-bit signed integer, used to decide whether a downcast is safe
_INT32_BOUNDS = np.iinfo(np.int32)


def _fits_int32(values):
    """Return True when every value in the numeric Series `values` fits in int32."""
    if values.empty:
        return True
    return bool(values.min() >= _INT32_BOUNDS.min and values.max() <= _INT32_BOUNDS.max)


def convert_types(df):
    """Convert pandas data types for memory reduction.

    The frame is modified IN PLACE, column by column, and the same object is
    returned. The first matching rule wins for each column:

    1. Column label contains ``'SK_ID'``: missing values become 0 and the
       column is cast to int32 (int64 if a value does not fit in int32).
    2. ``object`` columns with fewer distinct values than rows: ``category``.
    3. Columns whose set of unique values is exactly ``{0, 1}``: ``bool``.
    4. ``float64`` columns: ``float32`` (this reduces precision).
    5. ``int64`` columns whose values all fit in int32: ``int32``.

    Any other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    for c in df:
        column = df[c]

        # Id columns: fill missing ids with 0 and store as integers.
        # str(c) keeps non-string column labels from raising a TypeError.
        if 'SK_ID' in str(c):
            ids = column.fillna(0).astype(np.int64)
            # A plain astype(int32) silently wraps out-of-range values,
            # so only narrow when the ids actually fit.
            df[c] = ids.astype(np.int32) if _fits_int32(ids) else ids

        # Objects with repeated values are cheaper as categories
        elif (column.dtype == 'object') and (column.nunique() < df.shape[0]):
            df[c] = column.astype('category')

        # 0/1 indicator columns become booleans
        elif set(column.unique()) == {0, 1}:
            df[c] = column.astype(bool)

        # Float64 to float32
        elif column.dtype == np.float64:
            df[c] = column.astype(np.float32)

        # Int64 to int32. Comparing against np.int64 (not the builtin `int`)
        # keeps this branch platform independent; the range check avoids
        # silent integer wrap-around.
        elif column.dtype == np.int64 and _fits_int32(column):
            df[c] = column.astype(np.int32)

    return df

In [7]:
# Downcast every table. convert_types modifies the frame it receives and
# returns that same frame, so each name is simply rebound to its own table.
(app, bureau, bureau_balance, cash,
 credit, previous, installments) = map(
    convert_types,
    (app, bureau, bureau_balance, cash, credit, previous, installments),
)

In [8]:
# Attach the client id (SK_ID_CURR) to each bureau_balance row by looking it
# up through SK_ID_BUREAU; a left join keeps every bureau_balance row.
bureau_balance = pd.merge(
    bureau_balance,
    bureau[['SK_ID_CURR', 'SK_ID_BUREAU']],
    how='left',
    on='SK_ID_BUREAU',
)

In [9]:
# Index every table by client id, in place, so rows can later be selected
# with `.index.isin(...)`.
for dataset in (app, bureau, bureau_balance, cash, credit, previous, installments):
    dataset.set_index('SK_ID_CURR', inplace=True)

In [10]:
def replace_day_outliers(df):
    """Swap the value 365243 for NaN in every column whose name contains "DAYS".

    The frame is modified in place and the same object is returned.
    """
    day_columns = [name for name in df.columns if "DAYS" in name]

    for name in day_columns:
        cleaned = df[name].replace({365243: np.nan})
        df[name] = cleaned

    return df

# Replace the 365243 sentinel with NaN in every DAYS column.
#
# `app` is the combined frame built by pd.concat, so it holds its own data and
# is NOT affected by cleaning app_train / app_test. It is also the frame that
# create_partition saves, so it must be cleaned explicitly; without this line
# the sentinel value is written into every partition's app.csv.
app = replace_day_outliers(app)

# The raw train/test frames are cleaned as well to keep them consistent with `app`.
app_train = replace_day_outliers(app_train)
app_test = replace_day_outliers(app_test)

bureau = replace_day_outliers(bureau)
bureau_balance = replace_day_outliers(bureau_balance)
credit = replace_day_outliers(credit)
cash = replace_day_outliers(cash)
previous = replace_day_outliers(previous)
installments = replace_day_outliers(installments)

In [11]:
def create_partition(user_list, partition):
    """Creates and saves a dataset with only the users in `user_list`.

    Reads the module-level frames `app`, `bureau`, `bureau_balance`, `cash`,
    `credit`, `previous` and `installments` (each indexed by SK_ID_CURR),
    keeps the rows whose index is in `user_list`, and writes one CSV per table
    into ``./input/partitions/p<partition + 1>``.

    Parameters
    ----------
    user_list : list
        SK_ID_CURR values that belong to this partition.
    partition : int
        Zero-based partition number; the directory on disk is numbered from 1.

    Returns
    -------
    None
        The partition is skipped when all seven CSV files already exist.
        A directory that exists but is missing files (for example after an
        interrupted run) is written again instead of being skipped.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported `os` before it is called.
    import os

    directory = './input/partitions/p%d' % (partition + 1)

    # (frame, output file name, keep SK_ID_CURR as a column?)
    # SK_ID_CURR is written for app, bureau and previous; for bureau_balance,
    # cash, credit and installments the index is dropped before saving.
    tables = [
        (app, 'app.csv', True),
        (bureau, 'bureau.csv', True),
        (bureau_balance, 'bureau_balance.csv', False),
        (cash, 'cash.csv', False),
        (credit, 'credit.csv', False),
        (previous, 'previous.csv', True),
        (installments, 'installments.csv', False),
    ]

    # Only skip when the partition is complete, not merely when the directory exists
    if all(os.path.exists('%s/%s' % (directory, file_name)) for _, file_name, _ in tables):
        return

    os.makedirs(directory, exist_ok=True)

    for frame, file_name, keep_id in tables:
        # Boolean-mask selection already yields a new frame; reset_index either
        # restores SK_ID_CURR as a column or discards it.
        subset = frame[frame.index.isin(user_list)].reset_index(drop=not keep_id)
        subset.to_csv('%s/%s' % (directory, file_name), index=False)

    # Report progress on every tenth partition only, to keep the output short
    if partition % 10 == 0:
        print('Saved all files in partition {} to {}.'.format(partition + 1, directory))

In [12]:
# Break the clients into chunks of (rows // 103) ids each: 103 full chunks plus
# one final, smaller chunk whenever the row count is not divisible by 103.
# max(1, ...) keeps the step positive for frames with fewer than 103 rows,
# where the integer division would give 0 and range() would raise ValueError.
chunk_size = max(1, app.shape[0] // 103)

# Construct an id list: one list of index values (SK_ID_CURR) per chunk
id_list = [list(app.index[i:i + chunk_size]) for i in range(0, app.shape[0], chunk_size)]

In [13]:
import os 

# Write one partition per id chunk; create_partition receives the zero-based
# position of the chunk within id_list.
for partition_index, client_ids in enumerate(id_list):
    create_partition(client_ids, partition_index)

Saved all files in partition 1 to ./input/partitions/p1.
Saved all files in partition 11 to ./input/partitions/p11.
Saved all files in partition 21 to ./input/partitions/p21.
Saved all files in partition 31 to ./input/partitions/p31.
Saved all files in partition 41 to ./input/partitions/p41.
Saved all files in partition 51 to ./input/partitions/p51.
Saved all files in partition 61 to ./input/partitions/p61.
Saved all files in partition 71 to ./input/partitions/p71.
Saved all files in partition 81 to ./input/partitions/p81.
Saved all files in partition 91 to ./input/partitions/p91.
Saved all files in partition 101 to ./input/partitions/p101.


In [None]:
#!rm -rf ./input
