In [1]:
import os
import logging
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Seed handed to the train/test splits so the partitions are reproducible.
RANDOM_STATE = 42
# Module-level counters, initialised to zero here.
TOT_MEMORY_IN = 0
TOT_MEMORY_OUT = 0

# Location of the raw CSV inputs, followed by the three parquet output locations.
ori_dir = '../original_data/'
app_dir = '../app_data/'
add_dir = '../add_data/'
sample_dir = '../sample_data/'

# Only the output directories are created here; ori_dir is not.
directories = [app_dir, add_dir, sample_dir]
for out_dir in directories:
    # exist_ok=True already makes this a no-op when the directory exists, so a
    # separate os.path.exists() check is unnecessary. The loop variable avoids
    # the name `dir`, which would shadow the builtin for the rest of the session.
    os.makedirs(out_dir, exist_ok=True)

In [3]:
def reduce_memory_usage(df: pd.DataFrame,
                        card_threshold: float = 0.2) -> pd.DataFrame:
    """
    Shrink the memory footprint of a DataFrame by narrowing column dtypes.

    * Integer columns are passed through ``pd.to_numeric(downcast='integer')``,
      which picks the smallest signed integer dtype able to hold the values
      (this can be narrower than int32, e.g. int8).
    * Float columns are passed through ``pd.to_numeric(downcast='float')``
      (float32 at the narrowest).
    * Object columns whose ratio of distinct values to rows is strictly below
      ``card_threshold`` are converted to ``category``.

    The frame is modified in place; the same object is returned for convenience.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to optimise. Its columns are overwritten in place.
    card_threshold : float, default 0.2
        Upper bound (exclusive) on ``nunique / row_count`` for an object
        column to be turned into a categorical.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with narrowed dtypes.
    """

    int_columns = df.select_dtypes(include='int').columns.tolist()
    float_columns = df.select_dtypes(include='float').columns.tolist()
    object_columns = df.select_dtypes(include='object').columns.tolist()

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # The row count is identical for every column, so compute it once.
    num_total_values = len(df)
    # With zero rows the cardinality ratio is undefined (0 / 0 would raise
    # ZeroDivisionError), so object columns are left untouched in that case.
    if num_total_values > 0:
        for col in object_columns:
            # nunique() ignores missing values by default.
            num_unique_values = df[col].nunique()
            if num_unique_values / num_total_values < card_threshold:
                df[col] = df[col].astype('category')

    return df

In [4]:
# Read the raw application table and narrow its dtypes before splitting.
full_application = reduce_memory_usage(
    pd.read_csv(ori_dir + 'application_train.csv')
)

# 80/20 train/test partition, stratified on TARGET.
train_application, test_application = train_test_split(
    full_application,
    train_size=0.8,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=full_application['TARGET'],
)

# Stratified 25% subsample of the training partition; the remainder is discarded.
sample_train_application, _ = train_test_split(
    train_application,
    train_size=0.25,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=train_application['TARGET'],
)

# Persist the three partitions as parquet files.
train_application.to_parquet(app_dir + 'application_train.parquet')
test_application.to_parquet(app_dir + 'application_test.parquet')
sample_train_application.to_parquet(sample_dir + 'sample_application.parquet')

# The unsplit frame is no longer needed; drop the reference.
del full_application

In [5]:
# Read the bureau table and narrow its dtypes.
full_bureau = reduce_memory_usage(pd.read_csv(ori_dir + 'bureau.csv'))

# Keep only the rows whose SK_ID_CURR appears in the sampled training applications.
sample_train_bureau = full_bureau.loc[
    full_bureau['SK_ID_CURR'].isin(sample_train_application['SK_ID_CURR'])
]

# Write the full table and the sampled subset.
full_bureau.to_parquet(add_dir + 'bureau.parquet')
sample_train_bureau.to_parquet(sample_dir + 'sample_bureau.parquet')

# Drop the reference to the full table.
del full_bureau

In [6]:
# Read the bureau balance table and narrow its dtypes.
full_bureau_balance = reduce_memory_usage(
    pd.read_csv(ori_dir + 'bureau_balance.csv')
)

# This table is filtered on SK_ID_BUREAU, using the ids present in the sampled bureau rows.
sample_train_bureau_balance = full_bureau_balance.loc[
    full_bureau_balance['SK_ID_BUREAU'].isin(sample_train_bureau['SK_ID_BUREAU'])
]

# Write the full table and the sampled subset.
full_bureau_balance.to_parquet(add_dir + 'bureau_balance.parquet')
sample_train_bureau_balance.to_parquet(sample_dir + 'sample_bureau_balance.parquet')

# Drop the reference to the full table.
del full_bureau_balance

In [7]:
# Read the previous applications table and narrow its dtypes.
full_previous_application = reduce_memory_usage(
    pd.read_csv(ori_dir + 'previous_application.csv')
)

# Keep only the rows whose SK_ID_CURR appears in the sampled training applications.
sample_train_previous_application = full_previous_application.loc[
    full_previous_application['SK_ID_CURR'].isin(sample_train_application['SK_ID_CURR'])
]

# Write the full table and the sampled subset.
full_previous_application.to_parquet(add_dir + 'previous_application.parquet')
sample_train_previous_application.to_parquet(sample_dir + 'sample_previous_application.parquet')

# Drop the reference to the full table.
del full_previous_application

In [8]:
# Read the POS/cash balance table and narrow its dtypes.
full_POS_CASH_balance = reduce_memory_usage(
    pd.read_csv(ori_dir + 'POS_CASH_balance.csv')
)

# Keep only the rows whose SK_ID_CURR appears in the sampled training applications.
sample_train_POS_CASH_balance = full_POS_CASH_balance.loc[
    full_POS_CASH_balance['SK_ID_CURR'].isin(sample_train_application['SK_ID_CURR'])
]

# Write the full table and the sampled subset.
full_POS_CASH_balance.to_parquet(add_dir + 'POS_CASH_balance.parquet')
sample_train_POS_CASH_balance.to_parquet(sample_dir + 'sample_POS_CASH_balance.parquet')

# Drop the reference to the full table.
del full_POS_CASH_balance

In [9]:
# Read the installments payments table and narrow its dtypes.
full_installments_payments = reduce_memory_usage(
    pd.read_csv(ori_dir + 'installments_payments.csv')
)

# Keep only the rows whose SK_ID_CURR appears in the sampled training applications.
sample_train_installments_payments = full_installments_payments.loc[
    full_installments_payments['SK_ID_CURR'].isin(sample_train_application['SK_ID_CURR'])
]

# Write the full table and the sampled subset.
full_installments_payments.to_parquet(add_dir + 'installments_payments.parquet')
sample_train_installments_payments.to_parquet(sample_dir + 'sample_installments_payments.parquet')

# Drop the reference to the full table.
del full_installments_payments

In [10]:
# Read the credit card balance table and narrow its dtypes.
full_credit_card_balance = reduce_memory_usage(
    pd.read_csv(ori_dir + 'credit_card_balance.csv')
)

# Keep only the rows whose SK_ID_CURR appears in the sampled training applications.
sample_train_credit_card_balance = full_credit_card_balance.loc[
    full_credit_card_balance['SK_ID_CURR'].isin(sample_train_application['SK_ID_CURR'])
]

# Write the full table and the sampled subset.
full_credit_card_balance.to_parquet(add_dir + 'credit_card_balance.parquet')
sample_train_credit_card_balance.to_parquet(sample_dir + 'sample_credit_card_balance.parquet')

# Drop the reference to the full table.
del full_credit_card_balance