In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

In [3]:
# Read the raw dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='base.csv')

In [4]:
# 'device_fraud_count' has no positive examples in this dataset, so remove it
df = df.drop('device_fraud_count', axis='columns')


In [18]:
# Store the fraud_bool target as a boolean column
df = df.astype({'fraud_bool': bool})

In [19]:
# Month-based split: months 0-5 go to train, months 6-7 go to test
in_train = df['month'].isin([0, 1, 2, 3, 4, 5])
in_test = df['month'].isin([6, 7])

train = df.loc[in_train].reset_index(drop=True)
test = df.loc[in_test].reset_index(drop=True)

In [20]:
# The month column was only needed for the split; remove it from both sets
train, test = (split.drop(columns=['month']) for split in (train, test))


In [21]:
# Engineered feature: ratio of income to proposed_credit_limit, appended as a new column
train = train.assign(income_to_credit_limit=train['income'].div(train['proposed_credit_limit']))
test = test.assign(income_to_credit_limit=test['income'].div(test['proposed_credit_limit']))


In [22]:
# Natural-log transform for the columns with skewed distributions
# NOTE(review): np.log returns -inf for zeros; confirm these columns are strictly positive
log_scale_columns = ['days_since_request', 'zip_count_4w', 'proposed_credit_limit']

# Keep the transformed columns in their own frames, one per split
log_train_cols = pd.DataFrame({name: np.log(train[name]) for name in log_scale_columns})
log_test_cols = pd.DataFrame({name: np.log(test[name]) for name in log_scale_columns})

# Set the originals aside so the robust scaler is only applied to the other numerical columns
train, test = (split.drop(columns=log_scale_columns) for split in (train, test))

In [23]:
# Columns with int64 / float64 dtype in train are the ones to scale
numerical_columns = train.select_dtypes(include=['int64', 'float64']).columns

# Fit the robust scaler on train only, then apply the fitted scaler to both splits
scaler = RobustScaler().fit(train[numerical_columns])
train[numerical_columns] = scaler.transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])


In [24]:
# Re-attach the log scaled columns to the right-hand side of each split
train = pd.concat((train, log_train_cols), axis='columns')
test = pd.concat((test, log_test_cols), axis='columns')

In [25]:
# List object-type columns in each split
object_columns_train = train.select_dtypes(include=['object']).columns
object_columns_test = test.select_dtypes(include=['object']).columns

# One-hot encode object columns
train = pd.get_dummies(train, columns=object_columns_train)
test = pd.get_dummies(test, columns=object_columns_test)

# Encoding each split separately produces different columns whenever a category
# occurs in only one of them. Align test to train's column set and order:
# categories missing from test become all-zero indicator columns, and categories
# that never occur in train are dropped.
missing_in_test = train.columns.difference(test.columns)
test = test.reindex(columns=train.columns, fill_value=0)
if len(missing_in_test) > 0:
    # Give the added indicator columns the same dtype as their train counterparts
    test[missing_in_test] = test[missing_in_test].astype(train[missing_in_test].dtypes.to_dict())

In [26]:
# Memory reduction
def reduce_mem_usage(df, use_float16=True):
    """
    Downcast the numeric columns of a dataframe to the narrowest dtype that can
    hold their observed value range, printing memory usage before and after.

    Signed integers stay signed, unsigned integers stay unsigned and floats stay
    floats; a column is never widened. Columns that are not plain NumPy
    integer/float columns (bool, object, string, categorical, datetime,
    nullable extension dtypes) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place and the same
        object is returned.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so pass
        False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same dataframe, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': float_candidates,
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy signed/unsigned int or float column
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Empty or all-NaN column: there is no observed range to decide on
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        is_float = col_type.kind == 'f'
        if not is_float:
            # Python ints compare exactly against the iinfo limits
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates_by_kind[col_type.kind]:
            # Candidates are ordered by width; stop once they are no narrower
            # than the current dtype so a column is never widened
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.finfo(candidate) if is_float else np.iinfo(candidate)
            # Inclusive bounds: the limit values themselves are representable
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is {end_mem:.2f} MB")
    # Avoid dividing by zero when the frame reports no memory usage
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Decreased by {decrease:.1f}%")

    return df

# Shrink both splits, train first and then test
train, test = (reduce_mem_usage(split) for split in (train, test))


Memory usage of dataframe is 172.10 MB
Memory usage after optimization is 58.38 MB
Decreased by 66.1%
Memory usage of dataframe is 44.38 MB
Memory usage after optimization is 15.05 MB
Decreased by 66.1%


In [None]:
# Save train and test to pickle. The output directory is created first so the
# cell also works on a fresh checkout where 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
train.to_pickle('data/train.pkl')
test.to_pickle('data/test.pkl')