## Data Preprocessing
--- 
This notebook cleans the raw application data so it is ready for modelling.

Specifically, it encodes string columns as integers, fills any NaN entries, balances the dataset (by both oversampling and undersampling), and exports train/test splits.

In [1]:
import os

import pandas as pd
from imblearn.over_sampling import SMOTE

# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# Uncomment to also remove the row limit:
# pd.options.display.max_rows = None

In [2]:
# Load the raw training table, using the first CSV column as the row index.
df_train = pd.read_csv(
    r"raw_data/application_train.csv",
    index_col=0,
)


In [3]:
# Sanity check: (rows, columns) of the freshly loaded table
df_train.shape

(307511, 121)

In [4]:
# Shape of the data as (rows, columns); this repeats the check in the cell above
df_train.shape


(307511, 121)

---
### Encode the dataset

In [5]:
# Function to create and apply an integer encoding for string columns


def encode_data(df_train):
    """Integer-encode every object-dtype column of ``df_train``.

    One dictionary is shared by all string columns: each distinct value gets
    the next free integer (starting at 1) the first time it is seen, so the
    same string receives the same code in every column. Missing values
    (NaN/None) inside a string column are treated as one extra category with
    its own code instead of being left as NaN.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    tuple
        ``(encoded, key)``: ``encoded`` is a copy of ``df_train`` whose object
        columns hold int64 codes; ``key`` maps each original value (including
        the missing-value marker, if one was seen) to its code.
    """
    key = {}  # returned mapping: original value -> code
    lookup = {}  # same mapping without the missing marker, used for mapping
    key_columns = []
    missing_code = None

    print("Creating encoding dictionary")
    # create a key for data in columns
    for col in df_train.columns:
        # only string (object) columns need encoding
        if df_train[col].dtype == "O":
            key_columns.append(col)
            # walk the distinct values in order of first appearance
            for value in df_train[col].unique():
                if pd.api.types.is_scalar(value) and pd.isna(value):
                    # NaN is not equal to itself, so dictionary membership
                    # tests on it are unreliable; track its code explicitly.
                    if missing_code is None:
                        missing_code = len(key) + 1
                        key[value] = missing_code
                elif value not in key:
                    # add to the dictionary with a unique value
                    code = len(key) + 1
                    key[value] = code
                    lookup[value] = code

    print("Dictionary created...")
    print("Integrating keys into dataframe...")
    # Encode on a single copy, one column at a time. This avoids copying the
    # whole frame per column and does not depend on replace() inferring an
    # integer dtype for a former object column.
    encoded = df_train.copy()
    for col in key_columns:
        codes = df_train[col].map(lookup)
        if missing_code is not None:
            # values absent from `lookup` are exactly the missing ones
            codes = codes.fillna(missing_code)
        encoded[col] = codes.astype("int64")

    print("Done")

    return encoded, key

In [6]:
# Swap the string columns for integer codes; `key` keeps the value -> code mapping
df_train, key = encode_data(df_train)
# key


Creating encoding dictionary
Dictionary created...
Integrating keys into dataframe...
Done


In [7]:
# every column should be int64 or float64 by now (0 is good)
non_numeric = df_train.select_dtypes(exclude=["int64", "float64"])
col_dtype = non_numeric.shape[1]

print(f"There are {col_dtype} columns that are not int or float")

There are 0 columns that are not int or float


---
### Fill any NaN entries

In [8]:
# count the columns that contain at least one NaN
has_nan = df_train.isna().any()
nan_column_names = df_train.columns[has_nan]
print(f"There are {len(nan_column_names)} columns containing NaN values")

There are 61 columns containing NaN values


In [9]:
# Function to fill NaN values with a constant (0 by default)


def process_data(df, fill_value=0):
    """Return a copy of ``df`` with every NaN replaced by ``fill_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    fill_value : scalar, default 0
        Value written wherever ``df`` holds NaN.

    Returns
    -------
    pandas.DataFrame
        New frame without NaN entries.
    """
    return df.fillna(value=fill_value)

In [10]:
# fill the remaining gaps
df_train = process_data(df_train)

# count the columns that still hold a NaN (0 is good)
still_missing = df_train.isna().any()
nan_col = len(df_train.columns[still_missing])

print(f"There are {nan_col} columns containing NaN values")

There are 0 columns containing NaN values


---
### Balance the dataset

In [11]:
# OVERSAMPLING
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so the integer category codes created earlier are treated as
# continuous values here - consider SMOTENC if that matters. Also, resampling
# happens before the train/test split below, so synthetic training rows may be
# derived from rows that end up in the test set - confirm this is intended.
x = df_train.drop(["TARGET"], axis=1)
y = df_train.filter(["TARGET"], axis=1)

# Seed SMOTE like every other sampling step in this notebook so that a fresh
# Restart & Run All regenerates the same synthetic rows.
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

# put features and target back into one frame
oversampled_df = pd.concat([x_resampled, y_resampled], axis=1)
# shuffle dataframe
oversampled_df = oversampled_df.sample(frac=1, random_state=42)

# oversampled_df.head()

In [17]:
# UNDERSAMPLING

# split dataframe by default/non_default
default = df_train[df_train["TARGET"] == 1]
non_default = df_train[df_train["TARGET"] == 0]

# reduce the larger sample to the smaller sample size; sampling without
# replacement cannot draw more rows than exist, so only the larger class is cut
if len(non_default) >= len(default):
    non_default = non_default.sample(len(default), random_state=42)
else:
    default = default.sample(len(non_default), random_state=42)

# stack both classes into the balanced dataframe
undersampled_df = pd.concat([default, non_default], axis=0)
# shuffle dataframe
undersampled_df = undersampled_df.sample(frac=1, random_state=42)

# undersampled_df.head()

---
### Split the dataset into train/test sets

In [18]:
def split_dataset(df, test_size=0.2):
    """Cut ``df`` by position into a leading train part and a trailing test part.

    The first ``int(len(df) * (1 - test_size))`` rows become the train part
    and the remaining rows the test part. Nothing is shuffled here, so callers
    should pass data that is already in random order.

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``df``.
    """
    cut = int(len(df) * (1 - test_size))
    head, tail = df[:cut], df[cut:]
    return head, tail

In [19]:
# hold out the trailing 15% of each (already shuffled) balanced frame for testing
undersampled_train, undersampled_test = split_dataset(
    undersampled_df,
    test_size=0.15,
)
oversampled_train, oversampled_test = split_dataset(
    oversampled_df,
    test_size=0.15,
)

# oversampled_train.shape, oversampled_test.shape

---
### Export the datasets

In [20]:
# export the cleaned data to new files

# to_csv does not create missing folders, so make sure the target exists first
os.makedirs("processed_data", exist_ok=True)

exports = {
    # undersampled
    r"processed_data/undersampled_train.csv": undersampled_train,
    r"processed_data/undersampled_test.csv": undersampled_test,
    # oversampled
    r"processed_data/oversampled_train.csv": oversampled_train,
    r"processed_data/oversampled_test.csv": oversampled_test,
}

# each frame is written together with its index, as before
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)

print("Data exported")

Data exported
