## Data Preprocessing
--- 
This notebook cleans up the raw data.

Mainly, we want to encode the string columns, fill any NaN entries, and balance the dataset.

In [1]:
from pathlib import Path

import pandas as pd

# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# Uncomment to remove the row limit as well:
# pd.options.display.max_rows = None

In [2]:
# Read the raw train and test application tables; the first CSV column becomes the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        r"raw_data/application_train.csv",
        r"raw_data/application_test.csv",
    )
)


In [3]:
# `df` is never defined in this notebook, so `df.shape` fails on a fresh kernel.
# NOTE(review): judging by the recorded output, `df` was presumably the train and
# test tables stacked together; report that combined shape (total rows, union of
# columns) without building the stacked frame.
len(df_train) + len(df_test), len(df_train.columns.union(df_test.columns))

(356255, 121)

In [76]:
# (rows, columns) of the train table followed by the test table
tuple(frame.shape for frame in (df_train, df_test))


((307511, 121), (48744, 120))

In [77]:
# function to create and apply an encoding dictionary


def encode_data(df_train, df_test):
    """Integer-encode the string columns of both frames with one shared key.

    Every column of ``df_train`` whose dtype is ``object`` is treated as a
    string column. Each distinct value found in those columns is given the next
    unused integer, starting at 1, in a single dictionary shared by all columns,
    so a value that occurs in several columns gets the same code everywhere.
    ``unique()`` also yields NaN, so NaN is added to the key like any other
    value and missing entries in string columns receive a code too.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame the key is built from; its object columns are encoded.
    df_test : pandas.DataFrame
        Frame encoded with the key built from ``df_train``. Values that never
        occur in ``df_train`` are not in the key and are left unchanged.

    Returns
    -------
    tuple
        ``(df_train, df_test, key)``: the two encoded frames and the
        value -> integer dictionary. The input frames are not modified in place.
    """
    key = {}
    key_columns = []

    print("Creating encoding dictionary")
    for col in df_train.columns:
        # only object-dtype (string) columns are encoded
        if df_train[col].dtype == "O":
            key_columns.append(col)
            for value in df_train[col].unique():
                # the first sighting of a value claims the next free code (1, 2, 3, ...)
                key.setdefault(value, len(key) + 1)

    print("Dictionary created...")
    print("Integrating keys into dataframe...")
    if key_columns:
        # One nested-dict replace per frame instead of one whole-frame replace
        # per column. The real column label is used as the outer key so that
        # non-string labels are matched as well.
        column_key = {col: key for col in key_columns}
        # infer_objects() turns fully encoded object columns into numeric ones
        # on pandas versions where replace() no longer downcasts them itself.
        df_train = df_train.replace(column_key).infer_objects()
        df_test = df_test.replace(column_key).infer_objects()

    print("Done")

    return df_train, df_test, key

In [78]:
# Encode both frames with the key built from the training data.
# NOTE(review): the raw frames are overwritten here, so running this cell a
# second time presumably finds no string columns and returns an empty key;
# re-run the load cell first if the key is needed again.
df_train, df_test, key = encode_data(df_train, df_test)
# key


Creating encoding dictionary
Dictionary created...
Integrating keys into dataframe...
Done


In [79]:
# count the columns, across both frames, whose dtype is neither int64 nor float64
# (0 means every column is numeric)
col_dtype = sum(
    len(frame.select_dtypes(exclude=["int64", "float64"]).columns)
    for frame in (df_train, df_test)
)

print(f"There are {col_dtype} columns that are not int or float")

There are 0 columns that are not int or float


In [80]:
# report how many training columns hold at least one NaN
print(f"There are {df_train.isna().any().sum()} columns containing NaN values")

There are 61 columns containing NaN values


In [81]:
# function to fill NaN values with 0


def process_data(df):
    """Return a copy of ``df`` in which every NaN entry is replaced by 0.

    The input frame itself is left untouched.
    """
    return df.fillna(0)

In [82]:
# fill the NaN entries of both frames with 0
df_train, df_test = (process_data(frame) for frame in (df_train, df_test))

# count the columns that still contain a NaN in either frame (0 is good)
nan_col = sum(int(frame.isna().any().sum()) for frame in (df_train, df_test))

print(f"There are {nan_col} columns containing NaN values")

There are 0 columns containing NaN values


In [83]:
# Balance the classes by undersampling: keep the TARGET == 1 rows and draw an
# equally sized random subset of the TARGET == 0 rows.

# split dataframe by default/non_default
default = df_train[df_train["TARGET"] == 1]
non_default = df_train[df_train["TARGET"] == 0]

# reduce the larger sample to the smaller sample size
non_default = non_default.sample(len(default), random_state=42)

# combine both classes into the balanced dataframe
undersampled_data = pd.concat([default, non_default], axis=0)

# shuffle dataframe; the seed keeps the row order identical across runs
undersampled_data = undersampled_data.sample(frac=1, random_state=42)

# both classes must now have the same number of rows
assert (undersampled_data["TARGET"] == 1).sum() == (
    undersampled_data["TARGET"] == 0
).sum()

In [84]:
# export the cleaned data to a new file

# to_csv does not create missing folders, so make sure the target folder exists
Path("processed_data").mkdir(parents=True, exist_ok=True)

undersampled_data.to_csv(r"processed_data/df_train.csv")
df_test.to_csv(r"processed_data/df_test.csv")

print("Data exported")

Data exported
