## Data Preprocessing
--- 
This notebook sets out to clean up the data

Mainly, we are looking to encode any string columns, and fill any NaN entries, and balance the dataset

In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from imblearn.over_sampling import SMOTE

pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

In [22]:
data = pd.read_csv(r"raw_data/application_train.csv", index_col=0)


In [23]:
# Shape of the data
data.shape


(307511, 121)

---
### NaN entries

In [24]:
# remove any columns with more than 80% missing

data = data[data.columns[data.isnull().mean() < 0.80]]

In [35]:
# fill NaN values with linear regression

with_null = data.loc[:, data.isnull().any()]
without_null = data.loc[:, data.notnull().all()]
features_with_null = with_null.columns

for i, temp_feature in enumerate(features_with_null):
    print('For now, {} features have null data'.format(data.isnull().any().sum()))
    print('{} have {} null data'.format(temp_feature, data[temp_feature].isnull().sum()))
    
    temp_train = without_null.copy()
    temp_train[temp_feature] = with_null[temp_feature]
    
    new_train = temp_train.loc[temp_train[temp_feature].notnull(), :]
    new_test = temp_train.loc[temp_train[temp_feature].isnull(), :]
    
    temp_target = new_train[temp_feature].values
    
    new_train.drop([temp_feature], axis=1, inplace=True)
    new_test.drop([temp_feature], axis=1, inplace=True)
    
    print('-'*30,  '{} : Start Linear regression'.format(i), '-'*30)
    lr = LinearRegression()
    lr.fit(new_train, temp_target)
    
    temp_pred = lr.predict(new_test)

    new_train[temp_feature] = temp_target
    print('Prediction and concat')
    
    data[temp_feature] = new_train[temp_feature]
    
    del new_train


For now, 67 features have null data
AMT_ANNUITY have 12 null data
------------------------------ 0 : Start Linear regression ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train.drop([temp_feature], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test.drop([temp_feature], axis=1, inplace=True)


ValueError: could not convert string to float: 'Cash loans'

In [22]:
# check and return for any columns with NaN
print(
    f"There are {len(data.columns[data.isna().any()])} columns containing NaN values"
)

There are 67 columns containing NaN values


In [23]:
# function to fill NaN values with 0


def process_data(df):
    df = df.fillna(value=0)

    return df

In [24]:
data = process_data(data)

# check that there are not any NaN's (empty is good)
nan_col = len(data.columns[data.isna().any()])

print(f"There are {nan_col} columns containing NaN values")

There are 0 columns containing NaN values


---
### Encode the dataset

In [25]:
# function to created and implement an encoding dictionary


def encode_data(data):
    key = {}
    key_columns = []

    print("Creating encoding dictionary")
    # create a key for data in columns
    for col in data.columns:
        # check if the column is a string
        if data[col].dtype == "O":
            key_columns.append(col)
            # loop over the unique strings in the dataframe
            for col_name in data[col].unique():
                # check if the string is not in the dictionary
                if col_name not in key:
                    # add to the dictionary with a unique value
                    key[col_name] = len(key) + 1

    print("Dictionary created...")
    print("Integrating keys into dataframe...")
    # replace the string values in the dataframe with the key created
    for col in key_columns:
        data = data.replace({str(col): key}) 

    print("Done")

    return data, key

In [26]:
data, key = encode_data(data)
# key


Creating encoding dictionary
Dictionary created...
Integrating keys into dataframe...
Done


In [27]:
# check that all the columns are numbers (empty is good)
col_dtype = len(
    list(data.select_dtypes(exclude=["int64", "float64"]).columns)
)

print(f"There are {col_dtype} columns that are not int or float")

There are 0 columns that are not int or float


---
### Balance the dataset

In [28]:
# OVERSAMPLING
x = data.drop(["TARGET"], axis=1)
y = data.filter(["TARGET"], axis=1)

smote = SMOTE()
x_resampled, y_resampled = smote.fit_resample(x, y)

oversampled_df = pd.DataFrame()
oversampled_df = pd.concat([x_resampled, y_resampled], axis=1)
# shuffle dataframe
oversampled_df = oversampled_df.sample(frac=1, random_state=42)

# oversampled_df.head()

In [29]:
# UNDERSAMPLING

# split dataframe by defualt/non_default
default = data[data["TARGET"] == 1]
non_default = data[data["TARGET"] == 0]

# reduce the larger sample to the smaller sample size
non_default = non_default.sample(len(default), random_state=42)

# add to new dataframe
undersampled_df = pd.DataFrame()
undersampled_df = pd.concat([default, non_default], axis=0)
# shuffle dataframe
undersampled_df = undersampled_df.sample(frac=1, random_state=42)

# undersampled_df.head()

---
### Split the dataset train/test csv

In [30]:
def split_dataset(df, test_size=0.2):
    train_size = 1 - test_size
    train = df[:int(len(df)*train_size)]
    test = df[int(len(df)*train_size):]
    
    return train, test

In [31]:
undersampled_train, undersampled_test = split_dataset(undersampled_df, test_size=0.15)
oversampled_train, oversampled_test = split_dataset(oversampled_df, test_size=0.15)

# oversampled_train.shape, oversampled_test.shape

---
### Export the datasets

In [32]:
# export the cleaned data to a new file

# undersampled
undersampled_train.to_csv(r"processed_data/undersampled_train.csv")
undersampled_test.to_csv(r"processed_data/undersampled_test.csv")

# oversampled
oversampled_train.to_csv(r"processed_data/oversampled_train.csv")
oversampled_test.to_csv(r"processed_data/oversampled_test.csv")

print("Data exported")

Data exported
