## Data Preprocessing
--- 
This notebook cleans the raw application data (`raw_data/application_train.csv`) and writes out processed train/test files.

The main steps are: encode the string (categorical) columns, fill the NaN entries, balance the dataset, drop highly correlated columns, and split the result into train and test sets.

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide pandas / scikit-learn diagnostics raised by the cells below.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw application table; the first CSV column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer=r"raw_data/application_train.csv",
    index_col=0,
)


In [3]:
# Shape of the raw data as (number of rows, number of columns)
data.shape


(307511, 121)

---
### Encode the dataset

In [4]:
# Names of all columns stored with the generic object dtype (the string columns)
is_object_column = data.dtypes == 'object'
categorical_features = data.columns[is_object_column].tolist()

In [5]:
# One-hot encode every categorical column.
# The dummy dtype is pinned to uint8: pandas >= 2.0 produces bool dummies by
# default, which the numeric dtype check later in this notebook
# (int64 / int32 / float64 / uint8) does not account for.
data = pd.get_dummies(data, columns=categorical_features, dtype=np.uint8)

---
### NaN entries

In [6]:
# keep only the columns whose share of missing values is below 80%

missing_share = data.isnull().mean()
data = data.loc[:, missing_share < 0.80]

In [7]:
# fill NaN values with linear regression
#
# Each column that contains NaN is imputed in turn: a LinearRegression is fitted
# on the rows where the column is present, using only the columns that were
# complete before imputation started, and its predictions fill the missing rows.
# TARGET is left out of the predictors so the label does not leak into the
# imputed feature values.

null_columns_mask = data.isnull().any()
features_with_null = data.columns[null_columns_mask]
# Predictors are fixed up front: the originally complete columns, minus the label.
predictors = data.loc[:, ~null_columns_mask].drop(columns=["TARGET"], errors="ignore")

for i, temp_feature in enumerate(features_with_null):
    is_missing = data[temp_feature].isnull()

    # Every earlier column has already been filled, so the remaining count is
    # known without rescanning the whole frame on each iteration.
    print('For now, {} features have null data'.format(len(features_with_null) - i))
    print('{} have {} null data'.format(temp_feature, is_missing.sum()))

    print('-'*30,  '{} : Start Linear regression'.format(i), '-'*30)
    lr = LinearRegression()
    lr.fit(predictors.loc[~is_missing], data.loc[~is_missing, temp_feature])

    print('Prediction and concat')
    # Write the predictions straight into the missing rows; observed values are
    # left untouched, so no concat / re-sort of the frame is needed.
    data.loc[is_missing, temp_feature] = lr.predict(predictors.loc[is_missing])

For now, 61 features have null data
AMT_ANNUITY have 12 null data
------------------------------ 0 : Start Linear regression ------------------------------
Prediction and concat
For now, 60 features have null data
AMT_GOODS_PRICE have 278 null data
------------------------------ 1 : Start Linear regression ------------------------------
Prediction and concat
For now, 59 features have null data
OWN_CAR_AGE have 202929 null data
------------------------------ 2 : Start Linear regression ------------------------------
Prediction and concat
For now, 58 features have null data
CNT_FAM_MEMBERS have 2 null data
------------------------------ 3 : Start Linear regression ------------------------------
Prediction and concat
For now, 57 features have null data
EXT_SOURCE_1 have 173378 null data
------------------------------ 4 : Start Linear regression ------------------------------
Prediction and concat
For now, 56 features have null data
EXT_SOURCE_2 have 660 null data
-------------------------

In [8]:
# count the columns that still contain at least one NaN and report the total
nan_column_count = int(data.isna().any().sum())
print(f"There are {nan_column_count} columns containing NaN values")

There are 0 columns containing NaN values


---
### Check dtypes

In [9]:
# check that all float columns are supposed to be floats

float_columns = data.columns[data.dtypes == "float64"]

# These two columns are converted to integers. Round before casting:
# astype(int) truncates toward zero, and CNT_FAM_MEMBERS contains
# regression-imputed (fractional) values for the rows that were originally NaN.
integer_like_columns = ["DAYS_REGISTRATION", "CNT_FAM_MEMBERS"]
data[integer_like_columns] = data[integer_like_columns].round().astype(int)


In [53]:
# Let's also check that all the values in the dataframe are numerical:
# select any column whose dtype is not one of the numeric dtypes listed below.

data.select_dtypes(exclude=["int64", "int32", "float64", "uint8"])

# only the index is displayed (no columns are selected), so all is good

100002
100003
100004
100006
100007
...
456251
456252
456253
456254
456255


---
### Balance the dataset

In [10]:
# OVERSAMPLING
# Separate the features from the label (y stays a one-column DataFrame).
x = data.drop(["TARGET"], axis=1)
y = data.filter(["TARGET"], axis=1)

# Seed SMOTE so the synthetic samples are the same on every run, in line with
# the seeded shuffle below.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows in the train part can be interpolated from rows that end up in
# the test part -- consider splitting first and oversampling only the train set.
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

# Re-attach the label to the resampled features
oversampled_df = pd.concat([x_resampled, y_resampled], axis=1)
# shuffle dataframe
oversampled_df = oversampled_df.sample(frac=1, random_state=42)

---
### Correlations

In [12]:
def drop_correlated_columns(data, threshold, column_dropped=False):
    """
    Drop columns that are highly correlated with a column that precedes them.

    A column is removed when its absolute pairwise correlation with any column
    appearing before it in ``data`` is greater than ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float
        Absolute correlation above which the later column of a pair is dropped.
    column_dropped : bool, default False
        When True, print the list of dropped column names.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, without the dropped columns.
    """
    # Absolute correlation between every pair of columns
    corr_matrix = data.corr().abs()

    # Keep only the strict upper triangle so each pair is looked at once and a
    # column is never compared with itself. The builtin ``bool`` is used because
    # the ``np.bool`` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Flag a column when it exceeds the threshold against any earlier column
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    # Drop all flagged columns in a single call instead of one at a time
    data.drop(columns=to_drop, inplace=True)

    if column_dropped:
        print(f"Columns dropped:\n{to_drop}")
    return data


In [13]:
# Shape of the oversampled frame before the highly correlated columns are dropped
oversampled_df.shape

(565372, 245)

In [14]:
# prune columns whose correlation exceeds 0.8 and print which ones were removed

oversampled_df = drop_correlated_columns(
    oversampled_df,
    threshold=0.8,
    column_dropped=True,
)

Columns dropped:
['AMT_GOODS_PRICE', 'FLAG_EMP_PHONE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'NAME_CONTRACT_TYPE_Revolving loans', 'NAME_INCOME_TYPE_Pensioner', 'ORGANIZATION_TYPE_XNA', 'EMERGENCYSTATE_MODE_No']


In [15]:
# Shape of the oversampled frame after the highly correlated columns were dropped
oversampled_df.shape

(565372, 203)

---
### Split the dataset train/test csv

In [16]:
def split_dataset(df, test_size=0.2):
    """
    Split ``df`` positionally into a leading train part and a trailing test part.

    The first ``int(len(df) * (1 - test_size))`` rows form the train part and the
    remaining rows form the test part. No shuffling is done here.
    """
    cut = int(len(df) * (1 - test_size))
    return df[:cut], df[cut:]

In [54]:
# Hold out the last 15% of the shuffled frame as the test set
oversampled_train, oversampled_test = split_dataset(df=oversampled_df, test_size=0.15)

# Report the shape of each part
for label, part in (("Train", oversampled_train), ("Test", oversampled_test)):
    print(f"{label} dataset: {part.shape}")

Train dataset: (480566, 203)
Test dataset: (84806, 203)


---
### Export the datasets

In [18]:
# export the cleaned data to a new file

# Create the output folder first so the export also works when it does not exist yet
output_dir = Path("processed_data")
output_dir.mkdir(parents=True, exist_ok=True)

oversampled_train.to_csv(output_dir / "oversampled_train.csv")
oversampled_test.to_csv(output_dir / "oversampled_test.csv")

print("Data exported")

Data exported
