In [2]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer

In [3]:
# Outlier Scaling using .quantile() Pandas methods
def scale_outlier(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    min_bound = Q1 - 1.5*IQR
    max_bound = Q3 + 1.5*IQR
    df[column] = np.where(df[column] > max_bound, max_bound, df[column])
    df[column] = np.where(df[column] < min_bound, min_bound, df[column])

# Min-Max Scaling using .min() and .max() Pandas methods
def min_max_scaling(df):    
    df_norm = df.copy()
    for column in df_norm.columns:
        df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())        
    return df_norm

In [4]:
df = pd.read_csv("../Datasets/Cleaned_data.csv", index_col = [0]) # =>(55990, 34)
target_col = ['BoEth', 'CoEth']
remove_col = ['Bank', 'MSA', 'LocMedY', 'Assigned.ID']
binary_encoding = ['SpcHsgGoals', 'Self', 'First']

# ordinal_cols = ['CoBoCreditScor', 'BoCreditScor', 'NumBor']
# categorical_cols = ['BoRace', 'CoRace', 'BoGender',\
#                   'CoGender', 'Geog', 'PropType', 'BoEth',\
#                   'CoEth', 'SpcHsgGoals', 'AcqTyp',' Bank',\
#                   'FedGuar', 'First', 'Self', 'MSA']
# numerical_cols = ['Assigned.ID', 'BoAge', 'CoAge', 'MinPer', 'LocMedY', 'Tractrat', 'Income',\
#                  'IncRat', 'UPB', 'LTV', 'Term', 'Rate', 'Front','Back', 'PMI']

# Removing columns not used for analysis
df = df.drop(remove_col, axis = 1)

# Extracting and removing the target column
target = df[target_col]
df = df.drop(target_col, axis = 1)

# Binary encoding for specific columns
df[binary_encoding] = df[binary_encoding].replace([2], 0)

# print(df.shape) => (55990, 28)

ordinal_cols = ['CoBoCreditScor', 'BoCreditScor', 'NumBor']
# ordinal cols = 3

categorical_cols = ['BoRace', 'CoRace', 'BoGender',\
                  'CoGender', 'Geog', 'PropType',\
                  'AcqTyp', 'FedGuar']
# categorical cols  = 8

numerical_cols = ['BoAge', 'CoAge', 'MinPer', 'Tractrat', 'Income',\
                 'IncRat', 'UPB', 'LTV', 'Term', 'Rate', 'Front','Back', 'PMI']
#numerical cols = 13
print(df.shape)

(55990, 27)


In [284]:
# One hot encoding - Convert categorical columns to One hot encoding
df = pd.get_dummies(df, columns = categorical_cols)
print(df.shape)

(55990, 56)


In [285]:
# Normalise numerical columns and add it to the dataset
scale_columns = min_max_scaling(df[numerical_cols])
df = df.drop(numerical_cols, axis = 1)
df = pd.concat([df, scale_columns], axis=1)

In [286]:
# Drop one hot encoded columns that have values in less than 5% of total number of rows or more than 95% of rows
oheEncodedCols = ['BoRace_1', 'BoRace_2', 'BoRace_3', 'BoRace_4', 'BoRace_5', 'BoRace_7',\
                  'CoRace_1', 'CoRace_2', 'CoRace_3', 'CoRace_4', 'CoRace_5', 'CoRace_7', 'CoRace_8',\
                  'BoGender_1', 'BoGender_2', 'BoGender_3', 'CoGender_1', 'CoGender_2', 'CoGender_3', 'CoGender_4',\
                  'Geog_1', 'Geog_2', 'PropType_PT01', 'PropType_PT02', 'PropType_PT04', 'PropType_PT06',\
                  'PropType_PT07', 'PropType_PT09', 'PropType_PT10', 'PropType_PT11', 'PropType_PT12',\
                  'AcqTyp_1', 'AcqTyp_4', 'FedGuar_0', 'FedGuar_1', 'FedGuar_2', 'FedGuar_3']
for val in oheEncodedCols:
  filteredLen = df[df[val] == 1].count()[0]
  datasetLen = df.count()[0]
  if( filteredLen < 0.03 * datasetLen or filteredLen > 0.97 * datasetLen):
    df = df.drop([val], axis=1)
print(df.shape)

(55990, 43)


In [287]:
new_df = pd.concat([df, target], axis = 1)
new_df.to_csv(r'../Datasets/Encoded_data_without_PCA.csv', index = True)

In [288]:
from sklearn.decomposition import PCA
pca = PCA()
df_pca = pca.fit_transform(df)
# Store as dataframe and print
df_pca = pd.DataFrame(df_pca)
df_pca.round(10).head()

print(pca.explained_variance_ratio_.round(3))

[0.67  0.078 0.045 0.031 0.029 0.022 0.019 0.014 0.009 0.009 0.009 0.008
 0.007 0.006 0.006 0.005 0.005 0.004 0.004 0.003 0.003 0.003 0.003 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.   ]


In [289]:
total_variance = 0
pca_columns = 0
for i in pca.explained_variance_ratio_:
    total_variance += i
    pca_columns += 1
    if total_variance > 0.9:
        break
print(pca_columns)

8


In [290]:
print(df_pca.shape)
df_pca.drop(df_pca.iloc[:, pca_columns:], inplace = True, axis = 1) 
df_pca = pd.concat([df_pca, target], axis = 1)
print(df_pca.shape)
df_pca.to_csv(r'../Datasets/Encoded_data_with_PCA.csv', index = False)

(55990, 43)
(55991, 10)
