In [16]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer

In [17]:
# Outlier Scaling using .quantile() Pandas methods
def scale_outlier(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    min_bound = Q1 - 1.5*IQR
    max_bound = Q3 + 1.5*IQR
    df[column] = np.where(df[column] > max_bound, max_bound, df[column])
    df[column] = np.where(df[column] < min_bound, min_bound, df[column])

# Min-Max Scaling using .min() and .max() Pandas methods
def min_max_scaling(df):    
    df_norm = df.copy()
    for column in df_norm.columns:
        df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())        
    return df_norm

In [18]:
df = pd.read_csv("Cleaned_data.csv")
categorical_cols = ['BoRace', 'CoRace', 'BoGender',\
                                      'CoGender', 'Geog', 'BoCreditScor',\
                                      'CoBoCreditScor', 'PropType', 'BoEth',\
                                      'CoEth', 'SpcHsgGoals', 'AcqTyp', 'Bank',\
                                      'FedGuar', 'First', 'Self', 'NumBor']
numerical_cols = ['BoAge', 'CoAge','MSA', 'MinPer', 'LocMedY', 'Tractrat', 'Income',\
                 'IncRat', 'UPB', 'LTV', 'Term', 'Rate', 'Front','Back', 'PMI']

In [19]:
# One hot encoding - Convert categorical columns to One hot encoding
df = pd.get_dummies(df, columns = categorical_cols)

In [20]:
# Normalise numerical columns and add it to the dataset
scale_columns = min_max_scaling(df[numerical_cols])
df = df.drop(numerical_cols, axis = 1)
df = pd.concat([df, scale_columns], axis=1)

In [21]:
# Drop one hot encoded columns that have values in less than 5% of total number of rows or more than 95% of rows
oheEncodedCols = ['BoRace_1', 'BoRace_2', 'BoRace_3', 'BoRace_4', 'BoRace_5', 'BoRace_7', \
                  'CoRace_1', 'CoRace_2', 'CoRace_3', 'CoRace_4', 'CoRace_5', 'CoRace_7', 'CoRace_8',\
                  'BoGender_1', 'BoGender_2', 'BoGender_3', 'CoGender_1', 'CoGender_2' , 'CoGender_3', 'CoGender_4',\
                  'BoCreditScor_1', 'BoCreditScor_2', 'BoCreditScor_3', 'BoCreditScor_4', 'BoCreditScor_5', 'BoCreditScor_9', \
                  'CoBoCreditScor_1', 'CoBoCreditScor_2', 'CoBoCreditScor_3', 'CoBoCreditScor_4', 'CoBoCreditScor_5', \
                  'CoBoCreditScor_9', 'BoEth_1', 'BoEth_2', 'BoEth_3', 'CoEth_1', 'CoEth_2', 'CoEth_3', 'CoEth_5',\
                  'Geog_1', 'Geog_2', 'PropType_PT01', 'PropType_PT02', 'PropType_PT04', 'PropType_PT06', 'PropType_PT07',\
                  'PropType_PT09', 'PropType_PT10', 'PropType_PT11', 'PropType_PT12', 'SpcHsgGoals_1', 'SpcHsgGoals_2',\
                  'AcqTyp_1', 'AcqTyp_4', 'Bank_Atlanta', 'Bank_Boston', 'Bank_Chicago', 'Bank_Cincinnati', 'Bank_Dallas',\
                  'Bank_Des Moines', 'Bank_Indianapolis', 'Bank_New York', 'Bank_Pittsburgh', 'Bank_San Francisco', \
                  'Bank_Topeka', 'FedGuar_0', 'FedGuar_1', 'FedGuar_2', 'FedGuar_3', 'First_1', 'First_2', 'Self_1',\
                  'Self_2', 'NumBor_1', 'NumBor_2', 'NumBor_3', 'NumBor_4',]
for val in oheEncodedCols:
  filteredLen = df[df[val] == 1].count()[0]
  datasetLen = df.count()[0]
  if( filteredLen < 0.05 * datasetLen and filteredLen > 0.95 * datasetLen):
    df = df.drop([val], axis=1)

In [22]:
df = df.drop('Assigned.ID', axis = 1)
df = df.drop('Unnamed: 0', axis = 1)

In [23]:
from sklearn.decomposition import PCA

pca = PCA()
df_pca = pca.fit_transform(df)

# Store as dataframe and print
df_pca = pd.DataFrame(df_pca)
df_pca.round(10).head()

print(pca.explained_variance_ratio_.round(3)[:30])

[0.313 0.071 0.064 0.057 0.053 0.046 0.035 0.032 0.025 0.024 0.023 0.019
 0.018 0.016 0.015 0.013 0.012 0.011 0.01  0.01  0.009 0.008 0.008 0.008
 0.007 0.007 0.006 0.006 0.006 0.005]


In [24]:
total_variance = 0
required_col = 0
for i in pca.explained_variance_ratio_:
    total_variance += i
    required_col += 1
    if total_variance > 0.9:
        break
print(required_col)

24


In [10]:
df_pca = df_pca[0:25]
df_pca.to_csv(r'Encoded_data_with_PCA.csv', index = False)