In [None]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

In [None]:
# Normalization & Scaling Functions using Numpy & Pandas

# Outlier Scaling using .quantile() Pandas methods
def scale_outlier(df, column):
    """Cap the values of ``df[column]`` at the 1.5 * IQR fences, in place.

    The fences are derived from the 25th and 75th percentiles of the column:
    anything above ``Q3 + 1.5 * IQR`` is replaced by that upper fence and
    anything below ``Q1 - 1.5 * IQR`` is replaced by the lower fence.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Label of the column to cap.

    Returns:
        None. The capped values are written back into ``df[column]``.
    """
    lower_quartile = df[column].quantile(0.25)
    upper_quartile = df[column].quantile(0.75)
    spread = upper_quartile - lower_quartile
    floor = lower_quartile - 1.5 * spread
    ceiling = upper_quartile + 1.5 * spread

    # Cap the high side first, then the low side, and write back once.
    capped_high = np.where(df[column] > ceiling, ceiling, df[column])
    df[column] = np.where(capped_high < floor, floor, capped_high)

# Min-Max Scaling using .min() and .max() Pandas methods
def min_max_scaling(df):
    """Return a copy of ``df`` with every column rescaled to the range [0, 1].

    Each column is transformed as ``(value - min) / (max - min)``. A column
    whose values are all identical has ``max - min == 0``; dividing would
    turn the whole column into NaN, so such a column is mapped to 0.0
    instead. Missing values stay missing.

    Args:
        df: DataFrame of numeric columns. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    df_norm = df.copy()
    for column in df_norm.columns:
        col_min = df_norm[column].min()
        col_range = df_norm[column].max() - col_min
        shifted = df_norm[column] - col_min
        if col_range == 0:
            # Constant column: avoid 0/0 and keep a well-defined value.
            df_norm[column] = shifted.astype(float)
        else:
            df_norm[column] = shifted / col_range
    return df_norm

In [None]:
# Load the dataset into a pandas DataFrame.
# The path is relative, so the CSV must be in the notebook's working directory.
df = pd.read_csv("Cleaned_data.csv")

In [None]:
df.shape

(55990, 34)

In [None]:
# One-hot encoding
# Expand BoRace, CoRace, BoGender, CoGender, Geog, BoCreditScor, CoBoCreditScor,
# PropType, BoEth, CoEth, SpcHsgGoals, AcqTyp, Bank, FedGuar, First, Self and
# NumBor into one indicator column per category value.
newdf = pd.get_dummies(
    df,
    columns=[
        'BoRace', 'CoRace', 'BoGender', 'CoGender', 'Geog',
        'BoCreditScor', 'CoBoCreditScor', 'PropType', 'BoEth', 'CoEth',
        'SpcHsgGoals', 'AcqTyp', 'Bank', 'FedGuar', 'First', 'Self', 'NumBor',
    ],
)

In [None]:
newdf

In [None]:
# Normalise: pick out the numeric columns that will be min-max scaled
scalecolumns = newdf[[
    'BoAge', 'CoAge', 'MSA', 'MinPer', 'LocMedY',
    'Tractrat', 'Income', 'IncRat', 'UPB', 'LTV',
    'Term', 'Rate', 'Front', 'Back', 'PMI',
]]

In [None]:
# Rescale each selected column with min_max_scaling (defined earlier in this
# notebook), then display the scaled frame.
scalecolumns = min_max_scaling(scalecolumns)
scalecolumns

In [None]:
scalecolumns.shape

(55990, 15)

In [None]:
# Remove the unscaled numeric columns from newdf; their scaled versions live
# in scalecolumns.
newdf = newdf.drop(
    columns=[
        'BoAge', 'CoAge', 'MSA', 'MinPer', 'LocMedY',
        'Tractrat', 'Income', 'IncRat', 'UPB', 'LTV',
        'Term', 'Rate', 'Front', 'Back', 'PMI',
    ]
)

In [None]:
newdf.shape

(55990, 79)

In [None]:
# Put the scaled numeric columns side by side with the remaining columns
mergecols = [newdf, scalecolumns]
newdf = pd.concat(objs=mergecols, axis="columns")

In [None]:
newdf

In [None]:
# Keep the original (un-encoded) categorical columns next to their one-hot
# indicator columns.
origcolumns = df[[
    'BoRace', 'CoRace', 'BoGender', 'CoGender', 'Geog',
    'BoCreditScor', 'CoBoCreditScor', 'PropType', 'BoEth', 'CoEth',
    'SpcHsgGoals', 'AcqTyp', 'Bank', 'FedGuar', 'First', 'Self', 'NumBor',
]]

mergecols = [newdf, origcolumns]
newdf = pd.concat(objs=mergecols, axis="columns")

In [None]:
origcolumns.shape

(55990, 17)

In [None]:
newdf

In [None]:
# Drop one-hot encoded columns that carry almost no information: those set to 1
# in fewer than 5% of the rows or in more than 95% of the rows.
# The two tests were previously joined with `and`, which can never be true, so
# no column was ever dropped; they are now joined with `or`.
oheEncodedCols = [
    'BoRace_1', 'BoRace_2', 'BoRace_3', 'BoRace_4', 'BoRace_5', 'BoRace_7',
    'CoRace_1', 'CoRace_2', 'CoRace_3', 'CoRace_4', 'CoRace_5', 'CoRace_7', 'CoRace_8',
    'BoGender_1', 'BoGender_2', 'BoGender_3',
    'CoGender_1', 'CoGender_2', 'CoGender_3', 'CoGender_4',
    'BoCreditScor_1', 'BoCreditScor_2', 'BoCreditScor_3', 'BoCreditScor_4', 'BoCreditScor_5', 'BoCreditScor_9',
    'CoBoCreditScor_1', 'CoBoCreditScor_2', 'CoBoCreditScor_3', 'CoBoCreditScor_4', 'CoBoCreditScor_5', 'CoBoCreditScor_9',
    'BoEth_1', 'BoEth_2', 'BoEth_3',
    'CoEth_1', 'CoEth_2', 'CoEth_3', 'CoEth_5',
    'Geog_1', 'Geog_2',
    'PropType_PT01', 'PropType_PT02', 'PropType_PT04', 'PropType_PT06', 'PropType_PT07', 'PropType_PT09', 'PropType_PT10', 'PropType_PT11', 'PropType_PT12',
    'SpcHsgGoals_1', 'SpcHsgGoals_2',
    'AcqTyp_1', 'AcqTyp_4',
    'Bank_Atlanta', 'Bank_Boston', 'Bank_Chicago', 'Bank_Cincinnati', 'Bank_Dallas', 'Bank_Des Moines', 'Bank_Indianapolis', 'Bank_New York', 'Bank_Pittsburgh', 'Bank_San Francisco', 'Bank_Topeka',
    'FedGuar_0', 'FedGuar_1', 'FedGuar_2', 'FedGuar_3',
    'First_1', 'First_2',
    'Self_1', 'Self_2',
    'NumBor_1', 'NumBor_2', 'NumBor_3', 'NumBor_4',
]

# Fraction of rows in which each indicator column equals 1.
oheShare = (newdf[oheEncodedCols] == 1).mean()
lowVarianceCols = oheShare[(oheShare < 0.05) | (oheShare > 0.95)].index.tolist()

# drop() returns a new frame, so newdf itself is left unchanged.
morenewdf = newdf.drop(columns=lowVarianceCols)

In [None]:
morenewdf

In [None]:
# Write the filtered frame to CSV; index=False keeps the row index out of the file.
morenewdf.to_csv('Even_More_cleaned_dataset.csv',index=False)

# PCA

In [None]:
from sklearn.decomposition import PCA


In [None]:
features = [
    'BoRace', 'CoRace', 'BoGender', 'CoGender', 'Geog',
    'BoCreditScor', 'CoBoCreditScor', 'BoEth', 'CoEth', 'SpcHsgGoals',
    'AcqTyp', 'FedGuar', 'First', 'Self', 'NumBor',
    'BoAge', 'CoAge', 'MSA', 'MinPer', 'LocMedY',
    'Tractrat', 'Income', 'IncRat', 'UPB', 'LTV', 'Term',
]

# Feature matrix: one row per record, one column per entry in `features`
x = df[features].values
# Target: the 'Rate' column, kept two-dimensional (a single column)
y = df[['Rate']].values

In [None]:
x[0]

array([7.0000e+00, 7.0000e+00, 1.0000e+00, 2.0000e+00, 2.0000e+00,
       5.0000e+00, 5.0000e+00, 1.0000e+00, 1.0000e+00, 2.0000e+00,
       1.0000e+00, 2.0000e+00, 2.0000e+00, 2.0000e+00, 2.0000e+00,
       7.0000e+01, 6.9000e+01, 2.5020e+04, 9.9150e+01, 1.7448e+04,
       9.0920e-01, 6.0588e+04, 3.2928e+00, 7.1967e+04, 8.9000e-01,
       1.8000e+02])

In [None]:
# Project the feature matrix onto its first two principal components.
# `pca` was never created before being used, so this cell raised NameError on
# a fresh kernel. Two components are requested to match the two output columns.
# NOTE(review): `x` is taken from the raw `df`, so the features are on very
# different scales and the large-magnitude ones will dominate the components.
# Consider standardising `x` first - confirm whether that is intended.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Attach the target column to the two principal components (aligned on the row index)
finalDf = pd.concat(
    objs=[principalDf, df[['Rate']]],
    axis="columns",
)

In [None]:
finalDf

Unnamed: 0,principal component 1,principal component 2,Rate
0,-166206.464784,18096.084893,0.0288
1,299489.020924,-16656.350804,0.0325
2,180126.993552,-69866.676329,0.0325
3,42329.211893,42484.941295,0.0338
4,34607.965054,-72158.150732,0.0325
...,...,...,...
55985,387382.275033,-124975.613351,0.0400
55986,104712.618048,56935.680358,0.0400
55987,34223.234638,62377.734721,0.0388
55988,-83206.900117,109455.885218,0.0425
