# 6. Dimensionality Reduction
This notebook follows this tutorial on PCA with scikit-learn: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
%matplotlib inline

In [2]:
# Locations of the train/test CSV files produced by the normalization step.
train_file = 'house-prices-advanced-regression-techniques/train_normalized.csv'
test_file = 'house-prices-advanced-regression-techniques/test_normalized.csv'

# Read both splits into DataFrames.
train_df, test_df = (pd.read_csv(path) for path in (train_file, test_file))

# Separate the target column from the feature columns; train_df itself is left unchanged.
data_df = train_df.drop(columns=['SalePrice'])
label_col = train_df['SalePrice']

# Preview the first rows of the full training frame (features and target).
print(train_df.head())

   TotRmsAbvGrd  OpenPorchSF  MasVnrArea  OverallQual  1stFlrSF  HalfBath  \
0      1.094086     0.248951    0.600193     0.652759 -0.776823  1.271730   
1     -0.267594    -0.724180   -0.591533    -0.076155  0.304044 -0.751629   
2     -0.267594    -0.054155    0.393465     0.652759 -0.606440  1.271730   
3      0.413246    -0.165826   -0.591533     0.652759 -0.497288 -0.751629   
4      1.774926     0.615869    1.536549     1.381673 -0.007437  1.271730   

    LotArea  GrLivArea  YearBuilt  2ndFlrSF  ...  Exterior2nd_2  \
0 -0.256880   0.498442   1.028105  1.266019  ...      -0.811709   
1  0.003496  -0.469281   0.141363 -0.788496  ...      -0.811709   
2  0.377080   0.662609   0.962420  1.294888  ...      -0.811709   
3 -0.007824   0.513563  -1.862016  1.030255  ...      -0.811709   
4  1.058586   1.552568   0.929578  1.744764  ...      -0.811709   

   Exterior2nd_3  Exterior2nd_4  MasVnrType_2  MasVnrType_3  Foundation_2  \
0      -0.788706       0.927961     -1.487184      1.2319

## Import and Apply PCA
Notice that the code below passes .95 as the number-of-components parameter. This means that scikit-learn chooses the minimum number of principal components such that 95% of the variance is retained.

In [3]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep the minimum number of components
# that retains that share of the variance (here 95%).
pca = PCA(n_components=.95)

Fit PCA on the training set. Note: PCA is fitted on the training set only; the test set is later transformed with the same fitted model.

In [4]:
# fit() returns the fitted estimator itself, so the number of retained
# components can be read straight off the call; `pca` is fitted in place.
print(pca.fit(data_df).n_components_)

40


Note: After fitting the model, you can find out how many components PCA chose using pca.n_components_. In this case, retaining 95% of the variance requires 40 principal components.

In [5]:
# Project the test set using exactly the columns PCA was fitted on, in the same
# order, so a test frame with reordered columns cannot be projected onto the
# wrong axes. Fail early with a clear message if any fitted column is absent.
feature_cols = list(data_df.columns)
missing_cols = [col for col in feature_cols if col not in test_df.columns]
if missing_cols:
    raise ValueError('test_df is missing columns used to fit PCA: {}'.format(missing_cols))

train_reduced = pca.transform(data_df)
test_reduced = pca.transform(test_df[feature_cols])

# One generated name per retained principal component.
new_features_list = ['PCA_Feature_'+str(x) for x in range(pca.n_components_)]

# Carry over the source row index so that later column-wise concatenation
# (e.g. with label_col) aligns on the same rows instead of relying on both
# frames happening to share a default RangeIndex.
train_reduced_df = pd.DataFrame(train_reduced, columns=new_features_list, index=data_df.index)
test_reduced_df = pd.DataFrame(test_reduced, columns=new_features_list, index=test_df.index)
print(train_reduced_df.shape)
print(test_reduced_df.shape)

(953, 40)
(1459, 40)


## Save the dimension reduced train and test data

In [6]:
# Output locations for the dimension-reduced datasets.
train_out = 'house-prices-advanced-regression-techniques/train_dim_reduced.csv'
test_out = 'house-prices-advanced-regression-techniques/test_dim_reduced.csv'

# Place the SalePrice labels next to the PCA features as an extra column.
train_reduced_with_label = pd.concat((train_reduced_df, label_col), axis=1)

# Write both frames without the index column.
train_reduced_with_label.to_csv(train_out, index=False)
test_reduced_df.to_csv(test_out, index=False)