# 6. Dimensionality Reduction
I am following this tutorial on PCA with scikit-learn: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
%matplotlib inline

In [2]:
# Paths to the normalized train/test CSVs.
train_file = 'house-prices-advanced-regression-techniques/train_normalized.csv'
test_file = 'house-prices-advanced-regression-techniques/test_normalized.csv'

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Split the training frame into predictors (data_df) and the
# regression target (label_col); train_df itself is left untouched.
data_df = train_df.drop(columns=['SalePrice'])
label_col = train_df['SalePrice']

# Quick look at the first rows of the full training frame (target included).
print(train_df.head())

   TotRmsAbvGrd  OpenPorchSF  MasVnrArea  OverallQual  1stFlrSF  HalfBath  \
0      1.118861     0.265956    0.598482     0.624599 -0.772228  1.305417   
1     -0.260728    -0.735765   -0.597528    -0.103002  0.307004 -0.744557   
2     -0.260728    -0.046056    0.391011     0.624599 -0.602103  1.305417   
3      0.429067    -0.161007   -0.597528     0.624599 -0.493116 -0.744557   
4      1.808656     0.643654    1.538204     1.352200 -0.004006  1.305417   

    LotArea  GrLivArea  YearBuilt  2ndFlrSF  ...  Exterior2nd_2  \
0 -0.239668   0.517958   1.014529  1.297882  ...      -0.791427   
1  0.048154  -0.465280   0.132142 -0.792681  ...      -0.791427   
2  0.461116   0.684757   0.949167  1.327258  ...      -0.791427   
3  0.035640   0.533321  -1.861401  1.057981  ...      -0.791427   
4  1.214459   1.588985   0.916486  1.785027  ...      -0.791427   

   Exterior2nd_3  Exterior2nd_4  MasVnrType_2  MasVnrType_3  Foundation_2  \
0      -0.801819       0.930886     -1.521307      1.2335

## Import and Apply PCA
Notice that the code below passes .95 as the number-of-components parameter. This means that scikit-learn chooses the minimum number of principal components such that 95% of the variance is retained.

In [3]:
from sklearn.decomposition import PCA
# A fractional n_components asks PCA to keep the smallest number of
# principal components that together retain that share of the variance
# (here 95%); the actual count is only known after fitting.
pca = PCA(n_components=0.95)

Fit PCA on the training set. Note: PCA is fitted on the training set only; the test set is only transformed afterwards.

In [9]:
# Fit on the training predictors only (data_df is train_df without SalePrice),
# so neither the target nor the test set influences the learned components.
pca.fit(data_df)
# Number of components PCA kept to reach the requested variance threshold.
print(pca.n_components_)

40
<bound method BaseEstimator.get_params of PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)>


Note: You can find out how many components PCA chose after fitting the model using `pca.n_components_`. In this case, retaining 95% of the variance takes 40 principal components.

In [10]:
# Project both splits onto the principal components learned from the
# training predictors.
train_reduced = pca.transform(data_df)

# transform() pairs columns by position (older scikit-learn releases do not
# check column names at all), so select the test columns by the training
# feature names. This guarantees the same features in the same order; a
# feature missing from the test CSV raises KeyError here instead of the
# projection being silently computed on misaligned columns.
test_reduced = pca.transform(test_df[data_df.columns])

# One generated name per retained component: PCA_Feature_0, PCA_Feature_1, ...
new_features_list = ['PCA_Feature_' + str(x) for x in range(pca.n_components_)]

# Wrap the raw ndarrays returned by transform() in DataFrames so they can be
# concatenated with pandas objects and written out with column headers.
train_reduced_df = pd.DataFrame(train_reduced, columns=new_features_list)
test_reduced_df = pd.DataFrame(test_reduced, columns=new_features_list)

# Sanity check: both splits must end up with the same number of columns.
print(train_reduced_df.shape)
print(test_reduced_df.shape)

(953, 40)
(1145, 40)


## Save the dimension reduced train and test data

In [6]:
# Re-attach the SalePrice target as an extra column next to the PCA features.
# pd.concat only accepts pandas objects, so train_reduced_df must be the
# DataFrame built from the PCA output, not the raw ndarray from transform().
train_reduced_with_label = pd.concat(
    objs=[train_reduced_df, label_col],
    axis='columns',
)

# Persist both reduced splits; the row index is not written to the files.
train_reduced_with_label.to_csv(
    'house-prices-advanced-regression-techniques/train_dim_reduced.csv',
    index=False,
)
test_reduced_df.to_csv(
    'house-prices-advanced-regression-techniques/test_dim_reduced.csv',
    index=False,
)

TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid