## Imports 

In [2]:
import numpy as np
import pandas as pd 
import os
import matplotlib as mpl
import matplotlib.ticker as ticker
import sklearn 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.preprocessing import OrdinalEncoder# for oon
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error 
import category_encoders as ce #Encoding 

## Load Data into dataframe

In [3]:
# Read the housing dataset from the working directory and preview its first rows.
HousingData = pd.read_csv("housing.csv")
HousingData.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## General Statistics 

In [4]:
# Summary statistics for the numeric columns; a count below the row total flags missing values.
HousingData.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [40]:
# Column dtypes and non-null counts. ocean_proximity is read in as text labels and has to be
# encoded numerically before modelling.
# NOTE(review): the recorded output (In [40]) appears to come from a run made after the
# imputation and encoding cells further down -- re-run in order to see the raw dtypes.
HousingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20640 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null int32
dtypes: float64(9), int32(1)
memory usage: 1.5 MB


## Preprocessing Technique #1: Handling of NaN values before standardization and encoding of the categorical feature vector.

In [5]:
# Keep the rows that have a missing value in at least one column; only the first five are retained as a preview.
NAN_location = HousingData.loc[HousingData.isna().any(axis="columns")].head()

In [6]:
# Show the preview of rows that contain missing values.
NAN_location

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
290,-122.16,37.77,47.0,1256.0,,570.0,218.0,4.375,161900.0,NEAR BAY
341,-122.17,37.75,38.0,992.0,,732.0,259.0,1.6196,85100.0,NEAR BAY
538,-122.28,37.78,29.0,5154.0,,3741.0,1273.0,2.5762,173400.0,NEAR BAY
563,-122.24,37.75,45.0,891.0,,384.0,146.0,4.9489,247100.0,NEAR BAY
696,-122.1,37.69,41.0,746.0,,387.0,161.0,3.9063,178400.0,NEAR BAY


In [21]:
# Impute the missing total_bedrooms entries with the column mean.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the `HousingData['total_bedrooms']` selection:
# that chained in-place form emits a FutureWarning on pandas >= 2.1 and no longer
# updates the parent DataFrame once Copy-on-Write is active.
HousingData['total_bedrooms'] = HousingData['total_bedrooms'].fillna(
    HousingData['total_bedrooms'].mean()  # mean() skips NaN by default
)
HousingData


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [37]:
# True if any total_bedrooms entry is still missing; False is expected after imputation.
# Only this one column is checked, not the whole frame.
HousingData["total_bedrooms"].isna().to_numpy().any()

False

## Preprocessing Technique #2: Encoding the ocean_proximity categorical feature vector. Encoding is done on the string variable prior to standardization for ridge regression.

In [28]:
# Encoder that maps each distinct ocean_proximity label to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder; confirm it is the
# intended choice for an input feature.
Ocean_proximity_encoder = preprocessing.LabelEncoder()

In [29]:
# Fit the encoder on the ocean_proximity column and overwrite that column with the resulting
# integer codes. This mutates HousingData, so the original labels are no longer in the frame.
HousingData["ocean_proximity"] = Ocean_proximity_encoder.fit_transform(
    HousingData["ocean_proximity"]
)

In [30]:
#housing_num = HousingData.drop("ocean_proximity", axis=1)

In [31]:
#Housing_location= HousingData[["ocean_proximity"]]
#Housing_location

In [24]:
# `Housing_location` was only ever assigned in a cell that is now commented out, so this cell
# raised NameError on Restart & Run All. Select the column here, keeping it two-dimensional
# (double brackets) because the encoder expects a 2-D input.
Housing_location = HousingData[["ocean_proximity"]]

ordinal_encoder = OrdinalEncoder()
# When the cells run top to bottom the column already holds the LabelEncoder's integer codes,
# so each code is mapped to the equal float value.
Housing_location_encoded = ordinal_encoder.fit_transform(Housing_location)
Housing_location_encoded

array([[3.],
       [3.],
       [3.],
       ...,
       [1.],
       [1.],
       [1.]])

In [25]:
# One-hot encode ocean_proximity: one indicator column per category, returned as a sparse matrix.
# The column is selected directly from HousingData (as a 2-D frame) because the
# `Housing_location` name this cell used to rely on is not defined by any live cell,
# which made the cell fail with NameError on a fresh kernel.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(HousingData[["ocean_proximity"]])
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [26]:
# Densify the sparse one-hot matrix purely for inspection; each row carries a single 1.
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [32]:
HousingData  # confirm that ocean_proximity now holds integer codes instead of text labels

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,1
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,1
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,1
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,1


In [39]:
# Number of rows per encoded ocean_proximity category.
HousingData["ocean_proximity"].value_counts()

0    9136
1    6551
4    2658
3    2290
2       5
Name: ocean_proximity, dtype: int64

## Preprocessing Technique #3: Z-score standardization for ridge regression. Because ridge regression applies a penalty to the coefficients, standardization is needed so that the penalty acts equally on all features.


In [33]:
# Z-score every column of HousingData (zero mean, unit variance), as ridge regression's penalty
# is sensitive to feature scale.
# NOTE(review): the whole frame is passed in, so the median_house_value column and the integer
# ocean_proximity codes are standardized too -- confirm that is intended before modelling.
scaler = StandardScaler()
scaled_HousingData = scaler.fit(HousingData).transform(HousingData)

In [34]:
# The scaler's result is displayed as a plain array, so column labels are not shown here.
scaled_HousingData

array([[-1.32783522,  1.05254828,  0.98214266, ...,  2.34476576,
         2.12963148,  1.2910888 ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  2.33223796,
         1.31415614,  1.2910888 ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  1.7826994 ,
         1.25869341,  1.2910888 ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ..., -1.14259331,
        -0.99274649, -0.11673923],
       [-0.87362627,  1.77823747, -0.84539315, ..., -1.05458292,
        -1.05860847, -0.11673923],
       [-0.83369581,  1.75014627, -1.00430931, ..., -0.78012947,
        -1.01787803, -0.11673923]])

In [38]:
# Per-column means of the unscaled DataFrame, kept as a reference point for the scaled values.
HousingData.mean(axis="index")

longitude               -119.569704
latitude                  35.631861
housing_median_age        28.639486
total_rooms             2635.763081
total_bedrooms           537.870553
population              1425.476744
households               499.539680
median_income              3.870671
median_house_value    206855.816909
ocean_proximity            1.165843
dtype: float64

In [36]:
 # verify mean is 0, and std is 1 
# Per-column mean first, then per-column standard deviation, one vector per line.
print(scaled_HousingData.mean(axis=0), scaled_HousingData.std(axis=0), sep="\n")

[-8.52651283e-15 -1.07958431e-15  5.50808322e-18  3.20157337e-17
  1.92782913e-16 -1.10161664e-17  6.88510403e-17  6.05889155e-17
 -9.36374148e-17 -5.50808322e-18]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


As we can see, the mean is 0 (up to floating-point round-off) and the standard deviation is 1 across all features.

The DataFrame is now preprocessed, and the next steps of dimensionality reduction via PCA or ridge regression can take place.