# **Load "Raw" California Housing dataset**

In [None]:
# California Housing dataset
import urllib.request
from pathlib import Path

import pandas as pd

# Source of the raw CSV and the local file it is cached in.
HOUSING_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
HOUSING_CSV = Path("housing.csv")

# Download only when no local copy exists, so re-running the notebook does not
# re-fetch the file or need network access. Delete housing.csv to force a
# fresh download.
if not HOUSING_CSV.is_file():
    urllib.request.urlretrieve(HOUSING_URL, HOUSING_CSV)

# load data from csv file
housing = pd.read_csv(HOUSING_CSV)

# display the top rows
housing.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


# Fill missing values using sklearn

In [None]:
from sklearn.impute import SimpleImputer

# A median only exists for numeric columns, so set the text column aside first.
housing_num = housing.drop(columns=["ocean_proximity"])

# Learn each column's median and substitute it for the missing entries in one
# step; `imputer` stays fitted for later use.
imputer = SimpleImputer(strategy="median")
data = imputer.fit_transform(housing_num)

# The result is a plain NumPy array, not a DataFrame.
print(type(data))
print(data.shape)

<class 'numpy.ndarray'>
(20640, 9)


In [None]:
# Wrap the imputed array in a DataFrame to get pandas' summary statistics.
# The array carries no column names, so the columns are labelled 0..8.
df = pd.DataFrame(data=data)

# A count of 20640 in every column confirms no missing values remain.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,297.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


# Feature scaling using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the standardiser on the imputed array; fit() returns the estimator itself.
scaler = StandardScaler().fit(data)

# Show the fitted estimator as the cell's output.
scaler

In [None]:
# Per-feature means learned by the fitted StandardScaler (one entry per column of `data`).
scaler.mean_

array([-1.19569704e+02,  3.56318614e+01,  2.86394864e+01,  2.63576308e+03,
        5.36838857e+02,  1.42547674e+03,  4.99539680e+02,  3.87067100e+00,
        2.06855817e+05])

In [None]:
# Apply the fitted scaler to the same array it was fitted on.
data_scaled = scaler.transform(data)

# Report the container type and the (rows, columns) shape, one per line.
print(type(data_scaled), data_scaled.shape, sep="\n")

<class 'numpy.ndarray'>
(20640, 9)


In [None]:
# View the standardised array through pandas; columns are positional (0..8).
df = pd.DataFrame(data=data_scaled)

# Means should be ~0. `std` reads 1.000024 rather than exactly 1 because
# describe() uses the sample standard deviation (ddof=1), whereas the scaler
# divides by the population standard deviation.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-8.526513e-15,-1.079584e-15,5.508083e-18,3.2015730000000005e-17,-9.363741e-17,-1.101617e-17,6.885104000000001e-17,6.6097e-17,-9.363741e-17
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024
min,-2.385992,-1.447568,-2.19618,-1.207283,-1.277688,-1.256123,-1.303984,-1.774299,-1.662641
25%,-1.113209,-0.7967887,-0.8453931,-0.5445698,-0.5718868,-0.5638089,-0.5742294,-0.6881186,-0.7561633
50%,0.5389137,-0.6422871,0.02864572,-0.2332104,-0.2428309,-0.2291318,-0.2368162,-0.1767951,-0.2353337
75%,0.7784964,0.9729566,0.6643103,0.2348028,0.2537334,0.2644949,0.2758427,0.4593063,0.5014973
max,2.62528,2.958068,1.856182,16.81558,14.08779,30.25033,14.60152,5.858286,2.540411


# Feature scaling using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rebind `scaler` to a min-max scaler fitted on the imputed array; fit()
# returns the estimator itself.
scaler = MinMaxScaler().fit(data)

# Show the fitted estimator as the cell's output.
scaler

In [None]:
# Largest value seen in each column of `data` while fitting the MinMaxScaler.
scaler.data_max_

array([-1.14310e+02,  4.19500e+01,  5.20000e+01,  3.93200e+04,
        6.44500e+03,  3.56820e+04,  6.08200e+03,  1.50001e+01,
        5.00001e+05])

In [None]:
# Smallest value seen in each column of `data` while fitting the MinMaxScaler.
scaler.data_min_

array([-1.2435e+02,  3.2540e+01,  1.0000e+00,  2.0000e+00,  1.0000e+00,
        3.0000e+00,  1.0000e+00,  4.9990e-01,  1.4999e+04])

In [None]:
# Rescale the imputed array with the fitted scaler. This overwrites the
# previous `data_scaled`.
data_scaled = scaler.transform(data)

# Print the container type, then the (rows, columns) shape.
for summary in (type(data_scaled), data_scaled.shape):
    print(summary)

<class 'numpy.ndarray'>
(20640, 9)


In [None]:
# View the min-max scaled array through pandas; columns are positional (0..8).
df = pd.DataFrame(data=data_scaled)

# Every column should now span exactly 0.0 (min) to 1.0 (max).
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,0.476125,0.328572,0.541951,0.066986,0.083153,0.039869,0.081983,0.232464,0.395579
std,0.199555,0.226988,0.246776,0.055486,0.065083,0.03174,0.062873,0.13102,0.237928
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.253984,0.147715,0.333333,0.036771,0.045934,0.021974,0.045881,0.142308,0.215671
50%,0.583665,0.182784,0.54902,0.054046,0.067349,0.032596,0.067094,0.209301,0.339588
75%,0.631474,0.549416,0.705882,0.080014,0.099666,0.048264,0.099326,0.292641,0.514897
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
