In [None]:
# California Housing dataset
from pathlib import Path
import urllib.request

import pandas as pd

HOUSING_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
HOUSING_CSV = Path("housing.csv")

# Download only when no local copy exists, so re-running this cell (or doing
# Restart & Run All) does not hit the network every time. The data is first
# written to a ".part" file and renamed afterwards, so an interrupted download
# is never mistaken for a complete housing.csv on the next run.
if not HOUSING_CSV.exists():
    partial_csv = HOUSING_CSV.with_name(HOUSING_CSV.name + ".part")
    urllib.request.urlretrieve(HOUSING_URL, partial_csv)
    partial_csv.replace(HOUSING_CSV)

# load data from csv file
housing = pd.read_csv(HOUSING_CSV)

# display the top rows
housing.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [None]:
# Split the raw table into features and regression target.
# With inplace=False (the default) drop() returns a new DataFrame, so the
# original `housing` dataset is not affected.
housing_data = housing.drop(columns="median_house_value", inplace=False)
housing_target = housing["median_house_value"].copy()

# Sanity-check the container types: features are a DataFrame, the target a Series.
print(type(housing_data))
print(type(housing_target))

# Column names of the feature table, in their original order.
feature_names = list(housing_data.columns)
print(type(feature_names))
print(feature_names)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'list'>
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']


In [None]:
# Summarize each column's dtype and non-null count to spot missing values
# before preprocessing (in the recorded output, only total_bedrooms has fewer
# non-null entries than there are rows).
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# The last entry of feature_names is handled as the categorical column;
# every entry before it is handled as numeric.
*numeric_columns, categorical_column = feature_names

# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Send each column group through its own transformer; the transformed groups
# are concatenated column-wise in the order listed here.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_columns),
    ('cat', OneHotEncoder(), [categorical_column]),
])

# Fit the transformers on the feature table and transform it in one step.
housing_preprocessed = full_pipeline.fit_transform(housing_data)

In [None]:
# Check the shape of the preprocessed matrix: one row per input row; the
# columns are the scaled numeric features followed by the one-hot indicator
# columns produced for ocean_proximity.
print(housing_preprocessed.shape)

(20640, 13)


# Ordinary Least Squares

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = housing_preprocessed
y = housing_target.to_numpy()

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every number printed below, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least squares: no regularization.
lr = LinearRegression()
lr.fit(X_train, y_train)

# One coefficient per preprocessed feature column, then the intercept.
print(lr.coef_)

print(lr.intercept_)

# score() returns the coefficient of determination R^2 (higher is better,
# 1.0 is a perfect fit), not a loss, so it is labelled accordingly.
print("training R^2: %.3f" % lr.score(X_train, y_train))
print("testing R^2: %.3f" % lr.score(X_test, y_test))

[-53501.62245579 -54020.80760721  12433.57277234 -11873.86162379
  30575.04737412 -40973.17771109  26574.8186763   73865.29778874
 -24736.8583481  -64539.42542077 138284.68594797 -29754.30730686
 -19254.09487224]
243606.4449631918
training loss: 0.641
testing loss: 0.661


# Ridge regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X = housing_preprocessed
y = housing_target.to_numpy()

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every number printed below, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression with an L2 penalty on the coefficients; alpha sets the
# penalty strength.
lr = Ridge(alpha=.9)

lr.fit(X_train, y_train)

# One coefficient per preprocessed feature column, then the intercept.
print(lr.coef_)

print(lr.intercept_)

# score() returns the coefficient of determination R^2 (higher is better,
# 1.0 is a perfect fit), not a loss, so it is labelled accordingly.
print("training R^2: %.3f" % lr.score(X_train, y_train))
print("testing R^2: %.3f" % lr.score(X_test, y_test))

[-54060.49260591 -55211.11146506  12840.47971052  -9219.95402867
  29466.20621683 -43298.69970356  28287.92065345  72723.20593147
 -16046.52190405 -54924.10279072 101088.56503101 -19099.10016495
 -11018.84016085]
234634.6069015277
training loss: 0.644
testing loss: 0.651


# Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X = housing_preprocessed
y = housing_target.to_numpy()

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every number printed below, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression with an L1 penalty on the coefficients, which can drive
# some of them to exactly zero. alpha sets the penalty strength; max_iter caps
# the number of solver iterations.
lr = Lasso(alpha=.6, max_iter=6000)

lr.fit(X_train, y_train)

# One coefficient per preprocessed feature column, then the intercept.
print(lr.coef_)

print(lr.intercept_)

# score() returns the coefficient of determination R^2 (higher is better,
# 1.0 is a perfect fit), not a loss, so it is labelled accordingly.
print("training R^2: %.3f" % lr.score(X_train, y_train))
print("testing R^2: %.3f" % lr.score(X_test, y_test))

[-51817.49676602 -52276.43275926  13124.09588038  -9011.08797084
  29350.30326284 -43893.01233794  28560.92518397  72898.83308174
      0.         -41334.83628702  97722.18459807  -4099.79040659
   5122.15574293]
219811.48500060156
training loss: 0.640
testing loss: 0.666
