In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import sklearn.datasets as datasets


from sklearn.preprocessing import PolynomialFeatures

# Cross-Validation tools
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Housing Dataset Loading

In [None]:
# Fetch the California housing dataset through scikit-learn's dataset helpers.
# The returned bundle exposes (among others) `data`, `target`,
# `feature_names`, `target_names` and `DESCR`.
from sklearn import datasets

chd = datasets.fetch_california_housing()

In [None]:
# Walk through every field stored in the dataset bundle and dump its contents.
print('Explore the dataset:')
for field_name, field_value in chd.items():
  print('Field: ', field_name)
  print(field_value)

Explore the dataset:
Field:  data
[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]
Field:  target
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
Field:  frame
None
Field:  target_names
['MedHouseVal']
Field:  feature_names
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Field:  DESCR
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

   

# Linear Regression with Scikit-Learn

In [None]:
"""
Split the dataset into training and test set. 
In practice, you will not have access to the test set labels. 
You have to finetune your hyperparams within the training set.
"""
# Hold out 10% of the samples as a test set; the fixed random_state makes the
# split (and therefore the numbers printed below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    chd.data, chd.target, test_size=0.1, random_state=1
)

# Ordinary least-squares linear regression; fit() learns the weights and the
# bias from the training split only.
LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)

print('Weights after training: ', LR.coef_)
print('Bias after training: ', LR.intercept_)

# Mean squared error on both splits. scikit-learn metrics take the ground
# truth first: mean_squared_error(y_true, y_pred). MSE is symmetric, so the
# value is unaffected, but keeping the documented order avoids silent mistakes
# if the metric is later swapped for an asymmetric one.
train_mse = mean_squared_error(y_train, LR.predict(X_train))
test_mse = mean_squared_error(y_test, LR.predict(X_test))
print('Linear regression Train MSE error: ', train_mse)
print('Linear regression Test MSE error: ', test_mse)

Weights after training:  [ 4.37733780e-01  9.53253650e-03 -1.08493647e-01  6.48451190e-01
 -4.59029268e-06 -3.59055200e-03 -4.23228504e-01 -4.37158591e-01]
Bias after training:  -37.1940870363519
Linear regression Train MSE error:  0.5229488752694601
Linear regression Test MSE error:  0.5368479730539837


# (Optional) Linear Regression with Polynomial Feature Transformation and L2 Regularization

From scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html): 

"PolynomialFeatures: Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]."

This transformation is useful when the relationship between your input features and output labels is non-linear.
 

In [None]:
# Preprocessing objects shared by every repetition of the experiment.
scaler = StandardScaler()
chd = datasets.fetch_california_housing()
feature_trf = PolynomialFeatures(2)  # degree-2 polynomial feature expansion

N_compare = 50  # number of random train/test splits to compare on
non_linear_wins = 0  # splits on which the polynomial Ridge model has the lower test MSE

# Candidate L2 penalties for the grid search; loop-invariant, so built once.
ridge_params = {'alpha':[1, 10, 100, 1000, 10000, 100000, 1000000]}

print ('----------------Comparing Linear Regression vs Linear Regression with Feature Transformation------------------')
for i in range(N_compare):
  # Seed each split with the loop index: every repetition gets a different
  # split, yet re-running the cell reproduces the same sequence of splits.
  X_train, X_test, y_train, y_test = train_test_split(chd.data,chd.target,test_size=0.1, random_state=i)

  # Standardise with statistics estimated on the training split only.
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  # Expand the features once per split and reuse the result.
  X_train_poly = feature_trf.fit_transform(X_train)
  X_test_poly = feature_trf.transform(X_test)

  # GridSearchCV (refit=True by default) refits the best configuration on the
  # whole training split, so best_estimator_ is already trained and can be
  # used for prediction without a second manual fit.
  NLR = GridSearchCV(linear_model.Ridge(), param_grid=ridge_params).fit(X_train_poly, y_train).best_estimator_
  LR = linear_model.LinearRegression().fit(X_train, y_train)

  # Metrics take the ground truth first: mean_squared_error(y_true, y_pred).
  lr_err = mean_squared_error(y_test, LR.predict(X_test))
  nlr_err = mean_squared_error(y_test, NLR.predict(X_test_poly))
  print ('Linear regression MSE error: ', lr_err )
  print ('Nonlinear regression MSE error: ', nlr_err)

  if nlr_err < lr_err:
    non_linear_wins +=1
  print ('----------------------------------------------------------------------------------------------------------------')

----------------Comparing Linear Regression vs Linear Regression with Feature Transformation------------------
Linear regression MSE error:  0.4904271033535111
Nonlinear regression MSE error:  0.39373065206744484
----------------------------------------------------------------------------------------------------------------
Linear regression MSE error:  0.5325660613715891
Nonlinear regression MSE error:  0.5066057839074886
----------------------------------------------------------------------------------------------------------------
Linear regression MSE error:  0.5231823209067591
Nonlinear regression MSE error:  0.4405909108004551
----------------------------------------------------------------------------------------------------------------
Linear regression MSE error:  0.5180814398233858
Nonlinear regression MSE error:  0.5296826496699174
----------------------------------------------------------------------------------------------------------------
Linear regression MSE error:  0.

In [None]:
# Share of the N_compare splits on which the polynomial-feature model achieved
# the lower test MSE, expressed as a percentage.
win_percentage = non_linear_wins*100/N_compare
print('% of times Polynomial Feature wins:', win_percentage, '%')

% of times Polynomial Feature wins: 72.0 %
