## Import libraries

In [2]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline    

## Data preparation

In [3]:
# Load the raw housing table. header=None tells pandas that the file has no
# header row, so the columns get integer labels until they are renamed.
data = pd.read_csv('CaliforniaHousing/cal_housing.txt',header=None)

In [4]:
# Preview the first five rows of the freshly loaded table.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [5]:
# Replace the default integer column labels with descriptive names.
names = [
    'longitude',
    'latitude',
    'housingMedianAge',
    'totalRooms',
    'totalBedrooms',
    'population',
    'households',
    'medianIncome',
    'medianHouseValue',
]
data.columns = names

In [6]:
# Preview again to check the column labels.
data.head()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
data.describe()

Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [8]:
# (number of rows, number of columns)
data.shape

(20640, 9)

In [9]:
# Number of missing values in each column.
data.isna().sum()

longitude           0
latitude            0
housingMedianAge    0
totalRooms          0
totalBedrooms       0
population          0
households          0
medianIncome        0
medianHouseValue    0
dtype: int64

## Linear Regression with all features

In [10]:
# Target is the median house value; every other column is a predictor.
y = data['medianHouseValue']
X = data.drop(columns='medianHouseValue')

In [12]:
# Order matters: split the data first, scale the features afterwards.
# A fixed random_state yields the same train/test partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [11]:
# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Ordinary least squares on the full feature set.
# `normalize=True` is deliberately not passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# OLS predictions do not depend on feature scaling, so dropping it leaves the
# fitted model unchanged up to floating-point error.
model1 = LinearRegression()
model1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [13]:
model1.score(X_test, y_test) # R^2 (coefficient of determination) on the held-out test split; higher is better

0.6277645980446468

In [14]:
model1.intercept_  # intercept of the fitted linear model (the prediction when every input feature is 0)

207249.89589389446

In [15]:
model1.coef_  # fitted coefficients, one per input feature, in the column order of X

array([-85809.99559835, -90730.80254927,  14650.92216241, -16732.70525827,
        44956.5030505 , -43432.77763716,  19664.45740533,  76709.08047259])

In [16]:
# Test-set mean squared error (in squared units of the target); lower is better.
y_pred = model1.predict(X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

4853781771.947944

In [17]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # RMSE: square root of the MSE, in the same units as the target; lower is better

69669.08763539209

## Linear Regression with selected features

In [18]:
# Same target as before, but the two geographic coordinates are left out of
# the predictors.
y = data['medianHouseValue']
X = data.drop(columns=['medianHouseValue', 'longitude', 'latitude'])

In [19]:
# A fixed random_state yields the same train/test partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [20]:
# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Ordinary least squares on the reduced feature set.
# `normalize=True` is deliberately not passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# OLS predictions do not depend on feature scaling, so dropping it leaves the
# fitted model unchanged up to floating-point error.
model2 = LinearRegression()
model2.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [22]:
model2.score(X_test, y_test) # coefficient of determination R^2: at most 1, and can be negative on held-out data; higher is better

0.5528951664811548

In [23]:
# Test-set mean squared error (in squared units of the target); lower is better.
y_pred = model2.predict(X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

5830045395.155306

In [24]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # RMSE: square root of the MSE, in the same units as the target; lower is better

76354.73394070145

In [25]:
# worse result than with all features

## Linear Regression with Cross Validation

In [26]:
# Back to the full feature set: target is the median house value, every other
# column is a predictor.
y = data['medianHouseValue']
X = data.drop(columns='medianHouseValue')

In [27]:
# 10-fold cross-validated predictions for every row of X.
# Note: X is passed unscaled here (no StandardScaler step in this cell).
y_pred_CV = cross_val_predict(estimator=model1, X=X, y=y, cv=10)

In [28]:
metrics.mean_squared_error(y, y_pred_CV) # MSE of the cross-validated predictions, computed over all rows

5083168215.393718

In [29]:
np.sqrt(metrics.mean_squared_error(y, y_pred_CV))  # RMSE of the cross-validated predictions, computed over all rows

71296.34082751875

In [30]:
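# Slightly higher RMSE than on the single train/test split (71296 vs 69669).
# Cross-validation does not change the model; it changes how the error is
# estimated (predictions for all rows instead of one 20% test split).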

## Polynomial Regression

In [31]:
# Full feature set, split into train/test with a fixed seed.
y = data['medianHouseValue']
X = data.drop(columns='medianHouseValue')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Standardize using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# degree is the maximal degree of the generated polynomial terms (2 here: the
# original features plus their squares and pairwise products).
# include_bias=False leaves out the constant column of ones.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [32]:
# Ordinary least squares on the degree-2 polynomial features.
# `normalize=True` is deliberately not passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there).
# OLS predictions do not depend on feature scaling, so dropping it leaves the
# fitted model unchanged up to floating-point error.
model3 = LinearRegression()
model3.fit(X_train_poly, y_train)
model3.score(X_test_poly, y_test)  # R^2 on the held-out test split

0.6874007781492513

In [33]:
# Test-set mean squared error of the polynomial model; lower is better.
y_pred_poly = model3.predict(X_test_poly)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_poly)

4076152877.9206834

In [34]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred_poly))  # RMSE of the polynomial model on the test split; lower is better

63844.75607221539

In [35]:
# better result !!

## Ridge Regression

In [36]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV

In [37]:
# Data preparation: full feature set, fixed-seed 80/20 split, then
# standardization with statistics learned from the training split only.
y = data['medianHouseValue']
X = data.drop(columns='medianHouseValue')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
# Choose the best hyper-parameter, method 1: RidgeCV's built-in search.
# alpha is the regularization strength of the ridge penalty (not a learning
# rate); the candidates are 10 values log-spaced from 10^-3 to 10^2.
alpha = np.logspace(-3, 2, num=10)
ridge = RidgeCV(alphas=alpha, cv=5).fit(X_train, y_train)
ridge.alpha_  # candidate selected by 5-fold cross-validation

27.825594022071257

In [39]:
# Choose the best hyper-parameter, method 2: a generic grid search over the
# same alpha candidates with a plain Ridge estimator.
from sklearn.linear_model import Ridge
ridge_model = GridSearchCV(estimator=Ridge(), param_grid={'alpha': alpha}, cv=5)
ridge_model.fit(X_train, y_train)
ridge_model.best_params_

{'alpha': 27.825594022071257}

In [40]:
# Test-set metrics for the tuned ridge model, shown as (MSE, RMSE, R^2).
y_pred = ridge.predict(X_test)
mse_ridge = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_ridge = metrics.r2_score(y_true=y_test, y_pred=y_pred)
(mse_ridge, np.sqrt(mse_ridge), r2_ridge)

(4857481957.076677, 69695.63800609531, 0.6274808317025621)

## Lasso Regression

In [41]:
from sklearn.linear_model import LassoCV

In [42]:
from sklearn.linear_model import Lasso
# LassoCV picks the regularization strength by 5-fold cross-validation from
# the candidate values stored in `alpha`.
lasso = LassoCV(alphas=alpha, cv=5).fit(X_train, y_train)
lasso.alpha_  # selected candidate

27.825594022071257

In [43]:
# Test-set metrics for the tuned lasso model, shown as (MSE, RMSE, R^2).
y_pred = lasso.predict(X_test)
mse_lasso = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_lasso = metrics.r2_score(y_true=y_test, y_pred=y_pred)
(mse_lasso, np.sqrt(mse_lasso), r2_lasso)

(4854925758.112072, 69677.29729339444, 0.6276768660101955)