In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
from sklearn.datasets import fetch_california_housing
ca_housing = fetch_california_housing(as_frame = True)

In [13]:
print(ca_housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [14]:
X = ca_housing.data

In [15]:
y = ca_housing.target

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 19)

In [19]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler  = StandardScaler()

In [22]:
X_train  =  scaler.fit_transform(X_train)

In [23]:
X_test = scaler.fit_transform(X_test)

In [24]:
from sklearn.linear_model import Lasso

In [25]:
lasso = Lasso()

In [26]:
lasso.fit(X_train,y_train)

In [27]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [30]:
y_preds = lasso.predict(X_test)

In [31]:
mean_absolute_error(y_test,y_preds)

0.9119430573559844

In [32]:
mean_squared_error(y_test,y_preds)

1.3264181489134033

In [33]:
r2_score(y_test,y_preds)

-4.109353628090062e-05

In [34]:
param_grid = {
    "alpha": [0.0001,0.001,0.01,0.1,1,10,100,1000]
}

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
lasso_cv = GridSearchCV(lasso,param_grid,cv = 3,n_jobs = -1)

In [37]:
lasso_cv.fit(X_train,y_train)

In [38]:
y_preds2 = lasso_cv.predict(X_test)

In [40]:
mean_absolute_error(y_test,y_preds2)

0.53535005943999

In [41]:
mean_squared_error(y_test,y_preds2)

0.5211195697368108

In [42]:
r2_score(y_test,y_preds2)

0.6071065637863344

In [44]:
lasso_cv.best_estimator_

In [45]:
lasso3 = Lasso(alpha = 0.001)

In [46]:
lasso3.fit(X_train,y_train)

In [47]:
lasso.intercept_

2.0670816194282944

In [53]:
coefs = lasso3.coef_

In [58]:
X_cols = X.columns.tolist()
X_cols

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [59]:
pd.DataFrame({"Feature_names":X_cols,"Coef":coefs})

Unnamed: 0,Feature_names,Coef
0,MedInc,0.836738
1,HouseAge,0.121265
2,AveRooms,-0.260897
3,AveBedrms,0.303707
4,Population,-0.001737
5,AveOccup,-0.028494
6,Latitude,-0.886599
7,Longitude,-0.860203
