## Import the California housing data from sklearn

In [None]:
from sklearn.datasets import fetch_california_housing

In [None]:
# Load the California housing dataset as pandas objects (as_frame=True),
# already separated into the feature table and the target (return_X_y=True).
housing_data_features, housing_data_labels = fetch_california_housing(
    return_X_y=True,
    as_frame=True,
)

In [None]:
# Dimensions of the feature table as (rows, columns).
housing_data_features.shape

(20640, 8)

## How many missing values are present in the dataset?

In [None]:
# Count the missing (NaN) entries in each feature column.
housing_data_features.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

## Split the data with random_state=0, shuffle=False and test_size=0.2

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Ordered hold-out split: with shuffle=False the rows keep their original
# order, so the first 80% become the training set and the last 20% the test set.
# random_state only influences shuffling; it is kept here to match the
# exercise statement.
X_train, X_test, y_train, y_test = train_test_split(
    housing_data_features,
    housing_data_labels,
    test_size=0.2,
    shuffle=False,
    random_state=0,
)

In [None]:
# Feature names carried by the test split.
X_test.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [None]:
# HouseAge of the third row of the test split (.iloc is positional, so 2 means
# "third row", regardless of the index labels inherited from the full table).
X_test['HouseAge'].iloc[2]

28.0

In [None]:
# Population of the first row (position 0) of the training split.
X_train['Population'].iloc[0]

322.0

In [None]:
# Mean of the training targets.
y_train.mean()

2.02067031310562

In [None]:
# Median of the held-out test targets.
y_test.median()

2.1475

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# Two-stage model: standardize the features, then fit ordinary least squares.
# The step names are used later to look up the fitted estimator
# (e.g. pipeline['regressor']).
pipeline = Pipeline(steps=[
    # Step 1: standardize the data
    ('scaler', StandardScaler()),
    # Step 2: apply linear regression
    ('regressor', LinearRegression()),
])

In [None]:
# Fit on the training split only: the scaler learns its statistics from X_train
# and the regressor is then fitted on the scaled training features.
pipeline.fit(X_train, y_train)

In [None]:
# Fitted coefficients, one per feature. They are on the standardized scale
# because the regressor is trained on the scaler's output.
pipeline['regressor'].coef_

array([ 0.83817967,  0.12032096, -0.32135834,  0.36456599, -0.00170674,
       -0.04524631, -0.87994217, -0.82634126])

In [None]:
# Fitted intercept. Because the training features are centered by the scaler,
# it coincides (up to floating-point rounding) with y_train.mean() shown earlier.
pipeline['regressor'].intercept_

2.020670313105614

In [None]:
# Score of the fitted pipeline on the held-out split; for this regressor the
# score is R^2, the same value r2_score reports for the test predictions below.
pipeline.score(X_test, y_test)

0.6605140591531993

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Test-set predictions; the pipeline applies the already-fitted scaler to
# X_test before the regressor predicts.
y_pred = pipeline.predict(X_test)

In [None]:
# RMSE: square root of the test-set mean squared error, which puts the error
# back in the units of the target.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.7033383507521878


In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 (coefficient of determination) of the test-set predictions.
variance_score = r2_score(y_true=y_test, y_pred=y_pred)
print("Variance Score (R^2):", variance_score)

Variance Score (R^2): 0.6605140591531993


In [None]:
from sklearn.metrics import max_error

In [None]:
# Largest absolute residual over the test set (worst single prediction).
max_err = max_error(y_true=y_test, y_pred=y_pred)
print("Maximum Error:", max_err)

Maximum Error: 7.260453292958372


In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Average absolute residual over the test set.
absolute_err = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", absolute_err)

Mean Absolute Error: 0.5168526993787042


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Average squared residual over the test set; this is the square of the RMSE
# reported earlier.
mean_squared_err = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mean_squared_err)

Mean Squared Error: 0.4946848356388075


## Perform SGD regression on the dataset (scaled using `StandardScaler()`), using default hyperparameters and `random_state=0`.

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
# NOTE: this rebinds `pipeline`, replacing the LinearRegression pipeline built
# earlier in the notebook; from here on `pipeline` refers to the SGD model.
pipeline = Pipeline(steps=[
    # Step 1: standardize the data
    ('scaler', StandardScaler()),
    # Step 2: linear model fitted by stochastic gradient descent (seeded for reproducibility)
    ('regressor', SGDRegressor(random_state=0)),
])

In [None]:
# Fit the scaler + SGD regressor on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Coefficients learned by the SGD regressor (standardized feature scale);
# compare with the LinearRegression coefficients shown earlier.
pipeline['regressor'].coef_

array([ 0.84046697,  0.112331  , -0.41213039,  0.21595971, -0.01781887,
       -0.01480892, -0.87394103, -0.83913104])

In [None]:
# Intercept learned by the SGD regressor; note it is returned as a one-element
# array rather than the scalar LinearRegression exposed.
pipeline['regressor'].intercept_

array([2.01123921])