---
title: "REGULARIZED REGRESSION"
author: "Kwabena Asabere"
df-print: kable
code-overflow: wrap
execute:
    echo: true
    warning: false
    message: true
format: html
---

In [602]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [603]:
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV,ElasticNetCV,ElasticNet
from sklearn.preprocessing import StandardScaler,OneHotEncoder,PolynomialFeatures,FunctionTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate,cross_val_predict,GridSearchCV
from sklearn.metrics import root_mean_squared_error,mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [604]:
housing = pd.read_csv(r"C:\Users\KAsab\Desktop\PYTHON\california_housing.csv",engine = "pyarrow")

In [605]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [606]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [607]:
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [608]:
housing["total_bedrooms"]=housing["total_bedrooms"].fillna(housing["total_bedrooms"].median())

In [609]:
housing.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [610]:
housing["rooms_per_house"] = housing['total_bedrooms']/housing["total_rooms"]
housing["people_per_house"] = housing["population"]/housing["households"]


In [611]:
coords = housing[["latitude","longitude"]]


In [612]:
kmeans = make_pipeline(StandardScaler(),KMeans(n_clusters = 5,random_state = 42))
housing["cluster"]= kmeans.fit_predict(coords)
housing["cluster"]= pd.Categorical(housing["cluster"])

In [613]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-null  float64 
 1   latitude            20640 non-null  float64 
 2   housing_median_age  20640 non-null  float64 
 3   total_rooms         20640 non-null  float64 
 4   total_bedrooms      20640 non-null  float64 
 5   population          20640 non-null  float64 
 6   households          20640 non-null  float64 
 7   median_income       20640 non-null  float64 
 8   median_house_value  20640 non-null  float64 
 9   ocean_proximity     20640 non-null  object  
 10  rooms_per_house     20640 non-null  float64 
 11  people_per_house    20640 non-null  float64 
 12  cluster             20640 non-null  category
dtypes: category(1), float64(11), object(1)
memory usage: 1.9+ MB


In [614]:
numeric_features = ["latitude","longitude","rooms_per_house","people_per_house"]
log_features = housing.iloc[:,2:8].columns.to_list()
cat_features = housing.select_dtypes(include = ["object","category"]).columns.to_list()

In [615]:
X = housing.drop(columns = ["median_house_value"])
y = housing["median_house_value"]


In [616]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 14196 to 15795
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           16512 non-null  float64 
 1   latitude            16512 non-null  float64 
 2   housing_median_age  16512 non-null  float64 
 3   total_rooms         16512 non-null  float64 
 4   total_bedrooms      16512 non-null  float64 
 5   population          16512 non-null  float64 
 6   households          16512 non-null  float64 
 7   median_income       16512 non-null  float64 
 8   ocean_proximity     16512 non-null  object  
 9   rooms_per_house     16512 non-null  float64 
 10  people_per_house    16512 non-null  float64 
 11  cluster             16512 non-null  category
dtypes: category(1), float64(10), object(1)
memory usage: 1.5+ MB


In [617]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [618]:
numeric_transformer = Pipeline( steps = [
    ("imputer",SimpleImputer(strategy = "median")),
    ("scaler",StandardScaler())
])

log_transformer = Pipeline(steps = [
    ("imputer",SimpleImputer(strategy = "median")),
    ("log",FunctionTransformer(np.log1p)),
    ("scaler",StandardScaler())
])

cat_transformer = Pipeline(steps = [
    ("imputer",SimpleImputer(strategy = "most_frequent")),
    ("onehot",OneHotEncoder(handle_unknown = "ignore",sparse_output = False))
])

In [619]:
preprocessor = ColumnTransformer(transformers =[
    ("log",log_transformer,log_features),
    ("num",numeric_transformer,numeric_features),
    ("cat",cat_transformer,cat_features)
])

In [620]:
pipeline = Pipeline( steps = [
    ("preprocessor",preprocessor),
    ("ridge",Ridge())
])

In [621]:
param_grid = {
    "ridge__alpha":[0.1,1.0,10,100,1000]
}

The names of the hyperparameters in the `param_grid` must contain the name of the  step in the pipeline followed by 2 underscores and then the hyperparameter.

In [622]:
grid = GridSearchCV(pipeline,param_grid,cv = 5,scoring = "neg_root_mean_squared_error",n_jobs = -1)

In [623]:
grid.fit(X_train,y_train)

In [624]:
print(f"Best parameters:{grid.best_params_}")
print(f"Best score:{grid.best_score_}")

Best parameters:{'ridge__alpha': 1.0}
Best score:-67919.24629003313


In [625]:
y_pred = grid.best_estimator_.predict(X_test)

In [626]:
rmse = root_mean_squared_error(y_test,y_pred)

In [627]:
print("Root Mean Squared Error:" ,rmse)

Root Mean Squared Error: 83421.15981536356
