In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset; `data` is the dict-like bundle
# whose fields are inspected and unpacked in the cells below.
data = fetch_california_housing()

In [37]:
# List the fields available on the loaded dataset bundle.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [42]:
# Show the names of the feature columns stored under data['data'].
data['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [3]:
# Unpack the feature matrix and the regression target from the dataset bundle.
X, y = data['data'], data['target']

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Chain feature scaling and a linear regressor into one estimator so that
# both steps are always fitted and applied together.
from sklearn.pipeline import Pipeline

model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
)

# Display the assembled pipeline.
model

Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor', LinearRegression())])

In [6]:
# Choose between two feature scalers with a cross-validated grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Each candidate replaces the pipeline's "scaler" step as a whole.
param_grid = {
    'scaler': [
        StandardScaler(),
        MinMaxScaler(),
    ],
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=100,     # number of cross-validation folds per candidate
    verbose=1,  # report fitting progress
)

In [7]:
# Fitting the grid search runs the K-fold cross-validation: every scaler
# candidate is fitted and validated once per fold on the training data.
grid_search.fit(X_train, y_train)

Fitting 100 folds for each of 2 candidates, totalling 200 fits


GridSearchCV(cv=100,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('regressor', LinearRegression())]),
             param_grid={'scaler': [StandardScaler(), MinMaxScaler()]},
             verbose=1)

In [8]:
# Which scaler candidate was selected by the search.
grid_search.best_params_

{'scaler': StandardScaler()}

In [9]:
# Score of the selected model on the training data it was fitted on.
# No `scoring` was passed to GridSearchCV, so the estimator's default
# scorer is used (presumably R^2 for a regressor; confirm against the docs).
grid_search.score(X_train, y_train)

0.6088968118672871

In [10]:
# Validation performance of the selected candidate:
# its score averaged across all K validation folds.
grid_search.best_score_

0.6001084724913038

In [11]:
# Score of the selected model on the held-out test set, which was not
# passed to grid_search.fit.
grid_search.score(X_test, y_test)

0.5943232652466204

In [12]:
# After the K-fold CV, the grid search ran one more fit with the best
# candidate, this time using the entire training set. Keep a handle on
# that refitted pipeline.
model_best = grid_search.best_estimator_

In [50]:
# Predict the target for every row of the held-out test set.
prices = grid_search.predict(X_test)

In [29]:
# Tabulate the predictions under a "Price" column, scaled by 10,000 so the
# values are expressed in tens of dollars.
# NOTE(review): scikit-learn documents this dataset's target in units of
# $100,000, which is what makes x10,000 "tens of dollars"; if plain dollars
# are wanted the factor would be 100,000 - confirm the intended unit.
predicted_prices = pd.DataFrame(prices, columns=["Price"]).mul(10000)

# Show the table rounded to two decimals.
predicted_prices.round(2)

Unnamed: 0,Price
0,22811.07
1,27900.91
2,19033.28
3,10176.03
4,29485.24
...,...
4123,16167.53
4124,24091.88
4125,8418.70
4126,27984.95
