In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def load_housing_data():
    """Return the housing dataset as a pandas DataFrame.

    The CSV is read from ``datasets/housing/housing.csv``. When that file is
    missing, the archive ``datasets/housing.tgz`` is unpacked into
    ``datasets``; the archive itself is downloaded first only if it is not
    already on disk.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    # Key the work on the CSV rather than the tarball: an archive left behind
    # by an interrupted run (or a deleted CSV) must still be unpacked.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # Use the "data" extraction filter where the running Python
            # supports it, so members cannot escape the target directory.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the full dataset; the exploration cells below all read from `housing`.
housing = load_housing_data()


In [None]:
# Peek at the first rows of the loaded data.
housing.head()

In [None]:
# Summary of the frame's columns (dtypes and non-null counts).
housing.info()

In [None]:
# How many rows carry each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Descriptive statistics for the columns of the frame.
housing.describe()

In [None]:
# One histogram per column. Ending with plt.show() matches the other plotting
# cells in this notebook and keeps the cell output to the figure itself
# rather than also echoing the array of Axes objects returned by hist().
housing.hist(bins=50, figsize=(12, 6))
plt.show()

In [None]:
# Bucket median_income into five labelled bins; the resulting column is what
# the stratified split further down stratifies on.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

In [None]:
# Bar chart of the number of rows per income category, in category order.
(
    housing["income_cat"]
    .value_counts()
    .sort_index()
    .plot.bar(rot=0, grid=True)
)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
plt.show()

In [None]:
# for 10 splits 
from sklearn.model_selection import StratifiedShuffleSplit

# Ten stratified 80/20 shuffles keyed on income_cat; each entry of
# strat_splits is a [train, test] pair of row subsets of `housing`.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
strat_splits = [
    [housing.iloc[train_index], housing.iloc[test_index]]
    for train_index, test_index in splitter.split(housing, housing["income_cat"])
]
# The rest of the notebook works with the first split only.
strait_train_set, Strait_test_set = strat_splits[0]

In [None]:
# Share of the test set falling in each income category, in category order.
Strait_test_set["income_cat"].value_counts().div(len(Strait_test_set)).sort_index()

In [None]:
# Remove the income category column now that the stratified split is done.
# Rebinding to a new frame (instead of drop(..., inplace=True)) leaves the
# frames stored in strat_splits untouched, and errors="ignore" makes the cell
# safe to re-run: the in-place version raised KeyError on a second execution.
strait_train_set = strait_train_set.drop(columns="income_cat", errors="ignore")
Strait_test_set = Strait_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# From here on `housing` is a private copy of the training split, so the
# exploration below cannot modify strait_train_set.
housing = strait_train_set.copy()
housing.plot.scatter(x="longitude", y="latitude", grid=True, alpha=0.2)

In [None]:
# Same map, with marker size driven by population and colour by
# median_house_value.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,
    c=housing["median_house_value"],
    label="population",
    cmap="jet",
    colorbar=True,
    legend=True,
    sharex=False,
    figsize=(10, 7),
)

In [None]:
# Pairwise correlations between the numeric columns. Leaving the frame as the
# cell's last expression uses the notebook's rich table display instead of
# the plain-text dump that print() produced.
corr_matrix = housing.corr(numeric_only=True)
corr_matrix

In [None]:
# Correlation of each column with median_house_value, largest first.
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
# median_income against median_house_value; the low alpha keeps dense
# regions readable.
housing.plot.scatter(
    x="median_income",
    y="median_house_value",
    grid=True,
    alpha=0.1,
)
plt.show()

In [None]:
# Try out ratio features built from the raw count columns.
housing["rooms_per_house"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_ratio"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["people_per_house"] = housing["population"].div(housing["households"])

In [None]:
# Recompute the correlations now that the ratio columns exist, then rank
# every column by its correlation with median_house_value.
corr_matrix = housing.corr(numeric_only=True)
corr_matrix.loc[:, "median_house_value"].sort_values(ascending=False)

In [None]:
# Rebuild `housing` from the training split as the feature frame (target
# column removed) and keep the target separately as the labels.
housing = strait_train_set.drop(columns="median_house_value")
housing_labels = strait_train_set.loc[:, "median_house_value"].copy()

In [None]:
# Boolean mask of the rows that have a missing value in at least one column,
# followed by a look at the first few of them.
null_rows_index = housing.isna().any(axis="columns")
housing.loc[null_rows_index].head()

In [None]:
# Double brackets keep this a one-column DataFrame rather than a Series.
housing_cat = housing[["ocean_proximity"]]

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [None]:
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities to k-means centres.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to KMeans.
    gamma : float, default=1.0
        ``gamma`` argument forwarded to ``rbf_kernel``.
    random_state : default=None
        Forwarded unchanged to KMeans.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.random_state = random_state
        self.gamma = gamma
        self.n_clusters = n_clusters

    def fit(self, X, y=None, sample_weight=None):
        """Fit a KMeans model on X (optionally weighted); ``y`` is unused."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        # Return the estimator itself so calls can be chained.
        return self

    def transform(self, X):
        """Return the RBF kernel between each row of X and every fitted centre."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is ignored."""
        feature_names = []
        for cluster_id in range(self.n_clusters):
            feature_names.append(f"Cluster {cluster_id} similarity")
        return feature_names

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric column names. The third entry previously read "housing_mediian_age",
# which matches no column; the dataset column is housing_median_age.
# NOTE(review): "population" is not in this list although other cells treat it
# as a numeric column -- confirm whether the omission is intentional.
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "households", "median_income"]
cat_attribs = ["ocean_proximity"]
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                             OneHotEncoder(handle_unknown="ignore"))


In [None]:
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

def column_ratio(X):
    """Divide column 0 of X by column 1, returning a single-column 2-D result."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio step: always ``["ratio"]``.

    Both arguments are accepted and ignored.
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a fresh pipeline: median imputation, column ratio, standard scaling."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Applied to every column not claimed by a named transformer below (one
# column remaining: housing_median_age): median imputation, then scaling.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Median imputation, natural log, then scaling. "one-to-one" means the output
# feature names are the same as the input feature names.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Similarity-to-cluster-centre features computed from the coordinates.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)
    

In [None]:
# Display the assembled ColumnTransformer.
preprocessing

In [None]:
# Fit the preprocessing on the training features and transform them in one
# step, then show the shape of the result.
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

In [None]:
# List the output feature names. The method has to be called: without the
# parentheses the cell only displayed the bound-method object.
preprocessing.get_feature_names_out()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score 
import joblib

forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))

# The 10-fold cross-validation is expensive, so its scores are cached on
# disk. If the cache is missing (e.g. a fresh checkout) the scores are
# computed and saved, so the notebook still runs from a clean kernel.
forest_rmses_path = Path("forest_rmses.pkl")
if forest_rmses_path.is_file():
    # NOTE: joblib.load unpickles the file; only load caches you created.
    forest_rmses = joblib.load(forest_rmses_path)
else:
    forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,
                                    scoring="neg_root_mean_squared_error",
                                    cv=10)
    joblib.dump(forest_rmses, forest_rmses_path)



In [None]:
# Summary statistics of the per-fold RMSE values.
pd.Series(forest_rmses).describe()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

# Hyperparameters sampled by the search: the number of geographic clusters
# in the preprocessing step and the forest's max_features.
param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),
                  'random_forest__max_features': randint(low=2, high=20)}

# The search is expensive, so the fitted object is cached on disk. If the
# cache is missing the search is run and saved, so the notebook still works
# from a clean kernel without a pre-existing pickle.
rnd_search_path = Path("rnd_search.pkl")
if rnd_search_path.is_file():
    # NOTE: joblib.load unpickles the file; only load caches you created.
    rnd_search = joblib.load(rnd_search_path)
else:
    rnd_search = RandomizedSearchCV(
        full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
        scoring='neg_root_mean_squared_error', random_state=42)
    rnd_search.fit(housing, housing_labels)
    joblib.dump(rnd_search, rnd_search_path)

In [None]:
# Hyperparameter combination selected by the randomized search.
rnd_search.best_params_

In [None]:
final_model = rnd_search.best_estimator_  # includes preprocessing
feature_importances = final_model["random_forest"].feature_importances_
# The stray `feature_importances.round(2)` statement was removed: its result
# was neither stored nor the cell's last expression, so it had no effect.
# Pair each importance with its feature name, most important first.
sorted(zip(feature_importances,
           final_model["preprocessing"].get_feature_names_out()),
       reverse=True)

In [None]:
from sklearn.metrics import root_mean_squared_error

# Split the held-out test set into features and target, predict with the
# final model, and report the RMSE.
X_test = Strait_test_set.drop(columns="median_house_value")
y_test = Strait_test_set.loc[:, "median_house_value"].copy()

final_predictions = final_model.predict(X_test)

final_rmse = root_mean_squared_error(y_true=y_test, y_pred=final_predictions)
print(final_rmse)

In [None]:
from scipy import stats

# t-interval on the mean squared error at the chosen confidence level; the
# square root brings both bounds back to the RMSE scale.
confidence = 0.95
squared_errors = np.square(final_predictions - y_test)
np.sqrt(
    stats.t.interval(
        confidence,
        len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors),
    )
)