In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rc('font', size = 14)
plt.rc('axes', labelsize = 14, titlesize = 10)
plt.rc('legend', fontsize = 14)
plt.rc('xtick', labelsize = 10)
plt.rc('ytick', labelsize = 10)

In [2]:
data = pd.read_csv("/Users/shinigami/Documents/Machine learning/project/Housing/datasets/housing/housing.csv", index_col = False)

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [5]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
from sklearn.model_selection import train_test_split

data["income_cat"] = pd.cut(data["median_income"], bins = [0.,1.5,3,4.5,6,np.inf],
                           labels = [1,2,3,4,5])

train_data, test_data = train_test_split(data, stratify = data["income_cat"], test_size = 0.2)

In [8]:
train_data.shape, test_data.shape

((16512, 11), (4128, 11))

### Creating all the pipelines for cleaning and processing the data

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler())
])

cat_pipeline = make_pipeline(SimpleImputer(strategy = "most_frequent"),
                            OneHotEncoder(sparse_output = False, handle_unknown = "ignore"))

log_pipeline = make_pipeline(SimpleImputer(strategy = "median"),
                            FunctionTransformer(np.log, feature_names_out = "one-to-one"),
                            StandardScaler())
def ratio(X):
    return (X[:,0]/X[:,1]).reshape(-1,1)

def ratio_name_out(self, input_features = None):
    return ["ratio"]

ratio_pipeline = make_pipeline(SimpleImputer(strategy = "median"),
                              FunctionTransformer(ratio, feature_names_out = ratio_name_out),
                              StandardScaler())

from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_cluster = 10, gamma = 1.0, random_state = None):
        self.n_cluster = n_cluster
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y = None, sample_weight = None):
        self.kmeans_ = KMeans(self.n_cluster, n_init = 10,
                   random_state = self.random_state)
        self.kmeans_.fit(X, sample_weight = sample_weight)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma = self.gamma)

    def get_feature_names_out(self, names = None):
        return [f"Cluster {i} Similarity " for i in range(self.n_cluster)]

cluster_simil = ClusterSimilarity(n_cluster = 25, gamma = 1., random_state = 42)

In [18]:
X_train = train_data.drop(columns = ["median_house_value", "income_cat"], axis = 1)
y_train = train_data["median_house_value"]
X_test = test_data.drop(columns = ["median_house_value", "income_cat"], axis = 1)
y_test = test_data["median_house_value"]

In [19]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17026,-118.07,33.8,34.0,3486.0,507.0,1311.0,503.0,7.1221,<1H OCEAN
6164,-122.17,37.45,35.0,1025.0,242.0,388.0,232.0,5.1995,NEAR BAY
16399,-118.38,33.85,28.0,4430.0,928.0,2131.0,885.0,4.9384,<1H OCEAN
257,-119.85,36.8,14.0,1876.0,324.0,1031.0,311.0,3.6563,INLAND
14543,-117.78,33.82,12.0,6208.0,750.0,2443.0,739.0,9.1808,<1H OCEAN


In [20]:
from sklearn.compose import ColumnTransformer

pipeline = ColumnTransformer([
    ("bedroom", ratio_pipeline, ["total_rooms", "total_bedrooms"]),
    ("room_per_house", ratio_pipeline, ["total_rooms", "households"]),
    ("person_per_hosue", ratio_pipeline, ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
    ("cat", cat_pipeline, ["ocean_proximity"]),
    ("num", num_pipeline, ["median_income"]),
    ("geo", cluster_simil, ["longitude", "latitude"])
])



In [21]:
X_train_ready = pipeline.fit_transform(X_train)

In [22]:
X_train_ready.shape

(16512, 39)

In [23]:
pipeline.get_feature_names_out()

array(['bedroom__ratio', 'room_per_house__ratio',
       'person_per_hosue__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'num__median_income', 'geo__Cluster 0 Similarity ',
       'geo__Cluster 1 Similarity ', 'geo__Cluster 2 Similarity ',
       'geo__Cluster 3 Similarity ', 'geo__Cluster 4 Similarity ',
       'geo__Cluster 5 Similarity ', 'geo__Cluster 6 Similarity ',
       'geo__Cluster 7 Similarity ', 'geo__Cluster 8 Similarity ',
       'geo__Cluster 9 Similarity ', 'geo__Cluster 10 Similarity ',
       'geo__Cluster 11 Similarity ', 'geo__Cluster 12 Similarity ',
       'geo__Cluster 13 Similarity ', 'geo__Cluster 14 Similarity ',
       'geo__Cluster 15 Similarity ', 'geo__Cluster 16 Similarity ',
       'geo__Cluste

In [24]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(pipeline, LinearRegression())
lin_reg.fit(X_train, y_train)

In [25]:
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

predictions = lin_reg.predict(X_test)
rmse = root_mean_squared_error(y_test, predictions)
lin_rmse = -cross_val_score(lin_reg, X_train, y_train,scoring = "neg_root_mean_squared_error", cv = 10 )
rmse

62069.09309996156

In [26]:
pd.Series(lin_rmse).describe()

count       10.000000
mean     63327.940706
std       4758.116155
min      59293.298415
25%      60186.222687
50%      61346.713778
75%      65706.115857
max      74425.114626
dtype: float64

In [27]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(pipeline, DecisionTreeRegressor(random_state = 42))
tree_rmse = -cross_val_score(tree_reg, X_train, y_train, scoring = "neg_root_mean_squared_error", cv = 10)

In [28]:
pd.Series(tree_rmse).describe()

count       10.000000
mean     64080.107738
std       3191.071170
min      61041.359765
25%      62018.297195
50%      62435.277950
75%      66066.441519
max      69859.321461
dtype: float64

In [29]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = make_pipeline(pipeline, RandomForestRegressor(random_state = 42, max_features = 6))
rf_rmse = -cross_val_score(rf_reg, X_train, y_train, scoring ="neg_root_mean_squared_error", cv = 10)
pd.Series(rf_rmse).describe()

count       10.000000
mean     43180.707211
std       3006.146131
min      39946.720525
25%      41171.678053
50%      41799.073559
75%      44451.074539
max      48771.533374
dtype: float64

In [31]:
from sklearn.model_selection import GridSearchCV

full_pipeline = Pipeline([
    ("preprocessing", pipeline),
    ("random_forest", RandomForestRegressor(random_state = 42))
])

param_grid = [{"preprocessing__geo__n_cluster": [5, 10, 15, 20, 25],
              "random_forest__max_features": [4,6,8, 10]},]

grid_search = GridSearchCV(full_pipeline, param_grid, cv = 3, scoring = 'neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

In [33]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by = "mean_test_score", ascending = False, inplace = True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_cluster,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
16,2.530412,0.005258,0.087646,0.000497,25,4,"{'preprocessing__geo__n_cluster': 25, 'random_...",-42163.90963,-46027.827514,-43551.966727,-43914.567957,1598.139563,1
17,3.515124,0.029931,0.086728,0.000754,25,6,"{'preprocessing__geo__n_cluster': 25, 'random_...",-42335.787379,-46072.434823,-43898.719915,-44102.314039,1532.25791,2
18,4.51934,0.028962,0.091171,0.003388,25,8,"{'preprocessing__geo__n_cluster': 25, 'random_...",-42531.398483,-46473.59982,-44222.871977,-44409.290093,1614.786184,3
12,2.515189,0.062104,0.085926,0.000943,20,4,"{'preprocessing__geo__n_cluster': 20, 'random_...",-42689.009198,-46477.622466,-44400.568162,-44522.399942,1549.092177,4
19,5.776856,0.114329,0.088591,0.000749,25,10,"{'preprocessing__geo__n_cluster': 25, 'random_...",-42877.209249,-46836.109267,-44648.691395,-44787.336637,1619.184819,5


In [35]:
grid_search.best_params_

{'preprocessing__geo__n_cluster': 25, 'random_forest__max_features': 4}

In [42]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params_rnd = [{"preprocessing__geo__n_cluster": randint(low = 3, high = 50),
"random_forest__max_features": randint(low = 2, high = 15)}]

random_search = RandomizedSearchCV(full_pipeline, params_rnd, n_iter = 10, cv = 3, scoring = "neg_root_mean_squared_error")

random_search.fit(X_train, y_train)

In [43]:
random_search.best_params_

{'preprocessing__geo__n_cluster': 41, 'random_forest__max_features': 8}

In [44]:
random_search_cv = pd.DataFrame(random_search.cv_results_)
random_search_cv.sort_values(by = 'mean_test_score', ascending = False, inplace = True)
random_search_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_cluster,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,4.549289,0.023186,0.087348,0.000414,41,8,"{'preprocessing__geo__n_cluster': 41, 'random_...",-41181.702824,-45362.835819,-43163.485163,-43236.007936,1707.710542,1
5,6.177543,0.057257,0.089847,0.001358,45,11,"{'preprocessing__geo__n_cluster': 45, 'random_...",-41555.221182,-45631.318957,-43266.790188,-43484.443442,1671.161866,2
1,3.805825,0.231734,0.087862,0.001202,26,6,"{'preprocessing__geo__n_cluster': 26, 'random_...",-41973.338424,-46031.28194,-43743.022406,-43915.880923,1661.151497,3
2,3.467874,0.009464,0.0872,0.001567,26,6,"{'preprocessing__geo__n_cluster': 26, 'random_...",-41973.338424,-46031.28194,-43743.022406,-43915.880923,1661.151497,3
4,7.672004,0.084977,0.099781,0.01253,46,14,"{'preprocessing__geo__n_cluster': 46, 'random_...",-41959.422976,-45941.225727,-43943.10312,-43947.917274,1625.567731,5


In [49]:
final_model = random_search.best_estimator_
final_model_cv = -cross_val_score(final_model, X_train, y_train, cv = 5, scoring = "neg_root_mean_squared_error")
pd.Series(final_model_cv).describe()

count        5.000000
mean     42578.614499
std       1717.698773
min      40592.880948
25%      40911.012685
50%      43296.072557
75%      43690.541272
max      44402.565032
dtype: float64

In [50]:
predictions = final_model.predict(X_test)
final_rmse = root_mean_squared_error(predictions, y_test)
final_rmse

41771.8996450274

In [51]:
from scipy import stats

confidence = 0.95
squared_errors = (predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

array([39507.27192835, 43919.91193853])