<a href="https://colab.research.google.com/github/GautierBlandin/real-estate-ml/blob/main/Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init



In [1]:
import os
import tarfile
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the tar archive to download.
        housing_path: Directory that receives both the downloaded
            ``housing.tgz`` and its extracted contents; created if missing.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # which the previous open()/close() pair did not guarantee.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    housing_frame = pd.read_csv(housing_csv)
    return housing_frame



# Training / Test set initialization

In [2]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Download the archive and load the raw dataset.
fetch_housing_data()
housing = load_housing_data()

# Purely random split; train_set/test_set are not referenced again in the
# visible part of this notebook.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Bucket median income into five categories to stratify the split on.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* indices, so rows are selected with .iloc;
# .loc only worked because the freshly loaded frame has a default RangeIndex.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# drop() returns new frames, so the helper column is removed without
# mutating slices of `housing` in place (which risks SettingWithCopyWarning).
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

# Separate the predictors from the target for the training set.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Dataset Visualization

In [None]:
# Display summary statistics for the training features held in `housing`.
housing.describe()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of the columns of `housing`, 50 bins each, on one large figure.
housing.hist(bins = 50, figsize = (20, 15))
plt.show()

In [None]:
# Geographic scatter plot: marker size follows population, colour follows the
# median house value. `housing` has had "median_house_value" dropped when the
# labels were split off, so the plot reads from strat_train_set, which still
# carries that column.
strat_train_set.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                     figsize=(15, 10), s=strat_train_set["population"] / 50,
                     label="population", c="median_house_value",
                     cmap=plt.get_cmap("jet"), colorbar=True)

In [None]:
# Correlate on strat_train_set rather than `housing`: it still contains the
# target column "median_house_value". Only numeric columns are kept because
# the string column "ocean_proximity" cannot take part in a correlation.
corr_matrix = strat_train_set.select_dtypes(include="number").corr()

In [None]:
# Rank the columns by their correlation with the target, strongest positive
# first. Requires corr_matrix to contain a "median_house_value" column.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target against a few chosen attributes.
# Read from strat_train_set: `housing` no longer has "median_house_value"
# after the labels were split off, so indexing it with `attributes` fails.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(strat_train_set[attributes], figsize=(15, 10))

# Training Set Preparation

In [3]:
# Split the features into the categorical column and everything else, then
# record the column names of each group for the column transformer.
housing_cat = housing.loc[:, ["ocean_proximity"]]
housing_num = housing.drop(columns=["ocean_proximity"])

cat_attribs = housing_cat.columns.tolist()
num_attribs = housing_num.columns.tolist()

In [4]:
# Custom sklearn transformer for creation of new relevant features
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions, within the 2-D array passed to transform(), of the raw
# counts that the derived ratio features are built from.
total_rooms_ix = 3
total_bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features to a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended as well when ``add_bedrooms_per_room``
    is true.
    """

    def __init__(self, add_bedrooms_per_room: bool = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        households = X[:, households_ix]
        total_rooms = X[:, total_rooms_ix]
        extra_columns = [
            total_rooms / households,           # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, total_bedrooms_ix] / total_rooms)
        # Append the new features as extra columns on the right of X.
        return np.c_[(X, *extra_columns)]

In [5]:
# Numerical pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in this order:
#   1. imputer       - fill missing values with the median of the column
#   2. attribs_adder - append the combined ratio features
#   3. std_scaler    - standardize every column: y = (X - E(X)) / sigma
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Send the numeric columns through num_pipeline and one-hot encode the
# categorical column(s); the outputs are concatenated column-wise.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit the preprocessing on the training features and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

# Training and evaluation on the training set

In [9]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training features and labels.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the prepared training data. The seed is fixed (same
# value as the train/test splits) so repeated runs give the same tree.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the prepared training data. The seed is fixed (same
# value as the train/test splits) so repeated runs give the same forest and
# therefore comparable error figures.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor()

In [21]:
from sklearn.metrics import mean_squared_error

# RMSE of each fitted model, measured on the same data it was trained on.

# Linear regression
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels,
                             y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

# Decision tree
housing_tree_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels,
                              y_pred=housing_tree_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

# Random forest
forest_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(y_true=housing_labels,
                                y_pred=forest_predictions)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)

68627.87390018745
0.0
18705.705646578674


In [22]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest. The scorer returns the
# negated MSE, hence the sign flip before taking the square root.
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
# Backward-compatible alias: these are forest scores, but existing code
# refers to them under the older, misleading name.
tree_rmse_scores = forest_rmse_scores

print(type(scores))

def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    print(f"Scores: {scores}")
    for label, statistic in (("Mean", "mean"), ("Standard Deviation", "std")):
        value = getattr(scores, statistic)()
        print(f"{label}: {value}")

# NOTE: despite its name, tree_rmse_scores holds the RMSE values obtained from
# cross-validating forest_reg earlier in this cell.
display_scores(tree_rmse_scores)

<class 'numpy.ndarray'>
Scores: [51518.71879196 49130.68577044 46973.53789877 51998.15434688
 47730.03725242 51815.291573   52248.39966007 49733.88901207
 48771.37473358 53835.68912047]
Mean: 50375.57781596664
Standard Deviation: 2111.8684196295767
