In [1]:
import os
import tarfile
import urllib
import joblib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and its
            extracted contents; created if it does not exist.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly here to make ``urllib.request`` always resolvable.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape ``housing_path`` when the interpreter supports filters.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [2]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Download and unpack the archive, load the CSV, then preview the first rows.
fetch_housing_data()
housing = load_housing_data()
housing.head()

In [4]:
len(housing)

20640

In [None]:
# print the info of the data

housing.info()

In [6]:
import numpy as np

def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a ``(train, test)`` pair of row subsets.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the training set.
    """
    order = np.random.permutation(len(data))
    n_test = int(len(data) * test_ratio)
    test_part, train_part = order[:n_test], order[n_test:]
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)
len(train_set)

In [None]:
len(test_set)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded random split with 20% of the rows held out; print both subset sizes.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)
print(len(train_set), len(test_set), sep="\n")

In [None]:
# Bucket median_income into five labelled categories (1-5) using the bin
# edges below; the last bin is open-ended (np.inf).
housing["income_cat"] = pd.cut(housing["median_income"],
                                bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                labels=[1, 2, 3, 4, 5])


# Histogram of how many rows fall into each income category.
housing["income_cat"].hist()

In [None]:
housing.info()

In [12]:
# Split into train and test sets while making sure the proportion of each
# income category is preserved in both.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so select with .iloc; n_splits=1
# means there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# income_cat was only needed for stratification. drop() returns new frames,
# which avoids mutating slices of `housing` in place.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

In [13]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
import matplotlib.pyplot as plt

# Longitude/latitude scatter: marker size scales with population and marker
# colour encodes median_house_value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

plt.legend()

In [None]:
# Correlate numeric columns only: the frame still contains ocean_proximity,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots (histograms on the diagonal) for a few attributes.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]

scatter_matrix(housing[attributes], figsize=(12, 8))

In [19]:
# Ratio features derived from the raw counts.
# The first column was misspelled "rooms_per_houshold"; the name is now
# consistent with "population_per_household" below.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [None]:
# Recompute the correlations, restricted to numeric columns so the
# ocean_proximity column does not make DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the predictors from the target column of the stratified training set.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# Sanity check: sizes and container types of both pieces, one per line.
print(len(housing), len(housing_labels), type(housing), type(housing_labels), sep="\n")

In [None]:
from sklearn.impute import SimpleImputer

# The imputer is fitted on every column except ocean_proximity.
housing_num = housing.drop("ocean_proximity", axis=1)
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [26]:
# Apply the fitted imputer, then wrap the result in a DataFrame that reuses
# the original column names and row index.
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(
    X,
    index=housing_num.index,
    columns=housing_num.columns,
)

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Columns are addressed by the module-level positions ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # When False, the bedrooms/rooms ratio column is not appended.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with rooms-per-household and population-per-household
        appended, plus bedrooms-per-room when ``add_bedrooms_per_room`` is set."""
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # Same as np.c_[X, col1, col2, ...]: the subscript is just a tuple.
        return np.c_[(X, *extra_columns)]

# Try the transformer on its own, with the bedrooms_per_room column switched off.
# NOTE(review): the hard-coded column positions (rooms_ix, ...) assume the
# column order of housing.values — confirm against the frame's columns.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing.head()

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing steps, applied in order:
# median imputation -> derived ratio columns -> standard scaling.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Fit the pipeline on the numeric columns and transform them in one call.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch: every column of housing_num goes to the
# numeric pipeline, ocean_proximity goes to the one-hot encoder.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit on the training predictors; later cells reuse this fitted transformer
# via full_pipeline.transform(...).
housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared.shape

In [None]:
import joblib

joblib.dump(full_pipeline, "full_pipeline.pkl")

In [None]:
print(num_attribs)
print(cat_attribs)

In [None]:
housing_prepared.shape

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# Spot-check the linear model on the first five training rows: transform them
# with the already-fitted pipeline and print predictions next to true labels.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on housing_prepared, i.e. on the training data
# itself rather than on held-out rows.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed seed (same value the data splits above use) so the fitted tree, and
# therefore the pickled model, is reproducible across runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# Persist the fitted tree to disk.
decision_tree_save = "decision_tree_model.pkl"
joblib.dump(tree_reg, decision_tree_save)

In [None]:
# RMSE of the decision tree on housing_prepared, i.e. on the training data
# itself rather than on held-out rows.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scoring is *negative* MSE, so the
# scores are negated before taking the square root to obtain RMSE values.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print(tree_rmse_scores)

In [None]:
def display_scores(scores):
    """Print ``scores`` followed by its mean and standard deviation."""
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model; negative-MSE scores are
# negated and square-rooted into RMSE before being summarised.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed (same value the data splits above use) so the forest and its
# cross-validation scores are reproducible across runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
# 10-fold cross-validation; negative-MSE scores are converted to RMSE.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search space: 2 * 3 * 4 * 6 * 3 = 432 candidates, each evaluated
# with 5-fold cross-validation below.
param_grid = [
    {
        'bootstrap': [False, True],
        'n_estimators': [30, 100, 200],
        'criterion': [
            'squared_error',
            'poisson',
            'friedman_mse',
            'absolute_error'
        ],
        'max_depth': [None, 2, 4, 6, 8, 10],
        'max_features': ['sqrt', 'log2', None]
    }
]

# Fixed seed so candidates differ only by their hyperparameters and the
# selected best estimator is reproducible across runs.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True
)

grid_search.fit(housing_prepared, housing_labels)

In [None]:
print(grid_search.best_estimator_)

In [None]:
cvres = grid_search.cv_results_

# One line per candidate: RMSE (from the negated-MSE mean test score) and its parameters.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Evaluate the best random forest found by the grid search on the stratified test set.
final_model = grid_search.best_estimator_

# Separate test predictors from the target column.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline is applied as already fitted, not refitted on test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# Persist the selected model to disk.
random_forest_save = "random_forest_model.pkl"
joblib.dump(final_model, random_forest_save)

In [None]:
# Search space for LinearRegression: 2 * 2 * 2 = 8 candidates.
# NOTE(review): copy_X looks like a memory-handling switch rather than a
# modelling choice — confirm it is worth searching over.
param_grid_linear = [
    {
        'fit_intercept': [True, False],
        'copy_X': [True, False],
        'positive': [True, False]
    }
]

linear_reg = LinearRegression()

# 5-fold cross-validated search scored by negative MSE.
grid_search_linear = GridSearchCV(
    linear_reg, 
    param_grid_linear, 
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True
)

grid_search_linear.fit(housing_prepared, housing_labels)

In [None]:
grid_search_linear.best_estimator_

In [None]:
cvres_linear = grid_search_linear.cv_results_

# One line per candidate: RMSE (from the negated-MSE mean test score) and its parameters.
for mean_score, params in zip(cvres_linear["mean_test_score"], cvres_linear["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Evaluate the best linear model found by the grid search on the stratified test set.
final_model_linear = grid_search_linear.best_estimator_

# Separate test predictors from the target column.
X_test_linear = strat_test_set.drop("median_house_value", axis=1)
y_test_linear = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline is applied as already fitted, not refitted on test data.
X_test_prepared_linear = full_pipeline.transform(X_test_linear)
final_predictions_linear = final_model_linear.predict(X_test_prepared_linear)

final_mse_linear = mean_squared_error(y_test_linear, final_predictions_linear)
final_rmse_linear = np.sqrt(final_mse_linear)
print(final_rmse_linear)

# Persist the selected model to disk.
linear_regression_save = "linear_regression_model.pkl"
joblib.dump(final_model_linear, linear_regression_save)

In [None]:
# Search space for the decision tree: 4 * 2 * 6 * 3 = 144 candidates.
param_grid_tree = [
    {
        'criterion': [
            'squared_error',
            'poisson',
            'friedman_mse',
            'absolute_error'
        ],
        'splitter': [
            'best',
            'random'
        ],
        'max_depth': [None, 2, 4, 6, 8, 10],
        # None (use every feature) replaces the former 'auto' option, which
        # current scikit-learn releases reject for DecisionTreeRegressor.
        # This matches the forest grid, which already uses None.
        'max_features': [
            None,
            'sqrt',
            'log2'
        ]
    }
]

# Fixed seed: the 'random' splitter and feature subsampling are stochastic, so
# without it the selected best estimator would change from run to run.
tree_reg = DecisionTreeRegressor(random_state=42)

grid_search_tree = GridSearchCV(
    tree_reg,
    param_grid_tree,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True
)

grid_search_tree.fit(housing_prepared, housing_labels)

In [None]:
print(grid_search_tree.best_estimator_)

In [None]:
cvres_tree = grid_search_tree.cv_results_

# One line per candidate: RMSE (from the negated-MSE mean test score) and its parameters.
for mean_score, params in zip(cvres_tree["mean_test_score"], cvres_tree["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Evaluate the best decision tree found by the grid search on the stratified test set.
final_model_tree = grid_search_tree.best_estimator_

# Separate test predictors from the target column.
X_test_tree = strat_test_set.drop("median_house_value", axis=1)
y_test_tree = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline is applied as already fitted, not refitted on test data.
X_test_prepared_tree = full_pipeline.transform(X_test_tree)
final_predictions_tree = final_model_tree.predict(X_test_prepared_tree)

# Score against this cell's own labels. The previous code read y_test_linear,
# which made the cell depend on the linear-regression cell having been run.
final_mse_tree = mean_squared_error(y_test_tree, final_predictions_tree)
final_rmse_tree = np.sqrt(final_mse_tree)
print(final_rmse_tree)

# Persist the selected model to disk.
tree_regression_save = "tree_regression_model.pkl"
joblib.dump(final_model_tree, tree_regression_save)

In [None]:
from sklearn.svm import SVR

# Baseline SVR with default hyperparameters.
svr_model = SVR()
# NOTE(review): this fitted model is not read again in the visible cells, and
# cross_val_score presumably refits per fold — confirm the fit is needed.
svr_model.fit(housing_prepared, housing_labels)
# 10-fold cross-validation; negative-MSE scores are converted to RMSE.
svr_scores = cross_val_score(svr_model, housing_prepared, housing_labels, scoring = 'neg_mean_squared_error', cv = 10)
svr_rmse_scores = np.sqrt(-svr_scores)
display_scores(svr_rmse_scores)

In [None]:
# One sub-grid per kernel family so only parameters the kernel actually uses
# are varied (2 + 36 + 8 = 46 candidates instead of 180 largely duplicated ones).
# The 'precomputed' kernel is left out: it expects a square kernel matrix,
# whereas housing_prepared is a samples-by-features matrix, so every such fit
# would fail.
param_grid_svr = [
    {
        # linear kernel: neither degree nor gamma is used
        'kernel': ['linear'],
        'shrinking': [True, False]
    },
    {
        # polynomial kernel: the only one that uses degree
        'kernel': ['poly'],
        'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'gamma': [
            'scale',
            'auto'
        ],
        'shrinking': [True, False]
    },
    {
        # rbf and sigmoid kernels: gamma matters, degree does not
        'kernel': [
            'rbf',
            'sigmoid'
        ],
        'gamma': [
            'scale',
            'auto'
        ],
        'shrinking': [True, False]
    }
]

svr_reg = SVR()

# 5-fold cross-validated search scored by negative MSE.
grid_search_svr = GridSearchCV(
    svr_reg,
    param_grid_svr,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True
)

grid_search_svr.fit(housing_prepared, housing_labels)

In [None]:
grid_search_svr.best_estimator_

In [None]:
cvres_svr = grid_search_svr.cv_results_

# One line per candidate: RMSE (from the negated-MSE mean test score) and its parameters.
for mean_score, params in zip(cvres_svr["mean_test_score"], cvres_svr["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Evaluate the best SVR found by the grid search on the stratified test set.
final_model_svr = grid_search_svr.best_estimator_

# Separate test predictors from the target column.
X_test_svr = strat_test_set.drop("median_house_value", axis=1)
y_test_svr = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline is applied as already fitted, not refitted on test data.
X_test_prepared_svr = full_pipeline.transform(X_test_svr)
final_predictions_svr = final_model_svr.predict(X_test_prepared_svr)

final_mse_svr = mean_squared_error(y_test_svr, final_predictions_svr)
final_rmse_svr = np.sqrt(final_mse_svr)
print(final_rmse_svr)

# Persist the selected model to disk.
svr_regression_save = "svr_regression_model.pkl"
joblib.dump(final_model_svr, svr_regression_save)