Universidad Autónoma de Chihuahua

Facultad de Ingeniería

---
## "*End-to-end project*"
---


>Data Science

>Jesús Roberto López Santillán

**338900 - Marley Zaragoza Balderrama**

---

01/03/2023

In [None]:
# librerías
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import joblib
from scipy import stats
from sklearn.svm import SVR

: 

In [None]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
  """Download the housing archive and unpack it into ``housing_path``.

  Args:
    housing_url: URL of the ``.tgz`` archive to download.
    housing_path: Directory that receives ``housing.tgz`` and the extracted
      files; it is created when it does not exist yet.
  """
  # A plain ``import urllib`` does not guarantee that the ``request``
  # submodule is loaded, so import it explicitly where it is used.
  import urllib.request

  os.makedirs(housing_path, exist_ok = True)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  # The context manager closes the archive even when extraction fails.
  # NOTE(review): extractall() trusts the member paths stored in the
  # archive; only point this function at a source you trust.
  with tarfile.open(tgz_path) as housing_tgz:
    housing_tgz.extractall(path = housing_path)

: 

In [None]:
def load_housing_data(housing_path = HOUSING_PATH):
  """Read ``housing.csv`` located in ``housing_path``.

  Args:
    housing_path: Directory that contains ``housing.csv``.

  Returns:
    The result of ``pd.read_csv`` on that file.
  """
  return pd.read_csv(os.path.join(housing_path, "housing.csv"))

: 

In [None]:
# Download and unpack the archive, load the CSV and preview its first rows.
fetch_housing_data()
housing = load_housing_data()
housing.head()

: 

In [None]:
# Automatic split from scikit-learn: 20% of the rows go to the test set
# (test_size = 0.2), with a fixed seed (random_state = 42).
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
print(len(train_set))
len(test_set)

: 

In [None]:
# Histogram of the median_income column.
housing["median_income"].hist()

: 

In [None]:
# Bucket median_income into five labelled categories (1 to 5) using the bin
# edges below, then plot how many rows fall into each category.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = [0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels = [1, 2, 3, 4, 5])
housing["income_cat"].hist()

: 

In [None]:
# Stratified split on income_cat: 20% test, fixed seed.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
# With n_splits = 1 there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional row numbers, so select with .iloc; .loc is
# label-based and only matches while the index is the default 0..n-1 range.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

: 

In [None]:
# Remove the helper income_cat column from both stratified sets, in place.
for set_ in (strat_train_set, strat_test_set):
  set_.drop("income_cat", axis = 1, inplace = True)

: 

In [None]:
housing = strat_train_set.copy()  # training set
housing.plot(kind="scatter", x= "longitude", y="latitude") # "scatter" is a point plot

: 

In [None]:
housing.plot(kind="scatter", x= "longitude", y="latitude", alpha=0.1) # population density
# a low alpha gives more colour where points are denser

: 

In [None]:
# Marker size follows population / 100 and marker colour follows
# median_house_value, mapped through the "jet" colormap.
housing.plot(kind="scatter", x= "longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population", figsize=(10,7),
             c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()

: 

In [None]:
# Search for correlations; values close to 1 or -1 are the interesting ones.
# Only numeric columns can be correlated: ocean_proximity holds text and
# makes DataFrame.corr() raise on pandas >= 2.0, so keep numeric dtypes only.
corr_matrix = housing.select_dtypes(include = [np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

: 

In [None]:
# Pairwise scatter plots for the columns listed below.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

: 

In [None]:
# Derived ratio features built from existing columns.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

# Recompute the correlations including the new columns. Only numeric columns
# are used: ocean_proximity holds text and makes DataFrame.corr() raise on
# pandas >= 2.0.
corr_matrix = housing.select_dtypes(include = [np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

: 

In [None]:
# Separate the predictors from the target column of the training set.
housing = strat_train_set.drop("median_house_value", axis = 1)  # drops the named column; axis=1 means column, axis=0 means row
housing_labels = strat_train_set["median_house_value"].copy()

: 

In [None]:
# Sanity check: show the types and lengths of the predictors and the labels.
print(type(housing))
print(type(housing_labels))
print(len(housing))
print(len(housing_labels))

: 

In [None]:
# Median imputer fitted on the predictors without the ocean_proximity column.
imputer = SimpleImputer(strategy = "median")
housing_num = housing.drop("ocean_proximity", axis = 1)
imputer.fit(housing_num) # fit means training

: 

In [None]:
# Values stored by the imputer after fitting.
imputer.statistics_

: 

In [None]:
# Column medians computed directly with pandas, for comparison.
housing_num.median().values

: 

In [None]:
# Apply the fitted imputer and wrap the result back into a DataFrame that
# reuses the column names and index of housing_num.
x = imputer.transform(housing_num)
housing_tr = pd.DataFrame(x, columns = housing_num.columns,
                          index = housing_num.index)

: 

In [None]:
# Keep ocean_proximity as a one-column DataFrame and preview ten rows.
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

: 

In [None]:
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat) # finds the distinct categories
housing_cat_encoded[:10]

: 

In [None]:
# Categories found by the ordinal encoder.
ordinal_encoder.categories_

: 

In [None]:
cat_encoder = OneHotEncoder() # yields a matrix of 1s and 0s instead of the codes 0 to 4
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

: 

In [None]:
# Show the one-hot result as a regular array.
housing_cat_1hot.toarray()

: 

In [None]:
# Column positions, inside the array handed to transform(), of the inputs
# used to build the ratio features.
room_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  """Append ratio columns computed from fixed column positions of ``X``.

  Two columns are always appended, in this order: rooms per household and
  population per household. When ``add_bedrooms_per_room`` is true a third
  column, bedrooms per room, is appended after them.
  """

  def __init__(self, add_bedrooms_per_room = True):
    self.add_bedrooms_per_room = add_bedrooms_per_room

  def fit(self, X, y = None):
    """Learn nothing and return the transformer itself."""
    return self

  def transform(self, X):
    """Return ``X`` with the ratio columns stacked on its right side."""
    households = X[:, households_ix]
    rooms = X[:, room_ix]
    columns = [X, rooms / households, X[:, population_ix] / households]
    if self.add_bedrooms_per_room:
      columns.append(X[:, bedrooms_ix] / rooms)
    # Indexing np.c_ with a tuple is the same as writing the items inline.
    return np.c_[tuple(columns)]

# Try the transformer on the raw values, without the bedrooms_per_room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

: 

In [None]:
# Numeric preprocessing: impute, add the ratio columns, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")), # fills every missing value with the median
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)  # preprocessed feature-vector set

: 

In [None]:
# num_attribs: the column names of housing_num (housing without
# ocean_proximity); cat_attribs: the single categorical column.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Route the numeric columns through num_pipeline and one-hot encode the
# categorical column.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

print(type(housing_prepared))
print(len(housing_prepared))

: 

# Linear Regression Model

In [None]:
# Fit a linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

: 

In [None]:
# Compare predictions with the true labels on the first five training rows.
some_data = housing.iloc[:5] # features
some_labels = housing_labels.iloc[:5] # labels
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

: 

In [None]:
def display_scores(scores):
  """Print ``scores`` followed by its mean and its standard deviation.

  Args:
    scores: Object exposing ``mean()`` and ``std()`` methods.
  """
  # Each entry is evaluated right before it is printed, one line per entry.
  summary = (
      ("Scores: ", lambda values: values),
      ("Mean: ", lambda values: values.mean()),
      ("Standard deviation: ", lambda values: values.std()),
  )
  for label, compute in summary:
    print(label, compute(scores))

: 

In [None]:
# 10-fold cross-validation; the scores are negated MSE values, so flip the
# sign before taking the square root to obtain RMSE values.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring = "neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

: 

In [None]:
# Candidate settings for LinearRegression; each dict is its own small grid.
param_grid = [
    {'fit_intercept': [False], 'copy_X': [False]},
    {'fit_intercept': [True], 'n_jobs': [2]},
    {'fit_intercept': [True], 'copy_X': [False]},
]
lin_reg = LinearRegression()

# Only three candidates exist, so evaluate all of them with GridSearchCV, as
# done for the other models. RandomizedSearchCV's default n_iter = 10 exceeds
# this grid and only adds a warning before trying the same three candidates.
grid_search_lin = GridSearchCV(lin_reg, param_grid, cv=5,
                           scoring= 'neg_mean_squared_error',
                           return_train_score=True, verbose=2)
# copy_X = False allows LinearRegression to overwrite the matrix it is fitted
# on, so fit on a copy to keep housing_prepared intact for the later models.
grid_search_lin.fit(housing_prepared.copy(), housing_labels)

: 

In [None]:
# Print the cross-validated RMSE of every candidate and keep the lowest one.
# (The bare ``best_estimator_`` expression was dropped: it was not the last
# expression of the cell, so it displayed nothing.)
cvres = grid_search_lin.cv_results_
lin_min = float("inf")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
  # Scores are negated MSE values; flip the sign before the square root.
  rmse = np.sqrt(-mean_score)
  lin_min = min(lin_min, rmse)
  print(rmse, params)

: 

In [None]:
# Evaluate the best linear model on the held-out stratified test set.
final_model = grid_search_lin.best_estimator_
print(final_model)
print(lin_min)

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
print(final_predictions)

# NOTE(review): final_rmse is computed but not displayed in this cell.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

: 

In [None]:
# 95% t-interval around the mean squared test error; the square root turns
# both bounds into RMSE values.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc = squared_errors.mean(),
                         scale = stats.sem(squared_errors)))

: 

# Random Forest Regressor Model

In [None]:
# Fit a random forest with default settings and compute its RMSE on the same
# data it was fitted on (a training error, not a validation error).
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

: 

In [None]:
# 10-fold cross-validation; negated MSE scores are converted to RMSE values.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                             scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

: 

In [None]:
# Three separate grids of random-forest settings; every combination inside
# each dict is evaluated with 5-fold cross-validation.
param_grid = [
    {'n_estimators': [5, 10, 50], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    {'n_estimators': [50, 100], 'n_jobs': [1, 2]}
]
forest_reg = RandomForestRegressor()

grid_search_for = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring= 'neg_mean_squared_error',
                           return_train_score=True, verbose=2)
grid_search_for.fit(housing_prepared, housing_labels)

: 

In [None]:
# Keep the best forest, then print the cross-validated RMSE of every
# candidate while tracking the lowest value.
for_best_estimator = grid_search_for.best_estimator_
cvres = grid_search_for.cv_results_
forest_min = float("inf")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
  # Scores are negated MSE values; flip the sign before the square root.
  rmse = np.sqrt(-mean_score)
  forest_min = min(forest_min, rmse)
  print(rmse, params)

: 

In [None]:
# Evaluate the best random forest on the held-out stratified test set.
final_model = grid_search_for.best_estimator_
print(final_model)
print(forest_min)

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# NOTE(review): final_rmse is computed but not displayed in this cell.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

: 

In [None]:
# 95% t-interval around the mean squared test error; the square root turns
# both bounds into RMSE values.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc = squared_errors.mean(),
                         scale = stats.sem(squared_errors)))

: 

# Decision Tree Regressor Model

In [None]:
# Fit a decision tree with default settings and compute its RMSE on the same
# data it was fitted on (a training error, not a validation error).
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

: 

In [None]:
# 10-fold cross-validation; negated MSE scores are converted to RMSE values.
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring = "neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

: 

In [None]:
# Three separate grids of decision-tree settings; every combination inside
# each dict is evaluated with 5-fold cross-validation.
param_grid = [
    {'criterion': ['squared_error', 'absolute_error', 'poisson'], 'splitter': ['random']},
    {'criterion': ['absolute_error', 'poisson'], 'max_features': [3, 10], 'splitter': ['best']},
    {'random_state': [0, 10]}
]
tree_reg = DecisionTreeRegressor()

grid_search_tree = GridSearchCV(tree_reg, param_grid, cv=5,
                           scoring= 'neg_mean_squared_error',
                           return_train_score=True, verbose=2)
grid_search_tree.fit(housing_prepared, housing_labels)

: 

In [None]:
# Print the cross-validated RMSE of every candidate and keep the lowest one.
# (The bare ``best_estimator_`` expression was dropped: it was not the last
# expression of the cell, so it displayed nothing.)
cvres = grid_search_tree.cv_results_
tree_min = float("inf")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
  # Scores are negated MSE values; flip the sign before the square root.
  rmse = np.sqrt(-mean_score)
  tree_min = min(tree_min, rmse)
  print(rmse, params)

: 

In [None]:
# Evaluate the best decision tree on the held-out stratified test set.
final_model_t = grid_search_tree.best_estimator_
print(final_model_t)
print(tree_min)

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model_t.predict(X_test_prepared)

# NOTE(review): final_rmse is computed but not displayed in this cell.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

: 

In [None]:
# 95% t-interval around the mean squared test error; the square root turns
# both bounds into RMSE values.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc = squared_errors.mean(),
                         scale = stats.sem(squared_errors)))

: 

# Support Vector Regressor Model

In [None]:
# Fit an SVR with default settings and compute its RMSE on the same data it
# was fitted on (a training error, not a validation error).
vector_reg = SVR()
vector_reg.fit(housing_prepared, housing_labels)
housing_predictions = vector_reg.predict(housing_prepared)
vector_mse = mean_squared_error(housing_labels, housing_predictions)
vector_rmse = np.sqrt(vector_mse)
vector_rmse

: 

In [None]:
# 10-fold cross-validation; negated MSE scores are converted to RMSE values.
vector_scores = cross_val_score(vector_reg, housing_prepared, housing_labels,
                             scoring = "neg_mean_squared_error", cv = 10)
vector_rmse_scores = np.sqrt(-vector_scores)
display_scores(vector_rmse_scores)

: 

In [None]:
# Three separate grids of SVR settings, grouped by kernel; every combination
# inside each dict is evaluated with 5-fold cross-validation.
param_grid = [
    {'kernel': ['rbf'], 'tol': [0.1, 0.5, 0.9]},
    {'kernel': ['poly'], 'degree': [3, 5], 'C': [0.5, 1.0, 2.5, 3.0]},
    {'kernel': ['sigmoid', 'linear'], 'epsilon': [0.1, 0.2, 0.5, 1.0]}
]
vector_reg = SVR()

grid_search_svr = GridSearchCV(vector_reg, param_grid, cv=5,
                           scoring= 'neg_mean_squared_error',
                           return_train_score=True, verbose=2)
grid_search_svr.fit(housing_prepared, housing_labels)

: 

In [None]:
# Print the cross-validated RMSE of every candidate and keep the lowest one.
# (The bare ``best_estimator_`` expression was dropped: it was not the last
# expression of the cell, so it displayed nothing.)
cvres = grid_search_svr.cv_results_
svr_min = float("inf")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
  # Scores are negated MSE values; flip the sign before the square root.
  rmse = np.sqrt(-mean_score)
  svr_min = min(svr_min, rmse)
  print(rmse, params)

: 

In [None]:
# Evaluate the best SVR on the held-out stratified test set.
# NOTE(review): accuracy_score is imported here but not used in this cell.
from sklearn.metrics import accuracy_score
final_model = grid_search_svr.best_estimator_
print(final_model)
print(svr_min)

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only: the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# NOTE(review): final_rmse is computed but not displayed in this cell.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

: 

In [None]:
# 95% t-interval around the mean squared test error; the square root turns
# both bounds into RMSE values.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc = squared_errors.mean(),
                         scale = stats.sem(squared_errors)))

: 

# Results

In [None]:
# Lowest cross-validated RMSE found for each model family.
print("Tree decision: ", tree_min, sep="")
print('Forest: ', forest_min, sep="")
print("Linear: ", lin_min, sep="")
print("SVR: ", svr_min, sep="")

: 

In [None]:
# Persist the best random forest found by the grid search to disk.
import joblib
joblib.dump(for_best_estimator, "m_model.pkl")

: 

In [None]:
# Load the saved model back from disk.
my_model_loaded = joblib.load("m_model.pkl")

: 