In [None]:
# Obtenemos las variables de entorno

import os
from dotenv import load_dotenv

# Locate the directory that holds the project's `.env` file. When the notebook
# runs from the "Price" folder, that directory is its parent; otherwise the
# current working directory is used unchanged.
# Only a trailing "Price" path component is stripped: a blanket
# str.replace("/Price", "") would also mangle unrelated parts of the path
# (e.g. "/home/PriceTeam/...") and never matches Windows-style separators.
PATH = os.getcwd()
if os.path.basename(PATH) == "Price":
    PATH = os.path.dirname(PATH)

# Load the environment variables defined in that `.env` file.
load_dotenv(os.path.join(PATH, ".env"))

In [None]:
import urllib.parse
import certifi
from pymongo.mongo_client import MongoClient
import json
import pandas as pd
import os

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Fail early with a readable message when the connection settings are absent.
# Without this check an unset username/password raises an opaque TypeError
# inside quote_plus, and an unset host/port/database is silently formatted
# into the URI as the literal text "None".
required_vars = ("MONGO_USERNAME", "MONGO_PASSWORD", "MONGO_HOST",
                 "MONGO_PORT", "MONGO_DATABASE")
missing_vars = [name for name in required_vars if os.getenv(name) is None]
if missing_vars:
    raise RuntimeError(
        "Missing MongoDB environment variables: " + ", ".join(missing_vars)
    )

# Percent-encode the credentials so reserved characters (":", "@", "/", ...)
# cannot break the structure of the connection URI.
username = urllib.parse.quote_plus(os.getenv("MONGO_USERNAME"))
password = urllib.parse.quote_plus(os.getenv("MONGO_PASSWORD"))

# The database name is also used as the authentication source.
uri = "mongodb://{}:{}@{}:{}/?authSource={}&authMechanism={}".format(
    username, password, os.getenv("MONGO_HOST"), os.getenv("MONGO_PORT"),
    os.getenv("MONGO_DATABASE"), "SCRAM-SHA-1")

client = MongoClient(uri)

In [None]:
# Connectivity check: send a "ping" command through the admin database and
# report the outcome. A failure is printed instead of being re-raised.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as ping_error:
    print(ping_error)

In [None]:
db = client[os.getenv("MONGO_DATABASE")]
collection = db.PriceProduction

# The operation type is embedded inside the "attributes" field, so every
# document is fetched here and any filtering has to happen afterwards.
properties = collection.find({})
properties_list = list(properties)

print(len(properties_list))

# Build the DataFrame from the fetched documents and discard the "_id" column.
df = pd.DataFrame(properties_list)
df = df.drop(columns=["_id"])

pd.set_option('display.max_columns', None)

In [None]:
from sklearn.metrics import mean_squared_error

# Remove outliers. Rows are kept only when the price lies strictly between
# 150 and 5000 and, after that, when the total surface lies strictly between
# 15 and 300.
price_ok = df.price.gt(150) & df.price.lt(5000)
df = df.loc[price_ok]

surface_ok = df.surface_total.gt(15) & df.surface_total.lt(300)
df = df.loc[surface_ok]

In [None]:
from sklearn.model_selection import train_test_split
# Target is the price column; the features are every remaining column.
y = df.price
X = df.drop(columns=["price"])

# Hold out 30% of the rows for evaluation. The split is seeded (same seed as
# the model below uses) so metrics and the persisted model are reproducible
# across kernel restarts instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyperparameters taken from the results of the large grid-search run.
GBoost2 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,
                                   max_depth=9, max_features='log2',
                                   min_samples_leaf=2, min_samples_split=5,
                                   loss='squared_error', random_state = 5, subsample=0.5)
GBoost_model2 = GBoost2.fit(X_train, y_train)

GBoost_pred2 = GBoost_model2.predict(X_test)

# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError. The square-root form works everywhere.
rmse = mean_squared_error(y_test, GBoost_pred2) ** 0.5
print("RMSE score is: " + str(rmse))
# `score` on the held-out set (R^2 for scikit-learn regressors).
print("Model score is: " + str(GBoost_model2.score(X_test, y_test)))

In [None]:
# Vamos a persistir el modelo que mejor nos dió

import pickle
from joblib import dump, load

# Persist the fitted model. joblib.dump does not create missing parent
# directories, so make sure the target folder exists first; otherwise a fresh
# checkout without "Models/" fails with FileNotFoundError after training.
model_path = 'Models/gboost_modelv2_3.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(GBoost_model2, model_path)