In [2]:
import pandas as pd
import numpy as np
# Read the properties CSV and discard the 'District' column in one expression.
dados = pd.read_csv('sao-paulo-properties-april-2019.csv').drop(columns=['District'])

In [3]:
# A notebook cell only renders the value of its final expression, so the
# intermediate summaries are passed to display() explicitly; otherwise the
# head() and describe() results are computed and silently discarded.
display(dados.head())
dados.info()  # info() prints its report itself
display(dados.describe())
dados.isnull().sum()  # last expression: rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13640 entries, 0 to 13639
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             13640 non-null  int64  
 1   Condo             13640 non-null  int64  
 2   Size              13640 non-null  int64  
 3   Rooms             13640 non-null  int64  
 4   Toilets           13640 non-null  int64  
 5   Suites            13640 non-null  int64  
 6   Parking           13640 non-null  int64  
 7   Elevator          13640 non-null  int64  
 8   Furnished         13640 non-null  int64  
 9   Swimming Pool     13640 non-null  int64  
 10  New               13640 non-null  int64  
 11  Negotiation Type  13640 non-null  object 
 12  Property Type     13640 non-null  object 
 13  Latitude          13640 non-null  float64
 14  Longitude         13640 non-null  float64
dtypes: float64(2), int64(11), object(2)
memory usage: 1.6+ MB


Unnamed: 0,0
Price,0
Condo,0
Size,0
Rooms,0
Toilets,0
Suites,0
Parking,0
Elevator,0
Furnished,0
Swimming Pool,0


In [6]:
# Keep only the rows that have no missing values.
dados = dados.dropna()

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardize the 'Latitude' column; the scaler is fitted on the whole frame.
scaler = StandardScaler()
latitude_scaled = scaler.fit_transform(dados[['Latitude']])
dados[['Latitude']] = latitude_scaled

In [8]:
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['Negotiation Type', 'Property Type']

# sparse_output=False returns a dense array that can be wrapped in a DataFrame
# directly; handle_unknown='ignore' encodes categories unseen during fit as
# all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_data = encoder.fit_transform(dados[categorical_features])

# Reuse dados' own index. pd.concat(axis=1) aligns rows by index label, so a
# default RangeIndex on the encoded frame would pair rows incorrectly (and
# introduce NaN rows) whenever dados' index has gaps, e.g. after dropna().
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_features),
    index=dados.index,
)

dados = pd.concat([dados, encoded_df], axis=1)

# The original string columns are replaced by their one-hot columns.
dados = dados.drop(columns=categorical_features)

In [9]:
from sklearn.preprocessing import StandardScaler

numerical_features = ['Latitude']

# Fit a fresh StandardScaler on the listed columns, then write the scaled
# values back into the frame.
# NOTE(review): 'Latitude' was already standardized by the earlier scaling
# cell, so this second pass leaves the values essentially unchanged -- confirm
# whether this cell is still needed.
scaler = StandardScaler()
scaler.fit(dados[numerical_features])
dados[numerical_features] = scaler.transform(dados[numerical_features])

In [10]:
!pip install mlflow==2.3.0



In [11]:
!pip install setuptools==65.5.0



In [12]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


from pathlib import Path

# Treat the session as Google Colab when the google.colab package can be imported.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # "file://./mlruns" is not a relative file URI: "." is parsed as the host
    # and the path becomes "/mlruns" at the filesystem root. Build an absolute
    # file URI for the ./mlruns directory instead.
    mlflow.set_tracking_uri(Path("mlruns").resolve().as_uri())
else:
    mlflow.set_tracking_uri("file:///tmp/mlruns")

def train_and_evaluate_model(model, model_name):
    """Fit ``model``, score it on the test split and record the run in MLflow.

    The data is not passed in: the function reads the module-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` variables, which must
    exist before it is called.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        model_name: Used as the MLflow run name, the logged ``model_name``
            parameter, the artifact path and the registered model name.

    Returns:
        Tuple ``(mse, r2)`` computed on the test split.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Insertion order fixes the order in which the metrics are logged.
        scores = {
            "mse": mean_squared_error(y_test, predictions),
            "r2": r2_score(y_test, predictions),
        }

        mlflow.log_param("model_name", model_name)
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted model as a run artifact and register it (a new
        # registry version is created when the name already exists).
        mlflow.sklearn.log_model(model, model_name, registered_model_name=model_name)

        print(f"{model_name} - MSE: {scores['mse']}, R2: {scores['r2']}")

        return scores["mse"], scores["r2"]



# 'Price' is the regression target; every remaining column is a feature.
y = dados['Price']
X = dados.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train and log both candidate models; the final call stays the last
# expression so its (mse, r2) tuple is rendered as the cell output.
linear_regression = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)

train_and_evaluate_model(linear_regression, "Regressão Linear")
train_and_evaluate_model(decision_tree, "Árvore de Decisão")

Registered model 'Regressão Linear' already exists. Creating a new version of this model...
2025/04/21 13:56:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Regressão Linear, version 7
Created version '7' of model 'Regressão Linear'.


Regressão Linear - MSE: 185553269347.21902, R2: 0.4580061481784199


Registered model 'Árvore de Decisão' already exists. Creating a new version of this model...
2025/04/21 13:56:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Árvore de Decisão, version 7


Árvore de Decisão - MSE: 49783051571.43678, R2: 0.8545856509477893


Created version '7' of model 'Árvore de Decisão'.


(49783051571.43678, 0.8545856509477893)

In [13]:
!pip install mlflow==2.3.0 mlflow-skinny==2.3.0



In [None]:
!mlflow ui --backend-store-uri file:///tmp/mlruns

[2025-04-21 13:51:10 +0000] [32081] [INFO] Starting gunicorn 20.1.0
[2025-04-21 13:51:10 +0000] [32081] [INFO] Listening at: http://127.0.0.1:5000 (32081)
[2025-04-21 13:51:10 +0000] [32081] [INFO] Using worker: sync
[2025-04-21 13:51:10 +0000] [32090] [INFO] Booting worker with pid: 32090
[2025-04-21 13:51:10 +0000] [32091] [INFO] Booting worker with pid: 32091
[2025-04-21 13:51:10 +0000] [32092] [INFO] Booting worker with pid: 32092
[2025-04-21 13:51:10 +0000] [32093] [INFO] Booting worker with pid: 32093
[2025-04-21 13:51:57 +0000] [32081] [INFO] Handling signal: int

Aborted!
[2025-04-21 13:51:57 +0000] [32092] [INFO] Worker exiting (pid: 32092)
[2025-04-21 13:51:57 +0000] [32090] [INFO] Worker exiting (pid: 32090)
[2025-04-21 13:51:57 +0000] [32091] [INFO] Worker exiting (pid: 32091)
[2025-04-21 13:51:57 +0000] [32093] [INFO] Worker exiting (pid: 32093)
[2025-04-21 13:51:58 +0000] [32081] [INFO] Shutting down: Master


In [14]:
!pip install fastapi uvicorn pyngrok nest_asyncio



In [15]:
from fastapi import FastAPI, Request, Response
import mlflow
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import os
from getpass import getpass

# Never hard-code the ngrok authtoken in the notebook. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# The token that used to be embedded in this cell has been exposed and should
# be revoked from the ngrok account.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)
app = FastAPI()

# Re-read the raw CSV and fit a fresh OneHotEncoder on it, so request payloads
# are encoded with the categories present in that file. Created once at module
# level, outside the request handler.
training_data = pd.read_csv('sao-paulo-properties-april-2019.csv')
training_data = training_data.drop(columns=['District'], errors='ignore')  # 'District' is not a model feature

categorical_features = ['Negotiation Type', 'Property Type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(training_data[categorical_features])


# Load the most recent registered version of the decision-tree model.
model_name = "Árvore de Decisão"
model_uri = f"models:/{model_name}/latest"
loaded_model = mlflow.pyfunc.load_model(model_uri)


from fastapi import HTTPException

# The "Árvore de Decisão" model is trained in the earlier training cell on a
# frame whose 'Latitude' column was standardized with StandardScaler after
# dropna(). Compute the same statistics here so incoming raw latitudes can be
# put on that scale (StandardScaler uses the population std, hence ddof=0).
_latitude_train = training_data.dropna()['Latitude']
_LATITUDE_MEAN = _latitude_train.mean()
_LATITUDE_STD = _latitude_train.std(ddof=0) or 1.0  # avoid dividing by zero


@app.post("/predict")
async def predict(input_data: dict):
    """Predict a price for one property described in the request body.

    Args:
        input_data: Parsed JSON body. It must contain an ``"input_data"``
            object holding the property attributes, including every column
            named in ``categorical_features``. ``District`` and ``Price`` are
            ignored if present; missing numeric features default to 0.

    Returns:
        ``{"prediction": <value>}`` with the model output for that single row.

    Raises:
        HTTPException: 422 when ``"input_data"`` is absent or not an object,
            or when a categorical field is missing.
    """
    payload = input_data.get('input_data')
    if not isinstance(payload, dict):
        raise HTTPException(
            status_code=422,
            detail="Request body must contain an 'input_data' object.",
        )

    absent = [name for name in categorical_features if name not in payload]
    if absent:
        raise HTTPException(
            status_code=422,
            detail=f"Missing required fields: {absent}",
        )

    input_df = pd.DataFrame([payload])
    input_df = input_df.drop(columns=['District', 'Price'], errors='ignore')  # not model features

    # Apply the same standardization the training frame received; a raw
    # latitude would fall outside the range the model was fitted on.
    if 'Latitude' in input_df.columns:
        input_df['Latitude'] = (input_df['Latitude'] - _LATITUDE_MEAN) / _LATITUDE_STD

    # One-hot encode the categorical columns with the module-level encoder and
    # swap them in for the original string columns.
    encoded_data = encoder.transform(input_df[categorical_features])
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))

    input_df = pd.concat([input_df, encoded_df], axis=1)
    input_df = input_df.drop(columns=categorical_features)

    # Best-effort defaults: any numeric feature the client omitted is set to 0.
    missing_features = ['Toilets', 'Suites', 'Parking', 'Elevator', 'Furnished', 'Swimming Pool', 'New', 'Condo', 'Size', 'Rooms', 'Latitude', 'Longitude']
    for feature in missing_features:
        if feature not in input_df.columns:
            input_df[feature] = 0

    # Order the columns like the CSV (skipping its first column -- presumably
    # the 'Price' target; confirm) followed by the one-hot columns, and keep
    # only those present in the request frame.
    all_features = list(training_data.columns)
    all_features = all_features[1:]
    all_features_encoded = all_features + list(encoder.get_feature_names_out(categorical_features))
    input_df = input_df[[c for c in all_features_encoded if c in input_df.columns]]

    prediction = loaded_model.predict(input_df)

    return {"prediction": prediction.tolist()[0]}
API_PORT = 8000  # single source of truth for the tunnel target and the server port

# Patch asyncio so uvicorn can start its loop inside the notebook kernel.
nest_asyncio.apply()

ngrok_tunnel = ngrok.connect(API_PORT)  # Expose the API port through ngrok
print('Public URL:', ngrok_tunnel.public_url)

try:
    # Blocks until the server is stopped (e.g. by interrupting the cell).
    uvicorn.run(app, host="0.0.0.0", port=API_PORT)
finally:
    # Close the tunnel so the public URL stops forwarding once the server is down.
    ngrok.disconnect(ngrok_tunnel.public_url)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


INFO:     Started server process [32671]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Public URL: https://6a34-34-72-152-88.ngrok-free.app


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [32671]


In [18]:
!pip install -U matplotlib
!pip install -U seaborn

Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.10.0
    Uninstalling matplotlib-3.10.0:
      Successfully uninstalled matplotlib-3.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 2.3.0 requires pyarrow<12,>=4.0.0, but you have pyarrow 12.0.1 which is incompatible.
bigframes 1.42.0 requires fsspec>=2023.3.0, but you have fsspec 2022.11.0 which is incompatible.
bigframes 1.42.0 requires gcsfs>=2023.3.0, but you have gcsfs 2022

In [21]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shutil
import os
import schedule
import time
from sklearn.preprocessing import OneHotEncoder

!pip install gdown
import gdown

file_id = '1Kj0mK9hzRhNnKE0nA_pcyKVZEPvcoB5S'
file_path = f'https://drive.google.com/uc?id={file_id}'
gdown.download(file_path, 'sao-paulo-properties-april-2019.csv', quiet=False)


def train_and_evaluate_model(model, model_name):
    """Train ``model``, evaluate it on the test split and log everything to MLflow.

    NOTE: this redefines the function of the same name from the earlier
    training cell. It relies on the module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` variables rather than taking the data as
    arguments.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        model_name: Run name, logged parameter value, artifact path and
            registered model name.

    Returns:
        ``(mse, r2)`` measured on the test split.
    """
    run = mlflow.start_run(run_name=model_name)
    with run:
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        test_mse = mean_squared_error(y_test, predicted)
        test_r2 = r2_score(y_test, predicted)
        evaluation = (test_mse, test_r2)

        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("mse", test_mse)
        mlflow.log_metric("r2", test_r2)

        # Save the fitted estimator with the run and register it under model_name.
        mlflow.sklearn.log_model(
            model,
            model_name,
            registered_model_name=model_name,
        )

        print(f"{model_name} - MSE: {test_mse}, R2: {test_r2}")

        return evaluation


def load_data():
    """Read the dataset CSV from the working directory and return a cleaned frame.

    The 'District' column is removed when present, and every row containing a
    missing value is dropped. The surviving rows keep their original index
    labels.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    return (
        pd.read_csv("sao-paulo-properties-april-2019.csv")
        .drop(columns=["District"], errors='ignore')
        .dropna()
    )


def preprocess_data(dados):
    """One-hot encode the categorical columns and split features from the target.

    Args:
        dados: DataFrame containing the 'Negotiation Type' and 'Property Type'
            columns plus the 'Price' target. Its index may have gaps (for
            example after ``dropna()``).

    Returns:
        Tuple ``(X, y)``: ``X`` is ``dados`` with the categorical columns
        replaced by their one-hot columns and 'Price' removed; ``y`` is the
        'Price' column.

    Raises:
        KeyError: If a categorical column or 'Price' is missing from ``dados``.
    """
    categorical_features = ['Negotiation Type', 'Property Type']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(dados[categorical_features])

    # Reuse the caller's index: pd.concat(axis=1) aligns rows by index label,
    # so a default RangeIndex here would mispair rows and introduce NaN rows
    # whenever rows had been dropped from ``dados``.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_features),
        index=dados.index,
    )

    dados = pd.concat([dados, encoded_df], axis=1)
    dados = dados.drop(columns=categorical_features, errors='ignore')

    X = dados.drop(columns=["Price"], errors='ignore')
    y = dados["Price"]
    return X, y


def retrain_model():
    """Reload the data, retrain both candidate models and keep the best one.

    The train/test split is published through the module-level ``X_train``,
    ``X_test``, ``y_train`` and ``y_test`` variables because
    ``train_and_evaluate_model`` reads them from there. The module-level
    ``best_model``, ``best_mse`` and ``best_r2`` are updated when the
    lowest-MSE candidate of this round beats the stored ``best_mse``.

    Previously only the decision tree was compared against ``best_mse`` and
    the linear-regression scores were computed but never used; both
    candidates now compete.
    """
    global X_train, X_test, y_train, y_test, best_model, best_mse, best_r2

    dados = load_data()
    X, y = preprocess_data(dados)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    candidates = [
        (LinearRegression(), "Linear Regression"),
        (DecisionTreeRegressor(random_state=42), "Decision Tree Regression"),
    ]

    # Train and log every candidate, remembering its test-split scores.
    results = []
    for candidate, candidate_name in candidates:
        candidate_mse, candidate_r2 = train_and_evaluate_model(candidate, candidate_name)
        results.append((candidate_mse, candidate_r2, candidate))

    # Lowest MSE wins this round; it replaces the stored best only if strictly better.
    round_mse, round_r2, round_model = min(results, key=lambda result: result[0])

    if round_mse < best_mse:
        best_model = round_model
        best_mse = round_mse
        best_r2 = round_r2
        print("New best model trained!")
    else:
        print("Existing model is still the best.")


# Module-level copies of the data and split. retrain_model() recomputes the
# split itself, so these mainly serve interactive inspection.
dados = load_data()
X, y = preprocess_data(dados)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initial "no model yet" state: any finite MSE beats infinity.
best_model = None
best_mse = float("inf")
best_r2 = 0

# Train once immediately, then schedule a retraining every 24 hours.
retrain_model()

retrain_job = schedule.every(24).hours.do(retrain_model)

try:
    # Blocks the cell: poll the scheduler once per second until interrupted.
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the cell is the expected way to stop; exit without a traceback.
    print("Retraining scheduler stopped.")
finally:
    # Remove the job so re-running this cell does not stack duplicate
    # retraining jobs on the shared default scheduler.
    schedule.cancel_job(retrain_job)



Downloading...
From: https://drive.google.com/uc?id=1Kj0mK9hzRhNnKE0nA_pcyKVZEPvcoB5S
To: /content/sao-paulo-properties-april-2019.csv
100%|██████████| 1.21M/1.21M [00:00<00:00, 81.1MB/s]
Registered model 'Linear Regression' already exists. Creating a new version of this model...
2025/04/21 14:02:03 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Linear Regression, version 3
Created version '3' of model 'Linear Regression'.


Linear Regression - MSE: 185553269347.2191, R2: 0.4580061481784198


Registered model 'Decision Tree Regression' already exists. Creating a new version of this model...
2025/04/21 14:02:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Decision Tree Regression, version 3
Created version '3' of model 'Decision Tree Regression'.


Decision Tree Regression - MSE: 58357871550.75089, R2: 0.8295389367313536
New best model trained!


KeyboardInterrupt: 