In [12]:
!pip install scikit-optimize



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from google.colab import drive
from sklearn.model_selection import train_test_split


In [None]:

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
base_path = "/content/drive/MyDrive/DataSet/Dataset/"
train_path = "/content/drive/MyDrive/DataSet/Dataset/train.csv"
test_path = "/content/drive/MyDrive/DataSet/Dataset/test.csv"


In [None]:
base_path = "/content/drive/MyDrive/DataSet/Dataset/"
train_path = base_path + "train.csv"
test_path  = base_path + "test.csv"

train_df = pd.read_csv(train_path, nrows=50000)
test_df  = pd.read_csv(test_path)

print(train_df.head())


                             key  fare_amount          pickup_datetime  \
0    2009-06-15 17:26:21.0000001          4.5  2009-06-15 17:26:21 UTC   
1    2010-01-05 16:52:16.0000002         16.9  2010-01-05 16:52:16 UTC   
2   2011-08-18 00:35:00.00000049          5.7  2011-08-18 00:35:00 UTC   
3    2012-04-21 04:30:42.0000001          7.7  2012-04-21 04:30:42 UTC   
4  2010-03-09 07:51:00.000000135          5.3  2010-03-09 07:51:00 UTC   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0        -73.844311        40.721319         -73.841610         40.712278   
1        -74.016048        40.711303         -73.979268         40.782004   
2        -73.982738        40.761270         -73.991242         40.750562   
3        -73.987130        40.733143         -73.991567         40.758092   
4        -73.968095        40.768008         -73.956655         40.783762   

   passenger_count  
0                1  
1                1  
2                2  
3       

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # raio da terra em km
    return c * r

def prepare(df):
    df = df.copy()

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['year'] = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['day'] = df['pickup_datetime'].dt.day
    df['hour'] = df['pickup_datetime'].dt.hour

    df['distance_km'] = haversine(df['pickup_longitude'], df['pickup_latitude'],
                                  df['dropoff_longitude'], df['dropoff_latitude'])

    df = df[(df['distance_km'] > 0) & (df['distance_km'] < 50)]
    df = df[df['fare_amount'] > 0]
    df = df[df['passenger_count'] > 0]

    return df


In [None]:
train_df = prepare(train_df)

features = ["distance_km", "hour", "month", "passenger_count"]
X = train_df[features]
y = train_df["fare_amount"]


In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
def rf_objective(params):
    n_estimators = params[0]
    max_depth = params[1]
    min_samples_split = params[2]
    min_samples_leaf = params[3]

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1
    )

    cv = KFold(n_splits=2, shuffle=True, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_mean_squared_error")
    return -score.mean()


In [None]:
search_space = [
    Integer(50, 150, name='n_estimators'),
    Integer(5, 30, name='max_depth'),
    Integer(2, 15, name='min_samples_split'),
    Integer(1, 10, name='min_samples_leaf')
]


In [13]:
opt = gp_minimize(
    rf_objective,
    search_space,
    n_calls=10,
    random_state=42,
    verbose=True
)

best_params = opt.x
print("Melhores parâmetros:", best_params)


Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 26.0252
Function value obtained: 21.0264
Current minimum: 21.0264
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 14.0113
Function value obtained: 20.8567
Current minimum: 20.8567
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 6.3677
Function value obtained: 21.2584
Current minimum: 20.8567
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 5.0655
Function value obtained: 20.9577
Current minimum: 20.8567
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 4.8777
Function value obtained: 20.9756
Current minimum: 20.8567
Iteration No: 6 started

In [15]:
model_rf = RandomForestRegressor(
    n_estimators=best_params[0],
    max_depth=best_params[1],
    min_samples_split=best_params[2],
    min_samples_leaf=best_params[3],
    random_state=42,
    n_jobs=-1
)

model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_val)

mse_rf = mean_squared_error(y_val, pred_rf)
print("MSE (RandomForest):", mse_rf)


MSE (RandomForest): 20.21018330780453


In [16]:
model_lr = LinearRegression()
model_knn = KNeighborsRegressor(n_neighbors=5)
model_dt = DecisionTreeRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_knn.fit(X_train, y_train)
model_dt.fit(X_train, y_train)


In [17]:
ensemble = VotingRegressor(
    estimators=[
        ('RF', model_rf),
        ('LR', model_lr),
        ('KNN', model_knn),
        ('DT', model_dt)
    ],
    weights=[0.9, 0.1, 0.09, 0.05]
)


In [18]:
ensemble.fit(X_train, y_train)
pred_ensemble = ensemble.predict(X_val)

mse_ensemble = mean_squared_error(y_val, pred_ensemble)
print("MSE (Voting Ensemble Peso):", mse_ensemble)


MSE (Voting Ensemble Peso): 20.19738237050107


In [19]:
ensemble = VotingRegressor([
    ('RF', model_rf),
    ('LR', model_lr),
    ('KNN', model_knn),
    ('DT', model_dt)
])

ensemble.fit(X_train, y_train)
pred_ensemble = ensemble.predict(X_val)

mse_ensemble = mean_squared_error(y_val, pred_ensemble)
print("MSE (Voting Ensemble):", mse_ensemble)


MSE (Voting Ensemble): 22.187793627519753


In [20]:
y_pred = model_rf.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print("RMSE RandomForest:", rmse)


RMSE RandomForest: 4.495573746231345


In [21]:
!pip install gradio==3.41.2





In [22]:
import gradio as gr
from datetime import datetime
import pandas as pd

def prever_tarifa(pickup_datetime, pickup_longitude, pickup_latitude,
                  dropoff_longitude, dropoff_latitude, passenger_count):

    dt = datetime.fromisoformat(pickup_datetime)

    hour = dt.hour
    month = dt.month


    distance_km = haversine(
        pickup_longitude, pickup_latitude,
        dropoff_longitude, dropoff_latitude
    )


    data = {
        "distance_km": [distance_km],
        "hour": [hour],
        "month": [month],
        "passenger_count": [passenger_count]
    }
    X_new = pd.DataFrame(data)


    fare_pred = ensemble.predict(X_new)[0]



    return round(float(fare_pred), 2)


In [23]:
inputs = [
    gr.Textbox(label="Data e hora da corrida (YYYY-MM-DD HH:MM:SS)", value="2013-07-06 17:18:00"),
    gr.Number(label="Pickup longitude", value=-73.985428),
    gr.Number(label="Pickup latitude", value=40.748817),
    gr.Number(label="Dropoff longitude", value=-73.985000),
    gr.Number(label="Dropoff latitude", value=40.758896),
    gr.Number(label="Número de passageiros", value=1)
]

output = gr.Number(label="Tarifa prevista (USD)")

demo = gr.Interface(
    fn=prever_tarifa,
    inputs=inputs,
    outputs=output,
    title="Predição de Tarifa de Táxi em NYC",
    description="Digite os dados da corrida e veja a previsão de tarifa usando um ensemble de modelos (VotingRegressor)."
)

demo.launch()


IMPORTANT: You are using gradio version 3.41.2, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [24]:
import gradio as gr
import pandas as pd
from datetime import datetime
import math

locations = {
    "Times Square": (-73.985130, 40.758896),
    "Central Park (5th Ave Entrance)": (-73.974187, 40.764356),
    "World Trade Center": (-74.010300, 40.711600),
    "Chelsea": (-73.993500, 40.746500),
    "Brooklyn Bridge Park": (-73.990300, 40.700300),
    "Madison Square Garden": (-73.993400, 40.750500),
    "Financial District": (-74.005970, 40.712776),
    "Grand Central Terminal": (-73.977229, 40.752726),
    "JFK Airport": (-73.778139, 40.641311)
}


def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1)*math.cos(lat2)*math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a))
    r = 6371
    return c * r



def prever(pickup_place, dropoff_place, pickup_datetime, passenger_count):
    pickup_lon, pickup_lat = locations[pickup_place]
    dropoff_lon, dropoff_lat = locations[dropoff_place]

    dt = datetime.fromisoformat(pickup_datetime)
    hour = dt.hour
    month = dt.month

    distance_km = haversine(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)

    X = pd.DataFrame({
        "distance_km": [distance_km],
        "hour": [hour],
        "month": [month],
        "passenger_count": [passenger_count]
    })

    fare = ensemble.predict(X)[0]
    return round(float(fare), 2)


demo = gr.Interface(
    fn=prever,
    inputs=[
        gr.Dropdown(list(locations.keys()), label="Local de Partida"),
        gr.Dropdown(list(locations.keys()), label="Local de Destino"),
        gr.Textbox(label="Data e Hora (YYYY-MM-DD HH:MM:SS)", value="2025-11-26 17:18:00"),
        gr.Slider(1, 4, step=1, label="Passageiros", value=1)
    ],
    outputs=gr.Number(label="Tarifa Prevista (USD)"),
    title="Predição de Tarifa de Táxi em NYC",
    description="Selecione o local de partida, destino e obtenha a previsão da tarifa."
)

demo.launch()


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

