In [None]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time


In [2]:
# Load the modelling table from the Parquet file (path is relative to the working directory).
df_mod_clean = pd.read_parquet(path='tabla_modelado.parquet')

In [3]:
# Display the loaded frame; the notebook renders a truncated view (first and last rows).
df_mod_clean

Unnamed: 0,vendor_name,passenger_count,trip_distance,rate_code_desc,total_amount,service_type,pickup_dow,pickup_month,pickup_year,pickup_hour,is_weekend,is_rush_hour,pickup_borough
0,"Curb Mobility, LLC",1.0,0.84,Standard rate,11.16,yellow,7,7,2022,17,True,True,Manhattan
1,"Curb Mobility, LLC",1.0,0.47,Standard rate,8.30,yellow,6,7,2022,15,False,False,Manhattan
2,"Curb Mobility, LLC",1.0,1.59,Standard rate,14.30,yellow,4,3,2022,14,False,False,Manhattan
3,"Curb Mobility, LLC",1.0,2.17,Standard rate,15.96,yellow,7,11,2022,15,True,False,Manhattan
4,"Curb Mobility, LLC",1.0,1.43,Standard rate,17.16,yellow,4,4,2022,11,False,False,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
639995,"Curb Mobility, LLC",1.0,2.68,Standard rate,24.66,green,7,7,2025,18,True,True,Manhattan
639996,"Curb Mobility, LLC",1.0,1.50,Standard rate,13.20,green,5,3,2025,20,False,True,Manhattan
639997,"Curb Mobility, LLC",1.0,1.44,Standard rate,12.42,green,6,2,2025,10,False,True,Manhattan
639998,"Curb Mobility, LLC",1.0,1.49,Standard rate,15.10,green,7,3,2025,16,True,True,Brooklyn


In [5]:
# Chronological split on pickup_year: 2022-2023 -> train, 2024 -> validation, 2025 -> test.
# Rows whose pickup_year is anything else match none of the masks and end up in no split.
_pickup_year = df_mod_clean["pickup_year"]
df_train = df_mod_clean.loc[_pickup_year.isin([2022, 2023])]
df_val = df_mod_clean.loc[_pickup_year == 2024]
df_test = df_mod_clean.loc[_pickup_year == 2025]
print(f"Train shape: {df_train.shape}, Val shape: {df_val.shape}, Test shape: {df_test.shape}")

target = "total_amount"
# Every column except the target, in the frame's original column order.
feature_cols = [name for name in df_mod_clean.columns if name != target]


def _features_and_target(frame):
    """Return (X, y): the frame without the target column, and the target column itself."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _features_and_target(df_train)
X_val, y_val = _features_and_target(df_val)
X_test, y_test = _features_and_target(df_test)


Train shape: (264755, 13), Val shape: (130157, 13), Test shape: (119552, 13)


In [35]:
# List the column names of the modelling frame.
df_mod_clean.columns

Index(['vendor_name', 'passenger_count', 'trip_distance', 'rate_code_desc',
       'total_amount', 'service_type', 'pickup_dow', 'pickup_month',
       'pickup_year', 'pickup_hour', 'is_weekend', 'is_rush_hour',
       'pickup_borough'],
      dtype='object')

In [6]:
# Preprocessing
# Only the columns listed below reach the models: ColumnTransformer's default
# remainder='drop' discards every other column (pickup_dow, pickup_month, pickup_hour).
# NOTE(review): confirm that leaving those three columns out is intentional.
categorical = ['vendor_name', 'rate_code_desc', 'service_type', 'pickup_borough', 'is_weekend', 'is_rush_hour']
numeric_features = ['trip_distance', 'passenger_count', 'pickup_year']

preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        # handle_unknown="ignore": categories unseen during fit encode as all-zero rows
        # instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ],
    # Always return a dense ndarray. With the default threshold the result silently
    # becomes a scipy sparse matrix once the one-hot block is sparse enough, and the
    # later np.vstack of the transformed splits only works on dense arrays.
    sparse_threshold=0,
)

# Fit the scaler/encoder on the training split only, then apply the fitted transform
# to validation and test so no statistics leak from those splits.
# (The previous X_*.copy() assignments were overwritten immediately and only cost memory.)
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
X_test_scaled = preprocessor.transform(X_test)


In [None]:
# Baseline (this whole cell is commented out, so nothing here runs)

  # import numpy as np
  # from sklearn.metrics import mean_squared_error, mean_absolute_error

  # # 1. Compute the mean of the target on the training set
  # y_train_mean = y_train.mean()

  # # 2. Predict that mean for validation and test
  # y_val_pred_baseline = np.full(len(y_val), y_train_mean)
  # y_test_pred_baseline = np.full(len(y_test), y_train_mean)

  # # 3. Compute metrics
  # # Validation
  # val_rmse_baseline = np.sqrt(mean_squared_error(y_val, y_val_pred_baseline))
  # val_mae_baseline = mean_absolute_error(y_val, y_val_pred_baseline)

  # # Test
  # test_rmse_baseline = np.sqrt(mean_squared_error(y_test, y_test_pred_baseline))
  # test_mae_baseline = mean_absolute_error(y_test, y_test_pred_baseline)

  # print(f"Baseline (Media):")
  # print(f"Validación - RMSE: {val_rmse_baseline:.4f}, MAE: {val_mae_baseline:.4f}")
  # print(f"Test - RMSE: {test_rmse_baseline:.4f}, MAE: {test_mae_baseline:.4f}")

  # Why is this useful?

  # The baseline gives a minimum reference point. If the complex model does not beat
  # this baseline, something is badly wrong.

  # Example interpretation:

  # - Baseline RMSE: 15.2
  # - Your model RMSE: 8.7
  # - Improvement: 43% better than always predicting the mean

  # Alternative baseline (simple linear regression):

  # from sklearn.linear_model import LinearRegression

  # # Using only one important feature
  # lr_baseline = LinearRegression()
  # lr_baseline.fit(X_train[['trip_distance']], y_train)

  # y_val_pred_lr = lr_baseline.predict(X_val[['trip_distance']])
  # val_rmse_lr = np.sqrt(mean_squared_error(y_val, y_val_pred_lr))



In [10]:
  # Stack train and validation: training rows first, validation rows right after them.
X_combined = np.concatenate([X_train_scaled, X_val_scaled], axis=0)
y_combined = np.concatenate([y_train, y_val])


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV

# 1. Three different base regressors.
# The support-vector member is LinearSVR instead of SVR(kernel='linear'): kernel SVR
# training time grows more than quadratically with the number of samples, which is
# impractical on a training split of this size. The grid only ever asked for a linear
# kernel, so the linear solver is the scalable counterpart.
# epsilon=0.1 matches SVR's default tube width (LinearSVR defaults to 0.0).
estimators = [
    ('linear', LinearRegression()),
    ('forest', RandomForestRegressor(random_state=42)),
    ('svm', LinearSVR(epsilon=0.1, random_state=42)),
]

# 2. Voting ensemble. VotingRegressor averages the base models' predictions
# (there is no hard/soft distinction for regressors; the variable name is kept
# so later cells keep working).
voting_hard = VotingRegressor(
    estimators=estimators,
    n_jobs=-1,
)

# Hyper-parameters of the base models, addressed as <estimator name>__<parameter>.
param_grid_voting = {
    # RandomForest parameters (prefix: forest__)
    'forest__n_estimators': [50],
    'forest__max_depth': [10],
    'forest__min_samples_split': [2],

    # LinearSVR parameters (prefix: svm__); gamma/kernel do not exist for a linear SVR.
    'svm__C': [0.1],
}

# Row positions inside X_combined: training rows come first, validation rows follow.
train_idx = np.arange(len(X_train))
val_idx = np.arange(len(X_train), len(X_combined))

# Grid search with a single predefined split instead of k-fold CV: every candidate is
# fitted on train_idx and scored on val_idx, which keeps the chronological
# train -> validation ordering.
grid_voting = GridSearchCV(
    estimator=voting_hard,
    param_grid=param_grid_voting,
    cv=[(train_idx, val_idx)],
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)






In [13]:
pip install torch torchvision torchaudio


Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.8.0-cp39-none-macosx_11_0_arm64.whl (73.6 MB)
[K     |████████████████████████████████| 73.6 MB 8.1 MB/s eta 0:00:012
[?25hCollecting torchvision
  Downloading torchvision-0.23.0-cp39-cp39-macosx_11_0_arm64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 39.6 MB/s eta 0:00:01
[?25hCollecting torchaudio
  Downloading torchaudio-2.8.0-cp39-cp39-macosx_11_0_arm64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 12.1 MB/s eta 0:00:01
[?25hCollecting sympy>=1.13.3
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 40.5 MB/s eta 0:00:01
[?25hCollecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 15.0 MB/s eta 0:00:01
[?25hCollecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
[K     |██████████████████████

In [14]:
import torch

# Report whether PyTorch's MPS backend is available on this machine.
mps_available = torch.backends.mps.is_available()
print(mps_available)


True


In [12]:
  # Train: run the grid search for the voting ensemble on the stacked train+validation data.
grid_voting.fit(X=X_combined, y=y_combined)

Fitting 1 folds for each of 324 candidates, totalling 324 fits


KeyboardInterrupt: 

In [None]:
  # Best hyper-parameters found by the search.
print("Mejores parámetros:", grid_voting.best_params_)

# GridSearchCV refits the winning configuration on everything passed to fit()
# (refit=True is the default), i.e. on training AND validation rows. Scoring that
# refit model on the validation split would evaluate it on rows it was trained on.
# Validation metrics therefore come from a fresh copy of the winning configuration
# fitted on the training split only (this costs one extra fit).
from sklearn.base import clone

val_model = clone(grid_voting.best_estimator_).fit(X_train_scaled, y_train)
y_val_pred_voting = val_model.predict(X_val_scaled)

# The test split was never seen by the search, so the refit (train+val) model is used.
y_test_pred_voting = grid_voting.predict(X_test_scaled)

# Validation metrics
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_voting))
val_mae = mean_absolute_error(y_val, y_val_pred_voting)
val_r2 = r2_score(y_val, y_val_pred_voting)

# Test metrics (previously the test predictions were computed but never scored)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_voting))
test_mae = mean_absolute_error(y_test, y_test_pred_voting)
test_r2 = r2_score(y_test, y_test_pred_voting)

print(f"Voting - Validación RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")
print(f"Voting - Validación R2: {val_r2:.4f}")
print(f"Voting - Test RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, R2: {test_r2:.4f}")

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

base_tree = DecisionTreeRegressor(random_state=42)

# scikit-learn 1.2 renamed BaggingRegressor's `base_estimator` argument to `estimator`
# and 1.4 removed the old name. Pick whichever name the installed version accepts so
# both the constructor call and the grid's nested parameter prefix stay valid.
_BASE_PARAM = (
    'estimator'
    if 'estimator' in BaggingRegressor().get_params(deep=False)
    else 'base_estimator'
)


def _make_tree_ensemble(bootstrap):
    """Build a 100-tree BaggingRegressor; `bootstrap` selects sampling with/without replacement."""
    return BaggingRegressor(
        **{_BASE_PARAM: base_tree},
        n_estimators=100,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1,
    )


def _make_split_search(model):
    """Wrap `model` in a grid search scored on the single predefined train/validation split."""
    return GridSearchCV(
        estimator=model,
        param_grid=param_grid_bagging,
        cv=[(train_idx, val_idx)],
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1,
    )


# BAGGING: each tree is trained on a sample drawn with replacement.
bagging_model = _make_tree_ensemble(bootstrap=True)

# PASTING: each tree is trained on a sample drawn without replacement.
# With max_samples=1.0 and max_features=1.0 every tree then receives the full training
# data, so those grid points give the ensemble no sampling diversity.
pasting_model = _make_tree_ensemble(bootstrap=False)

# Search space shared by both ensembles; the last two entries tune the base tree.
param_grid_bagging = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
    f'{_BASE_PARAM}__max_depth': [5, 10, 15, None],
    f'{_BASE_PARAM}__min_samples_split': [2, 5, 10],
}

grid_bagging = _make_split_search(bagging_model)
grid_pasting = _make_split_search(pasting_model)

In [None]:
# Run the bagging grid search on the stacked train+validation data.
grid_bagging.fit(X=X_combined, y=y_combined)

In [None]:
# Run the pasting grid search on the stacked train+validation data.
grid_pasting.fit(X=X_combined, y=y_combined)

In [None]:
# Stack the raw (un-encoded) train and validation features, training rows first.
# pd.concat keeps the result a DataFrame with its column names and per-column dtypes;
# np.vstack would collapse the frames into a plain ndarray without column names, and
# the CatBoost model in this notebook receives its categorical features by name.
X_combined_n = pd.concat([X_train, X_val], axis=0, ignore_index=True)

In [None]:
# CatBoost

from catboost import CatBoostRegressor

catboost_model = CatBoostRegressor(
    random_seed=42,
    verbose=False,
    # Column NAMES of the categorical features (not positional indices). Names can only
    # be resolved when the data passed to fit() carries column names, e.g. a DataFrame.
    cat_features=categorical,
)

# CatBoost search space
param_grid_catboost = {
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.3],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64, 128],
}

# Single predefined split over the stacked train+validation data: training rows occupy
# positions [0, len(X_train)) and validation rows the positions right after them.
# (Using np.arange(len(X_val)) for validation pointed at the first training rows, so
# every candidate was scored on data it had just been fitted on.)
_cb_train_idx = np.arange(len(X_train))
_cb_val_idx = np.arange(len(X_train), len(X_train) + len(X_val))

grid_catboost = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid_catboost,
    cv=[(_cb_train_idx, _cb_val_idx)],
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the CatBoost grid search on the stacked raw features and targets.
grid_catboost.fit(X=X_combined_n, y=y_combined)