<a href="https://colab.research.google.com/github/MatteoAldovardi92/Taxi_Project/blob/main/MachineLearning_Pipeline1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os
import pandas as pd

# Mount Google Drive

import os
from google.colab import drive

mount_point = '/content/drive'

# The drive counts as mounted only when the mount point exists AND contains at
# least one entry. `and` short-circuits, so os.listdir is never called on a
# path that does not exist.
if os.path.exists(mount_point) and os.listdir(mount_point):
    print(f"Drive already mounted at {mount_point}.")
else:
    print(f"Drive not detected at {mount_point}. Attempting to mount...")
    drive.mount(mount_point)

# Dataset folder inside Google Drive; created if it is missing.
drive_path = '/content/drive/MyDrive/datasets'
os.makedirs(drive_path, exist_ok=True)

# Training sets, one per city (Bogotá, Mexico City, Quito), read in that order.
bog_train_df, mex_train_df, uio_train_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/datasets/bog_train_df.csv',
        '/content/drive/MyDrive/datasets/mex_train_df.csv',
        '/content/drive/MyDrive/datasets/uio_train_df.csv',
    )
]



Drive already mounted at /content/drive.


In [2]:
# Hold-out test sets: kept aside for model evaluation only.
bog_test_df, mex_test_df, uio_test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/datasets/bog_test_df.csv',
        '/content/drive/MyDrive/datasets/mex_test_df.csv',
        '/content/drive/MyDrive/datasets/uio_test_df.csv',
    )
]

target_column = 'trip_duration'

# For each city, separate the regressors (X) from the target (y). The original
# *_test_df frames are left untouched.
bog_X_test, bog_y_test = bog_test_df.drop(columns=[target_column]), bog_test_df[target_column]
mex_X_test, mex_y_test = mex_test_df.drop(columns=[target_column]), mex_test_df[target_column]
uio_X_test, uio_y_test = uio_test_df.drop(columns=[target_column]), uio_test_df[target_column]

################################################################





In [3]:


# Train Validation Splitting
# NB: For a Cross validation folding you can always restart from
# the datasets id_train and manually split between target and regressors
# Given that you have uneven small datasets this could be the best approach
####################


import pandas as pd
from sklearn.model_selection import train_test_split

# Per-city training frames (loaded in the first cell), keyed by the prefix used
# for the entries of `processed_splits`.
dataframes_to_split = {
    'bog_train': bog_train_df,
    'mex_train': mex_train_df,
    'uio_train': uio_train_df,
}

# Loop-invariant settings, defined once instead of on every iteration.
target_column = 'trip_duration'
diagnostic_columns = ('vendor_id', 'is_rush_hour')

# Maps '<name>_X_train', '<name>_X_val', '<name>_y_train', '<name>_y_val'
# to the corresponding split of each dataset.
processed_splits = {}

# Datasets that could not be split because the target column is missing.
skipped_datasets = []

for name, df_train in dataframes_to_split.items():
    print(f"\n--- Processing {name.upper()} (for Validation Split) ---")

    if target_column not in df_train.columns:
        print(f"Error: '{target_column}' not found in {name.upper()} DataFrame. Skipping.")
        skipped_datasets.append(name)
        continue

    # reset_index(drop=True) returns new objects with a fresh 0..n-1 index, so
    # nothing derived from the source frame is modified in place.
    y = df_train[target_column].reset_index(drop=True)
    X = df_train.drop(columns=[target_column]).reset_index(drop=True)

    # Random (non-stratified) 80/20 train/validation split with a fixed seed
    # so the split is reproducible across runs.
    X_train_final, X_validation, y_train_final, y_validation = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    processed_splits[f'{name}_X_train'] = X_train_final
    processed_splits[f'{name}_X_val'] = X_validation
    processed_splits[f'{name}_y_train'] = y_train_final
    processed_splits[f'{name}_y_val'] = y_validation

    print(f"Split completed for {name.upper()}:")
    print(f"  X_train shape: {X_train_final.shape}")
    print(f"  X_val shape: {X_validation.shape}")
    print(f"  y_train shape: {y_train_final.shape}")
    print(f"  y_val shape: {y_validation.shape}")

    # Optional sanity check: the split is not stratified, so compare the
    # category proportions between train and validation by eye.
    for column in diagnostic_columns:
        if column in X_train_final.columns:
            print(f"\n  {name.upper()} '{column}' proportions (Train vs. Val):")
            print("  Train:\n", X_train_final[column].value_counts(normalize=True))
            print("  Validation:\n", X_validation[column].value_counts(normalize=True))

# Fail with an explicit message instead of an opaque KeyError from the
# unpacking below when a dataset was skipped.
if skipped_datasets:
    raise KeyError(
        f"'{target_column}' column missing from: {', '.join(skipped_datasets)}; "
        "no train/validation split is available for these datasets."
    )

print("\n--- All DataFrames processed and split for training/validation ---")

# Convenience names for the per-city splits.
# Bogotá:
bog_X_train = processed_splits['bog_train_X_train']
bog_X_val = processed_splits['bog_train_X_val']
bog_y_train = processed_splits['bog_train_y_train']
bog_y_val = processed_splits['bog_train_y_val']

# Mexico City:
mex_X_train = processed_splits['mex_train_X_train']
mex_X_val = processed_splits['mex_train_X_val']
mex_y_train = processed_splits['mex_train_y_train']
mex_y_val = processed_splits['mex_train_y_val']

# Quito:
uio_X_train = processed_splits['uio_train_X_train']
uio_X_val = processed_splits['uio_train_X_val']
uio_y_train = processed_splits['uio_train_y_train']
uio_y_val = processed_splits['uio_train_y_val']



--- Processing BOG_TRAIN (for Validation Split) ---
Split completed for BOG_TRAIN:
  X_train shape: (1388, 6)
  X_val shape: (348, 6)
  y_train shape: (1388,)
  y_val shape: (348,)

  BOG_TRAIN 'vendor_id' proportions (Train vs. Val):
  Train:
 vendor_id
Bogotá              0.843660
Bogota UberBlack    0.079251
Bogotá UberX        0.058357
Bogotá UberVan      0.018012
Bogotá UberAngel    0.000720
Name: proportion, dtype: float64
  Validation:
 vendor_id
Bogotá              0.853448
Bogota UberBlack    0.080460
Bogotá UberX        0.057471
Bogotá UberVan      0.005747
Bogotá UberAngel    0.002874
Name: proportion, dtype: float64

  BOG_TRAIN 'is_rush_hour' proportions (Train vs. Val):
  Train:
 is_rush_hour
False    0.672911
True     0.327089
Name: proportion, dtype: float64
  Validation:
 is_rush_hour
False    0.692529
True     0.307471
Name: proportion, dtype: float64

--- Processing MEX_TRAIN (for Validation Split) ---
Split completed for MEX_TRAIN:
  X_train shape: (6076, 6)
  X_va

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge

# Columns to be one-hot encoded.
categorical_columns = ['vendor_id', 'is_rush_hour']

# Every remaining feature is treated as numerical. The column list is taken
# from the Bogotá training frame and reused for all cities.
# NOTE(review): assumes the Mexico City and Quito frames share this schema - confirm.
all_features = list(bog_X_train.columns)
numerical_columns = [
    feature for feature in all_features if feature not in categorical_columns
]

# Function to create a fresh pipeline
def make_pipeline():
    """Build a new, unfitted preprocessing + Ridge regression pipeline.

    The module-level ``categorical_columns`` are one-hot encoded with
    ``handle_unknown='ignore'`` and the module-level ``numerical_columns`` are
    standardized. Both lists are read at call time.

    Returns:
        Pipeline: two steps, ``'preprocessor'`` (a ColumnTransformer) followed
        by ``'regressor'`` (a Ridge model with default settings).
    """
    one_hot = OneHotEncoder(handle_unknown='ignore')
    scaler = StandardScaler()
    column_steps = [
        ('cat', one_hot, categorical_columns),
        ('num', scaler, numerical_columns),
    ]
    model_steps = [
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        ('regressor', Ridge()),
    ]
    return Pipeline(steps=model_steps)

# One independent pipeline per city, each fitted only on that city's training
# split (fit() returns the fitted pipeline itself).
bog_pipeline = make_pipeline().fit(bog_X_train, bog_y_train)
mex_pipeline = make_pipeline().fit(mex_X_train, mex_y_train)
uio_pipeline = make_pipeline().fit(uio_X_train, uio_y_train)

# Validation score of each pipeline on its own city's validation split
# (for scikit-learn regressors, score() is the R^2 coefficient).
print("BOG score:", bog_pipeline.score(X=bog_X_val, y=bog_y_val))
print("MEX score:", mex_pipeline.score(X=mex_X_val, y=mex_y_val))
print("UIO score:", uio_pipeline.score(X=uio_X_val, y=uio_y_val))


BOG score: 0.6468291451189592
MEX score: 0.5050759730649734
UIO score: 0.566955299392494


# Deployment:

In [30]:
import joblib
import os

# Directory for the serialized pipelines. It sits outside the Drive mount
# point ('/content/drive'), so the files are not written to Google Drive.
# Author's note: saving is not strictly needed, the goal is immediate deployment.
model_dir = '/content/trained_models'
os.makedirs(model_dir, exist_ok=True)

# Serialize each fitted pipeline to its own .pkl file.
for fitted_pipeline, file_name in (
    (bog_pipeline, 'bog_ridge_pipeline.pkl'),
    (mex_pipeline, 'mex_ridge_pipeline.pkl'),
    (uio_pipeline, 'uio_ridge_pipeline.pkl'),
):
    joblib.dump(fitted_pipeline, os.path.join(model_dir, file_name))

print(f"Models saved to {model_dir}")

Models saved to /content/trained_models


In [32]:
import joblib
import os

# Directory the pipelines were written to by the save cell.
model_dir = '/content/trained_models'

# Restore the three pipelines (Bogotá, Mexico City, Quito order).
# joblib.load unpickles its input, so only load files from a trusted source.
loaded_bog_pipeline, loaded_mex_pipeline, loaded_uio_pipeline = [
    joblib.load(os.path.join(model_dir, file_name))
    for file_name in (
        'bog_ridge_pipeline.pkl',
        'mex_ridge_pipeline.pkl',
        'uio_ridge_pipeline.pkl',
    )
]

print("Models loaded successfully.")

Models loaded successfully.


### Prediction:

In [33]:
# Take the first row of the Bogotá test set as a one-row DataFrame.
# The list inside .iloc[[0]] keeps the result two-dimensional (a DataFrame
# rather than a Series); the target column is then removed so only the
# features remain.
single_bog_test_point = (
    bog_test_df
    .iloc[[0]]
    .drop(columns=[target_column])
)

display(single_bog_test_point)

Unnamed: 0,vendor_id,dist_meters,wait_sec,geodetic_dist,mean_velocity,is_rush_hour
0,Bogotá UberX,18.976,1640,15.439039,17.172851,False


In [37]:
# Predict the trip duration of the selected row with the reloaded Bogotá
# pipeline; predict() returns one value per input row, so take element 0.
predicted_trip_duration = loaded_bog_pipeline.predict(single_bog_test_point)

print(f"Predicted trip duration for the selected Bogotá point: {predicted_trip_duration[0]:.4f}")

# Actual value for the same row. The features were selected by position with
# .iloc[[0]], so the target is read by position too; a plain [0] is a
# label-based lookup and only matches while the index is the default 0..n-1.
print(f"{bog_y_test.iloc[0]}")

Predicted trip duration for the selected Bogotá point: 1.1805
1.105
