<a href="https://colab.research.google.com/github/MammadovN/Machine_Learning/blob/main/projects/06_real-world-apps/taxi-trip-duration-prediction/notebooks/taxi_trip_duration_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')        # Authorise in the popup window

Mounted at /content/drive


In [2]:
from pathlib import Path, PurePosixPath
import os

# Root directory for the project inside Google Drive
PROJECT_ROOT = Path("/content/drive/MyDrive/taxi-trip-duration")

# Sub-directories you want
subdirs = [
    PROJECT_ROOT / "data" / "raw",
    PROJECT_ROOT / "data" / "processed",
    PROJECT_ROOT / "notebooks",
    PROJECT_ROOT / "src",
    PROJECT_ROOT / "models",
]

# Make the folders (parents=True handles nested paths)
for d in subdirs:
    d.mkdir(parents=True, exist_ok=True)

# Add a simple .gitignore if it doesn’t already exist
gitignore = PROJECT_ROOT / ".gitignore"
if not gitignore.exists():
    gitignore.write_text(
        "# Ignore large or generated files\n"
        "data/\n"
        "models/\n"
        ".ipynb_checkpoints/\n"
    )

# Display the resulting tree
print("Project structure:")
for root, _, _ in os.walk(PROJECT_ROOT):
    indent_level = len(PurePosixPath(root).relative_to(PROJECT_ROOT).parts)
    indent = "    " * indent_level + ("└── " if indent_level else "")
    print(f"{indent}{Path(root).name}/")


Project structure:
taxi-trip-duration/
    └── data/
        └── raw/
        └── processed/
    └── notebooks/
    └── src/
    └── models/


In [3]:
!ls -R /content/drive/MyDrive/taxi-trip-duration/src


/content/drive/MyDrive/taxi-trip-duration/src:


In [5]:
# 0-a.  Install Python libraries
!pip install -q polars lightgbm scikit-learn

# 0-b.  Add src/ to Python path  ➜  so imports work everywhere
import sys, pathlib
sys.path.append('/content/drive/MyDrive/taxi-trip-duration/src')


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.8/34.8 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m135.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/          # now the file exists
!chmod 600 ~/.kaggle/kaggle.json


In [11]:
!kaggle competitions download -c nyc-taxi-trip-duration \
        -p /content/drive/MyDrive/taxi-trip-duration/data/raw


In [13]:
# Proje içindeki raw klasörünü listele
!ls -lh /content/drive/MyDrive/taxi-trip-duration/data/raw


total 86M
-rw------- 1 root root 86M Dec 11  2019 nyc-taxi-trip-duration.zip


In [16]:
%%bash
# move to the raw-data folder
cd /content/drive/MyDrive/taxi-trip-duration/data/raw

# unzip every ZIP in that folder
for z in *.zip; do
  unzip -o "$z" -d .
done


Archive:  nyc-taxi-trip-duration.zip
  inflating: ./sample_submission.zip  
  inflating: ./test.zip              
  inflating: ./train.zip             
Archive:  sample_submission.zip
  inflating: ./sample_submission.csv  
Archive:  test.zip
  inflating: ./test.csv              
Archive:  train.zip
  inflating: ./train.csv             


In [17]:
%%writefile /content/drive/MyDrive/taxi-trip-duration/src/preprocess.py
import polars as pl

def clean_data(df: pl.DataFrame) -> pl.DataFrame:
    """Remove obvious outliers & nulls."""
    return (
        df.filter(
            (pl.col("trip_duration").is_between(60, 7200))
            & (pl.col("passenger_count") > 0)
        )
        .drop_nulls()
    )

Writing /content/drive/MyDrive/taxi-trip-duration/src/preprocess.py


In [18]:
%%writefile /content/drive/MyDrive/taxi-trip-duration/src/features.py
import polars as pl

def build_features(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
        pl.col("pickup_datetime").str.strptime(pl.Datetime).alias("pickup_dt")
    )
    return (
        df.with_columns([
            pl.col("pickup_dt").dt.hour().alias("pickup_hour"),
            pl.col("pickup_dt").dt.weekday().alias("pickup_wday"),
            pl.col("pickup_dt").dt.month().alias("pickup_month"),
            (
                (pl.col("pickup_longitude") - pl.col("dropoff_longitude")).abs()
              + (pl.col("pickup_latitude")  - pl.col("dropoff_latitude")).abs()
            ).alias("manhattan_dist")
        ])
        .select([
            "vendor_id", "passenger_count",
            "pickup_hour", "pickup_wday", "pickup_month",
            "manhattan_dist",
            "trip_duration"
        ])
    )


Writing /content/drive/MyDrive/taxi-trip-duration/src/features.py


In [21]:
%%writefile /content/drive/MyDrive/taxi-trip-duration/src/train.py
import polars as pl, joblib, math, lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from preprocess import clean_data
from features   import build_features
from pathlib import Path

def train_and_save(raw_csv: str, model_dir: str):
    df = pl.read_csv(raw_csv)
    df = clean_data(df)
    df = build_features(df)

    X = df.drop("trip_duration").to_pandas()
    y = df["trip_duration"].to_pandas()

    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    params = dict(
        objective="regression",
        metric="rmse",
        learning_rate=0.1,
        num_leaves=64,
        seed=42,
    )

    dtrain = lgb.Dataset(X_tr, y_tr)
    dval   = lgb.Dataset(X_val, y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(50)]      # ← NEW
    )

    rmsle = math.sqrt(mean_squared_log_error(y_val, model.predict(X_val)))
    print(f"Validation RMSLE: {rmsle:.4f}")

    Path(model_dir).mkdir(parents=True, exist_ok=True)
    joblib.dump(model, f"{model_dir}/lgbm.pkl")

if __name__ == "__main__":
    train_and_save(
        "/content/drive/MyDrive/taxi-trip-duration/data/raw/train.csv",
        "/content/drive/MyDrive/taxi-trip-duration/models"
    )


Writing /content/drive/MyDrive/taxi-trip-duration/src/train.py


In [22]:
!python /content/drive/MyDrive/taxi-trip-duration/src/train.py

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 1158223, number of used features: 6
[LightGBM] [Info] Start training from score 841.140317
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's rmse: 351.038
Validation RMSLE: 0.4203


In [23]:
import joblib, pandas as pd, datetime
from features import build_features

model = joblib.load('/content/drive/MyDrive/taxi-trip-duration/models/lgbm.pkl')

def predict_one(row_dict):
    df = pd.DataFrame([row_dict])
    # Convert to Polars, build features, back to pandas
    import polars as pl
    features = build_features(pl.from_pandas(df).with_columns(pl.lit(0).alias("trip_duration")))
    return int(model.predict(features.drop("trip_duration").to_pandas())[0])

example = dict(
    vendor_id=1, passenger_count=1,
    pickup_datetime=datetime.datetime.now().isoformat(),
    pickup_longitude=-73.985428, pickup_latitude=40.748817,
    dropoff_longitude=-73.985428, dropoff_latitude=40.748817,
)
print(predict_one(example), "seconds")


552 seconds
