# Прогноз часового спроса такси по районам — улучшенная версия

## Пайплайн предобработки / FE / генерации данных

In [6]:
import os
import glob
import zipfile
import gdown
import holidays
import pandas as pd
import numpy as np
from meteostat import Stations, Hourly
from datetime import datetime
from pathlib import Path

In [7]:
def load_raw_data() -> pd.DataFrame:
    current_dir = Path().resolve()
    target_dir = current_dir.parent / 'mfdp_data'

    file_paths = [f for f in target_dir.iterdir() if f.is_file()]
    dfs = []
    for path in file_paths:
        df = pd.read_parquet(path)
        fname = str(path).lower()
        if 'yellow' in fname:
            df = df.rename(columns={
                'tpep_pickup_datetime': 'pickup_datetime',
                'PULocationID': 'location_id',
                'trip_distance': 'distance',
                'fare_amount': 'cost',
            })
        elif 'green' in fname:
            df = df.rename(columns={
                'lpep_pickup_datetime': 'pickup_datetime',
                'PULocationID': 'location_id',
                'trip_distance': 'distance',
                'fare_amount': 'cost',
            })
        elif 'fhvhv' in fname:
            df = df.rename(columns={
                'pickup_datetime': 'pickup_datetime', 
                'PULocationID': 'location_id',
                'trip_miles': 'distance',
                'base_passenger_fare': 'cost',
            })
        if ('pickup_datetime' in df.columns and 'location_id' in df.columns and
                'distance' in df.columns and 'cost' in df.columns):
            df = df[['pickup_datetime', 'location_id', 'distance', 'cost']]
            allowed_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 217, 218, 219, 220, 221, 222, 223, 224, 227, 235, 240, 241, 242, 243, 245, 248, 250, 251, 252, 253, 254, 257, 258, 259, 260, 264, 265]

            df = df[df['location_id'].isin(allowed_ids)]

            df = df[(df['distance'] > 0) & (df['cost'] > 0)]
            df['cost_per_mile'] = df['cost'] / df['distance']
            
            df['cost_per_mile'] = df['cost_per_mile'].replace([float('inf'), -float('inf')], None)
            df = df[df['cost_per_mile'] < 100]
            df = df.drop(columns=['cost', 'distance'])
            df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
            df = df.dropna().drop_duplicates()
            dfs.append(df)
    combined = pd.concat(dfs, ignore_index=True).drop_duplicates().reset_index(drop=True)
    combined = combined[(combined['pickup_datetime'] >= '2024-01-01') & (combined['pickup_datetime'] < '2025-04-01')]
    combined['date'] = combined['pickup_datetime'].dt.date
    combined['hour'] = combined['pickup_datetime'].dt.hour
    combined = combined.drop(columns=['pickup_datetime'])
    return combined

In [8]:
def aggregate_trips(df: pd.DataFrame) -> pd.DataFrame:
    grouped = df.groupby(['date', 'hour', 'location_id']).agg(
        avg_cost_per_mile=('cost_per_mile', 'mean')
    ).reset_index()
    grouped['trips_count'] = df.groupby(['date', 'hour', 'location_id']).size().values
    return grouped

In [9]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    us_holidays = holidays.US()
    non_working = {'New Year\'s Day', 'MLK Day', 'Washington\'s Birthday', 'Memorial Day',
                   'Juneteenth', 'Independence Day', 'Labor Day', 'Thanksgiving', 'Christmas Day'}
    df['date'] = pd.to_datetime(df['date'])
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_holiday'] = df['date'].apply(lambda d: us_holidays.get(d.date()) in non_working)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['day_of_year'] = df['date'].dt.dayofyear
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
    df['is_pre_holiday'] = df['date'].shift(-1).apply(lambda d: us_holidays.get(d.date()) in non_working)
    df['is_post_holiday'] = df['date'].shift(1).apply(lambda d: us_holidays.get(d.date()) in non_working)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    return df

In [10]:
def merge_weather(df: pd.DataFrame) -> pd.DataFrame:
    start = datetime(2024, 1, 1)
    end = datetime(2025, 5, 1)
    station = Stations().nearby(40.7128, -74.0060).fetch(1).index[0]
    weather = Hourly(station, start, end).fetch().reset_index()
    weather['date'] = weather['time'].dt.date
    weather['hour'] = weather['time'].dt.hour
    weather = weather[['date', 'hour', 'temp', 'prcp', 'wspd']]
    weather['date'] = pd.to_datetime(weather['date'])
    return df.merge(weather, on=['date', 'hour'], how='left')

In [11]:
def save_data(df: pd.DataFrame, output: str='result_lstm_cost.csv') -> pd.DataFrame:
    df['datetime'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['hour'].astype(str) + ':00')
    df = df.sort_values(['location_id', 'datetime']).reset_index(drop=True)
    df = df.drop(['datetime'], axis=1)
    df.to_csv(output, index=False)
    df.head(n=15).to_csv('result_lstm_test_cost.csv', index=False)
    return df

In [12]:
def pipeline() -> pd.DataFrame:
    df = load_raw_data()
    df = aggregate_trips(df)
    df = engineer_features(df)
    df = merge_weather(df)
    df = save_data(df)
    return df

In [13]:
df1 = load_raw_data()
df1.head()

Unnamed: 0,location_id,cost_per_mile,date,hour
0,161,16.116608,2024-01-01,0
1,137,6.401274,2024-01-01,0
2,79,9.126263,2024-01-01,0
3,148,14.592453,2024-01-01,0
4,95,4.045013,2024-01-01,0


In [14]:
df1[df1['location_id'] == 2].head(1000)

Unnamed: 0,location_id,cost_per_mile,date,hour
280512,2,4.265306,2024-01-01,10
1606929,2,17.598253,2024-01-04,15
1941147,2,3.329898,2024-01-05,8
1971110,2,4.445983,2024-01-05,9
2030622,2,4.282034,2024-01-05,12
...,...,...,...,...
276863579,2,3.625000,2025-02-23,8
277173343,2,3.943662,2025-02-28,10
278609481,2,4.561404,2025-03-15,4
279202937,2,4.182891,2025-03-24,7


In [15]:
df2 = aggregate_trips(df1)
df2.head()

Unnamed: 0,date,hour,location_id,avg_cost_per_mile,trips_count
0,2024-01-01,0,3,7.375202,75
1,2024-01-01,0,4,7.834235,288
2,2024-01-01,0,5,5.871854,19
3,2024-01-01,0,6,6.332942,32
4,2024-01-01,0,7,6.671421,630


In [16]:
df3 = engineer_features(df2)
df3.head()

Unnamed: 0,date,hour,location_id,avg_cost_per_mile,trips_count,day_of_week,month,is_weekend,is_holiday,is_month_start,is_month_end,day_of_year,week_of_year,is_pre_holiday,is_post_holiday,hour_sin,hour_cos
0,2024-01-01,0,3,7.375202,75,0,1,0,True,1,0,1,1,True,False,0.0,1.0
1,2024-01-01,0,4,7.834235,288,0,1,0,True,1,0,1,1,True,True,0.0,1.0
2,2024-01-01,0,5,5.871854,19,0,1,0,True,1,0,1,1,True,True,0.0,1.0
3,2024-01-01,0,6,6.332942,32,0,1,0,True,1,0,1,1,True,True,0.0,1.0
4,2024-01-01,0,7,6.671421,630,0,1,0,True,1,0,1,1,True,True,0.0,1.0


In [17]:
df4 = merge_weather(df3)
df4.head()



Unnamed: 0,date,hour,location_id,avg_cost_per_mile,trips_count,day_of_week,month,is_weekend,is_holiday,is_month_start,is_month_end,day_of_year,week_of_year,is_pre_holiday,is_post_holiday,hour_sin,hour_cos,temp,prcp,wspd
0,2024-01-01,0,3,7.375202,75,0,1,0,True,1,0,1,1,True,False,0.0,1.0,6.0,0.0,11.0
1,2024-01-01,0,4,7.834235,288,0,1,0,True,1,0,1,1,True,True,0.0,1.0,6.0,0.0,11.0
2,2024-01-01,0,5,5.871854,19,0,1,0,True,1,0,1,1,True,True,0.0,1.0,6.0,0.0,11.0
3,2024-01-01,0,6,6.332942,32,0,1,0,True,1,0,1,1,True,True,0.0,1.0,6.0,0.0,11.0
4,2024-01-01,0,7,6.671421,630,0,1,0,True,1,0,1,1,True,True,0.0,1.0,6.0,0.0,11.0


In [18]:
df5 = save_data(df4)
df5.head()

Unnamed: 0,date,hour,location_id,avg_cost_per_mile,trips_count,day_of_week,month,is_weekend,is_holiday,is_month_start,is_month_end,day_of_year,week_of_year,is_pre_holiday,is_post_holiday,hour_sin,hour_cos,temp,prcp,wspd
0,2024-01-01,16,1,6.941176,1,0,1,0,True,1,0,1,1,True,True,-0.8660254,-0.5,6.7,0.0,18.4
1,2024-01-02,18,1,4.5,1,1,1,0,False,0,0,2,1,False,False,-1.0,-1.83697e-16,6.1,0.0,7.6
2,2024-01-02,19,1,5.443038,1,1,1,0,False,0,0,2,1,False,False,-0.9659258,0.258819,7.8,0.0,7.6
3,2024-01-03,12,1,5.192744,1,2,1,0,False,0,0,3,1,False,False,1.224647e-16,-1.0,1.7,0.0,14.8
4,2024-01-03,15,1,7.507042,1,2,1,0,False,0,0,3,1,False,False,-0.7071068,-0.7071068,5.0,0.0,11.2


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from torch.utils.data import Dataset, DataLoader
import torch

df = pd.read_csv('result_lstm_cost.csv', parse_dates=['date'])

features = [
    'location_id', 'trips_count', 'hour', 'temp', 'prcp', 'wspd',
    'day_of_week', 'month', 'is_weekend', 'is_holiday',
    'is_month_start', 'is_month_end', 'day_of_year', 'week_of_year',
    'is_pre_holiday', 'is_post_holiday'
]
target = 'avg_cost_per_mile'

X = df[features]
y = df[target]

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

categorical_features = ['location_id']
numeric_features = [c for c in features if c not in categorical_features]

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features)
])

X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc   = preprocessor.transform(X_val)
X_test_proc  = preprocessor.transform(X_test)

class TabularDataset(Dataset):
    def __init__(self, X, y):
        arr = X.toarray() if hasattr(X, "toarray") else X
        self.X = torch.tensor(arr, dtype=torch.float32)
        self.y = torch.tensor(y.values if hasattr(y, "values") else y,
                              dtype=torch.float32).unsqueeze(1)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = TabularDataset(X_train_proc, y_train)
val_ds   = TabularDataset(X_val_proc,   y_val)
test_ds  = TabularDataset(X_test_proc,  y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(test_ds,  batch_size=64)

In [3]:
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import numpy as np
import pandas as pd
from IPython.display import display, Markdown

results = []

simple_models = {
    'XGBoost': XGBRegressor(tree_method='hist', random_state=42, verbosity=1),
    'MLP': MLPRegressor(
        hidden_layer_sizes=(64,32),
        max_iter=500,
        tol=1e-4,
        random_state=42
    )
}

for name, model in simple_models.items():
    display(Markdown(f'## Training **{name}**'))
    model.fit(X_train_proc, y_train)
    display(Markdown(f'**{name}** training completed.'))
    
    filename = f'{name.lower()}.joblib'
    joblib.dump(model, filename)
    display(Markdown(f'Saved **{name}** model to `{filename}`.'))

    y_pred = model.predict(X_test_proc)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2   = r2_score(y_test, y_pred)
    display(Markdown(f'- MAE: {mae:.4f}  \n- RMSE: {rmse:.4f}  \n- R²: {r2:.4f}'))
    results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2})

class TransformerRegressor(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=2, dim_feedforward=128):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.input_proj(x)
        x = x.unsqueeze(1)
        x = self.transformer(x)
        x = x.squeeze(1)
        return self.fc_out(x)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = X_train_proc.shape[1]
model_t = TransformerRegressor(input_dim, d_model=128, nhead=8,
                               num_layers=2, dim_feedforward=256).to(device)
optimizer = optim.Adam(model_t.parameters(), lr=1e-3)
criterion = nn.MSELoss()

best_val_loss = float('inf')
epoch_logs = []

display(Markdown('## Training **TransformerRegressor**'))
for epoch in range(1, 11):
    model_t.train()
    epoch_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model_t(xb), yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    avg_train_loss = epoch_loss / len(train_loader)

    model_t.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            val_losses.append(criterion(model_t(xb), yb).item())
    avg_val_loss = np.mean(val_losses)
    
    epoch_logs.append({
        'Epoch': epoch,
        'Train Loss': avg_train_loss,
        'Val Loss': avg_val_loss
    })
    display(Markdown(f'Epoch {epoch} — Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}'))
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model_t.state_dict(), 'best_transformer.pth')
        display(Markdown('Saved best TransformerRegressor state to `best_transformer.pth`.'))

logs_df = pd.DataFrame(epoch_logs)
display(Markdown('### Transformer Training Logs'))
display(logs_df)

model_t.load_state_dict(torch.load('best_transformer.pth', map_location=device))
model_t.eval()
preds, true = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds.append(model_t(xb).cpu().numpy())
        true.append(yb.numpy())
preds = np.vstack(preds).squeeze()
true  = np.vstack(true).squeeze()

mae_t  = mean_absolute_error(true, preds)
rmse_t = np.sqrt(mean_squared_error(true, preds))
r2_t   = r2_score(true, preds)
display(Markdown('## TransformerRegressor Evaluation'))
display(Markdown(f'- MAE: {mae_t:.4f}  \\- RMSE: {rmse_t:.4f}  \\- R²: {r2_t:.4f}'))
results.append({'Model': 'Transformer', 'MAE': mae_t, 'RMSE': rmse_t, 'R2': r2_t})

results_df = pd.DataFrame(results).set_index('Model')
display(Markdown('## Summary of All Models'))
display(results_df)

## Training **XGBoost**

**XGBoost** training completed.

Saved **XGBoost** model to `xgboost.joblib`.

- MAE: 0.6208  
- RMSE: 1.1499  
- R²: 0.5611

## Training **MLP**

**MLP** training completed.

Saved **MLP** model to `mlp.joblib`.

- MAE: 0.5612  
- RMSE: 1.1041  
- R²: 0.5953

## Training **TransformerRegressor**

Epoch 1 — Train Loss: 1.3754, Val Loss: 1.3382

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 2 — Train Loss: 1.3052, Val Loss: 1.2857

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 3 — Train Loss: 1.2861, Val Loss: 1.2897

Epoch 4 — Train Loss: 1.2803, Val Loss: 1.2678

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 5 — Train Loss: 1.2716, Val Loss: 1.2633

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 6 — Train Loss: 1.2680, Val Loss: 1.2380

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 7 — Train Loss: 1.2621, Val Loss: 1.2292

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 8 — Train Loss: 1.2525, Val Loss: 1.2243

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 9 — Train Loss: 1.2452, Val Loss: 1.2053

Saved best TransformerRegressor state to `best_transformer.pth`.

Epoch 10 — Train Loss: 1.2393, Val Loss: 1.2128

### Transformer Training Logs

Unnamed: 0,Epoch,Train Loss,Val Loss
0,1,1.3754,1.338154
1,2,1.305162,1.285714
2,3,1.286072,1.289695
3,4,1.280303,1.267782
4,5,1.27164,1.263299
5,6,1.268009,1.238031
6,7,1.262146,1.229211
7,8,1.25248,1.224269
8,9,1.245184,1.205259
9,10,1.239338,1.212758


## TransformerRegressor Evaluation

- MAE: 0.5956  \- RMSE: 1.1306  \- R²: 0.5757

## Summary of All Models

Unnamed: 0_level_0,MAE,RMSE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XGBoost,0.620842,1.149897,0.561067
MLP,0.561194,1.104085,0.595344
Transformer,0.595583,1.130591,0.575682


In [4]:
joblib.dump(preprocessor, 'preprocessor.joblib')

['preprocessor.joblib']