In [53]:
from google.colab import files
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Input, Embedding, Dense, LayerNormalization,
    MultiHeadAttention, Dropout, GlobalAveragePooling1D,
    Reshape, StringLookup
)
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
df = pd.read_csv('routes.csv')
df

Unnamed: 0,ID маршрута,Дата отправки,Расстояние (км),Средняя скорость (км/ч),Загруженность дорог,Погодные условия,Тип транспорта,Вес груза (кг),Время суток,День недели,Фактическое время в пути (часы)
0,RTE69196,2019-01-01,1946,75,Средняя,Снег,Прицеп,5152.0,День,Вторник,29
1,RT66389,2019-01-01,1106,67,Средняя,Снег,Фургон,8122.0,Вечер,Вторник,14
2,RT95245,2019-01-01,2059,41,Высокая,Снег,Прицеп,13803.0,Утро,Вторник,41
3,RTE53292,2019-01-02,515,72,Средняя,Ясно,Фургон,2156.0,Утро,Среда,7
4,RTE10603,2019-01-02,700,78,Средняя,,Прицеп,15181.0,Утро,Среда,12
...,...,...,...,...,...,...,...,...,...,...,...
12188,MR91272,2022-09-29 00:00:00,468,52,Высокая,Ясно,Фургон,11214.0,Утро,Среда,9
12189,MR85652,2019-10-22 00:00:00,1715,115,Средняя,Ясно,Прицеп,2913.0,Вечер,Пятница,21
12190,RT95542,2022-06-23 00:00:00,1217,59,Малая,Туман,Фургон,17199.0,Вечер,Воскресенье,21
12191,RTE35659,2024-06-03 00:00:00,595,52,Средняя,Шторм,Фургон,6512.0,Утро,Суббота,13


In [None]:
# 1. Preprocessing

In [None]:
# Parse dates and extract year/month
df['Дата отправки'] = pd.to_datetime(df['Дата отправки'], dayfirst=True, errors='coerce')
df['month'] = df['Дата отправки'].dt.month
df['year' ] = df['Дата отправки'].dt.year

In [None]:
# Fill missing categorical values with modes
for col in ['Загруженность дорог', 'Погодные условия', 'Тип транспорта', 'Время суток']:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# Fill missing numerical values with medians
for col in ['Расстояние (км)', 'Средняя скорость (км/ч)', 'Вес груза (кг)']:
    df[col] = df[col].fillna(df[col].median())


In [None]:
# Drop unused columns
df = df.drop(columns=['ID маршрута', 'Дата отправки'])

In [None]:
# Define features X and target y
X = df.drop(columns=['Фактическое время в пути (часы)'])
y = df['Фактическое время в пути (часы)']

In [None]:
# 2. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# 3. Preprocessing Pipeline
numeric_features   = ['Расстояние (км)', 'Средняя скорость (км/ч)', 'Вес груза (кг)']
categorical_features = [
    'Загруженность дорог',
    'Погодные условия',
    'Тип транспорта',
    'Время суток',
    'День недели',
    'month',
    'year'
]

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [None]:
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [None]:
# Fit the preprocessor on training data
preprocessor.fit(X_train)

# Transform train & test for tree-based and MLP models
X_train_proc = preprocessor.transform(X_train)
X_test_proc  = preprocessor.transform(X_test)

In [None]:
# 4. Evaluation Function
def eval_reg(name, model, X_tr, X_te):
    pred = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    r2   = r2_score(y_test, pred)
    print(f"{name} → RMSE: {rmse:.4f}, R²: {r2:.4f}")


In [None]:
# === 5. Model 1: Random Forest ===
rf_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('rf', RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1
    ))
])

In [None]:
rf_pipeline.fit(X_train, y_train)

In [None]:
eval_reg("Random Forest", rf_pipeline, X_train, X_test)

Random Forest → RMSE: 2.8885, R²: 0.9272


In [None]:
# === 7. Model 2: Gradient Boosting (sklearn) ===
gb_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('gb', GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, random_state=42
    ))
])
gb_pipeline.fit(X_train, y_train)

In [None]:
eval_reg("Gradient Boosting", gb_pipeline, X_train, X_test)

Gradient Boosting → RMSE: 2.7846, R²: 0.9324


In [None]:
# === 8. Model 3: XGBoost ===
xgb_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('xgb', XGBRegressor(
        n_estimators=100, learning_rate=0.1,
        objective='reg:squarederror', random_state=42,
        n_jobs=-1
    ))
])
xgb_pipeline.fit(X_train, y_train)

In [None]:
eval_reg("XGBoost Regressor", xgb_pipeline, X_train, X_test)

XGBoost Regressor → RMSE: 2.8278, R²: 0.9302


In [None]:
# === 9. Model 4: MLPRegressor ===
mlp_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('mlp', MLPRegressor(
        hidden_layer_sizes=(128,64),
        activation='relu',
        solver='adam',
        max_iter=200,
        random_state=42
    ))
])
mlp_pipeline.fit(X_train, y_train)



In [None]:
eval_reg("MLP Regressor", mlp_pipeline, X_train, X_test)

MLP Regressor → RMSE: 3.1847, R²: 0.9115


In [None]:
# 10. Prepare for CNN & LSTM
# 3D shape: (samples, features, 1)
X_tr_nn = X_train_proc.toarray() if hasattr(X_train_proc, "toarray") else X_train_proc
X_te_nn = X_test_proc.toarray()  if hasattr(X_test_proc,  "toarray") else X_test_proc

n_features = X_tr_nn.shape[1]
X_tr_nn = X_tr_nn.reshape(-1, n_features, 1)
X_te_nn = X_te_nn.reshape(-1, n_features, 1)


In [None]:
# 11. Model 5: 1D CNN
cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(n_features, 1)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1)
])
cnn.compile(optimizer='adam', loss='mse')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
cnn.fit(X_tr_nn, y_train, validation_split=0.1,
        epochs=30, batch_size=32, callbacks=[es], verbose=2)

Epoch 1/30
275/275 - 3s - 11ms/step - loss: 148.2304 - val_loss: 101.4565
Epoch 2/30
275/275 - 1s - 3ms/step - loss: 80.2763 - val_loss: 65.7709
Epoch 3/30
275/275 - 1s - 3ms/step - loss: 45.2728 - val_loss: 33.9682
Epoch 4/30
275/275 - 1s - 4ms/step - loss: 23.6103 - val_loss: 19.4118
Epoch 5/30
275/275 - 1s - 3ms/step - loss: 18.6654 - val_loss: 17.6115
Epoch 6/30
275/275 - 1s - 3ms/step - loss: 17.5201 - val_loss: 17.6687
Epoch 7/30
275/275 - 1s - 5ms/step - loss: 17.0557 - val_loss: 16.1977
Epoch 8/30
275/275 - 1s - 3ms/step - loss: 16.4352 - val_loss: 16.0378
Epoch 9/30
275/275 - 1s - 3ms/step - loss: 15.8740 - val_loss: 15.3918
Epoch 10/30
275/275 - 1s - 4ms/step - loss: 15.5045 - val_loss: 14.9030
Epoch 11/30
275/275 - 1s - 4ms/step - loss: 15.0349 - val_loss: 15.1974
Epoch 12/30
275/275 - 2s - 6ms/step - loss: 14.7649 - val_loss: 14.1251
Epoch 13/30
275/275 - 1s - 4ms/step - loss: 14.4837 - val_loss: 13.9109
Epoch 14/30
275/275 - 1s - 4ms/step - loss: 14.0414 - val_loss: 14.119

<keras.src.callbacks.history.History at 0x7ee5901c81d0>

In [None]:
pred_cnn = cnn.predict(X_te_nn).ravel()
print(f"CNN → RMSE: {np.sqrt(mean_squared_error(y_test, pred_cnn)):.3f}, "
      f"R²: {r2_score(y_test, pred_cnn):.3f}")

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
CNN → RMSE: 3.483, R²: 0.894


In [None]:
# 12. Model 6: LSTM

# Convert to dense NumPy array before reshaping for LSTM
X_tr_lstm = X_train_proc.toarray().reshape(-1, 1, n_features)
X_te_lstm = X_test_proc.toarray().reshape(-1, 1, n_features)

lstm = Sequential([
    LSTM(64, input_shape=(1, n_features)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1)
])
lstm.compile(optimizer='adam', loss='mse')

  super().__init__(**kwargs)


In [None]:
history = lstm.fit(
    X_tr_lstm, y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=32,
    callbacks=[es],
    verbose=2
)

Epoch 1/30
275/275 - 4s - 15ms/step - loss: 115.3361 - val_loss: 10.7270
Epoch 2/30
275/275 - 1s - 3ms/step - loss: 12.4413 - val_loss: 8.8403
Epoch 3/30
275/275 - 1s - 4ms/step - loss: 11.7925 - val_loss: 8.3437
Epoch 4/30
275/275 - 1s - 5ms/step - loss: 11.1385 - val_loss: 8.6162
Epoch 5/30
275/275 - 1s - 3ms/step - loss: 11.2148 - val_loss: 8.1090
Epoch 6/30
275/275 - 1s - 5ms/step - loss: 10.6528 - val_loss: 8.1755
Epoch 7/30
275/275 - 1s - 4ms/step - loss: 10.4790 - val_loss: 8.2476
Epoch 8/30
275/275 - 1s - 5ms/step - loss: 10.4495 - val_loss: 8.2801
Epoch 9/30
275/275 - 1s - 5ms/step - loss: 10.1778 - val_loss: 7.9804
Epoch 10/30
275/275 - 2s - 5ms/step - loss: 10.3496 - val_loss: 8.3112
Epoch 11/30
275/275 - 1s - 4ms/step - loss: 9.8371 - val_loss: 7.8593
Epoch 12/30
275/275 - 1s - 4ms/step - loss: 9.9265 - val_loss: 7.8579
Epoch 13/30
275/275 - 1s - 5ms/step - loss: 9.6976 - val_loss: 7.9531
Epoch 14/30
275/275 - 1s - 5ms/step - loss: 9.6996 - val_loss: 8.2607
Epoch 15/30
275/

In [None]:
pred_lstm = lstm.predict(X_te_lstm).ravel()
print(
    f"LSTM → RMSE: {np.sqrt(mean_squared_error(y_test, pred_lstm)):.3f}, "
    f"R²: {r2_score(y_test, pred_lstm):.3f}"
)

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
LSTM → RMSE: 2.844, R²: 0.929


In [None]:
results = {
    'Model': ['Random Forest', 'Gradient Boosting', 'XGBoost Regressor', 'MLP Regressor', 'CNN', 'LSTM'],
    'RMSE': [],
    'R²': []
}

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, rf_pipeline.predict(X_test))))
results['R²'].append(r2_score(y_test, rf_pipeline.predict(X_test)))

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, gb_pipeline.predict(X_test))))
results['R²'].append(r2_score(y_test, gb_pipeline.predict(X_test)))

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, xgb_pipeline.predict(X_test))))
results['R²'].append(r2_score(y_test, xgb_pipeline.predict(X_test)))

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, mlp_pipeline.predict(X_test))))
results['R²'].append(r2_score(y_test, mlp_pipeline.predict(X_test)))

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, pred_cnn)))
results['R²'].append(r2_score(y_test, pred_cnn))

results['RMSE'].append(np.sqrt(mean_squared_error(y_test, pred_lstm)))
results['R²'].append(r2_score(y_test, pred_lstm))

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,RMSE,R²
0,Random Forest,2.888549,0.927212
1,Gradient Boosting,2.784648,0.932354
2,XGBoost Regressor,2.827789,0.930242
3,MLP Regressor,3.184692,0.911522
4,CNN,3.483495,0.89414
5,LSTM,2.844055,0.929437


In [None]:
import joblib

joblib.dump(rf_pipeline, 'rf_pipeline.pkl')
joblib.dump(gb_pipeline, 'gb_pipeline.pkl')
joblib.dump(xgb_pipeline, 'xgb_pipeline.pkl')
joblib.dump(mlp_pipeline, 'mlp_pipeline.pkl')

cnn.save('cnn_model.h5')
lstm.save('lstm_model.h5')

from google.colab import files
files.download('rf_pipeline.pkl')
files.download('gb_pipeline.pkl')
files.download('xgb_pipeline.pkl')
files.download('mlp_pipeline.pkl')
files.download('cnn_model.h5')
files.download('lstm_model.h5')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [51]:
# Tab Transformer
# Scale numeric features
num_scaler = StandardScaler()
num_scaler.fit(X_train[numeric_features])

# Apply scaling
X_train_num = num_scaler.transform(X_train[numeric_features])
X_test_num  = num_scaler.transform(X_test[numeric_features])

# Build DataFrames for ease of dict conversion
X_train_scaled = X_train.copy()
X_test_scaled  = X_test.copy()
X_train_scaled[numeric_features] = X_train_num
X_test_scaled[numeric_features]  = X_test_num

In [54]:
# Build StringLookup + Embedding for each categorical column
lookups = {}
emb_dim  = 16   # embedding dimension for each feature
for col in categorical_features:
    sl = StringLookup(output_mode='int', oov_token='[UNK]')
    sl.adapt(X_train[col].astype(str))
    lookups[col] = sl

In [87]:
# Create inputs & embeddings
inputs, embeddings = [], []
for col in categorical_features:
    inp = Input(shape=(), name=col, dtype=tf.string)
    x   = lookups[col](inp)
    x   = Embedding(
        input_dim=lookups[col].vocabulary_size(),
        output_dim=emb_dim
    )(x)
    x   = Reshape((emb_dim,))(x)
    inputs.append(inp)
    embeddings.append(x)

for col in numeric_features:
    inp = Input(shape=(1,), name=col.replace('/', '_'), dtype=tf.float32)
    x   = Dense(emb_dim)(inp)
    inputs.append(inp)
    embeddings.append(x)

In [88]:
# Stack into a sequence: (batch, num_tokens, emb_dim)
x = tf.keras.layers.concatenate(embeddings, axis=1)
x = tf.keras.layers.Lambda(lambda t: tf.expand_dims(t, axis=1))(x)

In [89]:
# Transformer-Encoder block
def transformer_block(x, head_size, num_heads, ff_dim, dropout=0.1):
    attn = MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size
    )(x, x)
    attn = Dropout(dropout)(attn)
    x1   = LayerNormalization(epsilon=1e-6)(x + attn)
    ff   = Dense(ff_dim, activation='relu')(x1)
    ff   = Dense(x.shape[-1])(ff)  # Output shape now (None, 160)
    ff   = Dropout(dropout)(ff)
    return LayerNormalization(epsilon=1e-6)(x1 + ff)


In [90]:
# Apply a couple of encoder blocks
x = transformer_block(x, head_size=8, num_heads=2, ff_dim=64)
x = transformer_block(x, head_size=8, num_heads=2, ff_dim=64)

In [91]:
# Pool & MLP head
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
output = Dense(1)(x)



In [92]:
from tensorflow.keras.models import Model
model = Model(inputs, output)
model.compile(optimizer='adam', loss='mse')


In [93]:
# Prepare data as a dict of arrays
def df_to_dict(df):
    d = {}
    for col in categorical_features:
        d[col] = df[col].astype(str).values
    for col in numeric_features:
        d[col.replace('/', '_')] = df[col].values.astype('float32').reshape(-1, 1)
    return d

train_dict = df_to_dict(X_train_scaled)
test_dict  = df_to_dict(X_test_scaled)

In [94]:
# Train
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
history = model.fit(
    train_dict, y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=32,
    callbacks=[es],
    verbose=2
)


Epoch 1/30
275/275 - 14s - 53ms/step - loss: 34.9706 - val_loss: 9.2682
Epoch 2/30
275/275 - 5s - 20ms/step - loss: 16.1646 - val_loss: 8.3696
Epoch 3/30
275/275 - 4s - 14ms/step - loss: 14.4198 - val_loss: 8.3097
Epoch 4/30
275/275 - 5s - 19ms/step - loss: 14.0834 - val_loss: 8.3188
Epoch 5/30
275/275 - 4s - 15ms/step - loss: 13.9585 - val_loss: 9.2329
Epoch 6/30
275/275 - 3s - 11ms/step - loss: 14.1746 - val_loss: 8.3942
Epoch 7/30
275/275 - 3s - 10ms/step - loss: 13.7505 - val_loss: 8.0549
Epoch 8/30
275/275 - 6s - 23ms/step - loss: 13.9790 - val_loss: 8.2491
Epoch 9/30
275/275 - 4s - 14ms/step - loss: 13.8289 - val_loss: 8.2196
Epoch 10/30
275/275 - 3s - 10ms/step - loss: 14.0695 - val_loss: 9.5228
Epoch 11/30
275/275 - 3s - 11ms/step - loss: 13.2952 - val_loss: 7.9178
Epoch 12/30
275/275 - 5s - 19ms/step - loss: 13.3597 - val_loss: 7.9287
Epoch 13/30
275/275 - 5s - 18ms/step - loss: 13.4848 - val_loss: 8.8477
Epoch 14/30
275/275 - 6s - 21ms/step - loss: 13.0433 - val_loss: 8.3240


In [95]:
# Evaluate
y_pred = model.predict(test_dict).ravel()
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
print(f"TabTransformer → RMSE: {rmse:.3f}, R²: {r2:.3f}")

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
TabTransformer → RMSE: 2.873, R²: 0.928


In [99]:
from google.colab import files
import joblib

model.save('tab_transformer_model.h5') #Save the Tab Transformer model


files.download('tab_transformer_model.h5') #Download the Tab Transformer model




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>