In [1]:
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
!pip install catboost
from catboost import CatBoostRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping



In [2]:
# 1. Load dataset
df = pd.read_csv('delays.csv')
df


Unnamed: 0,ID заказа,Дата отправки,Расстояние (км),Опыт водителя (лет),Вес груза (кг),Тип груза,Погодные условия,Сезон,Время суток,День недели,Задержка доставки (дней),Группа опыта
0,OD15142,2019-01-01,1113,15.0,7733,Опасный,Снег,Зима,День,Вторник,0,11-20
1,DL98960,2019-01-01,735,9.0,7477,Обычный,Метель,Зима,Утро,Вторник,0,6-10
2,OD41368,2019-01-02,1129,13.0,7589,Обычный,Ясно,Зима,Ночь,Среда,0,11-20
3,OR96207,2019-01-02,1130,20.0,14930,Опасный,Ясно,Зима,Вечер,Среда,2,11-20
4,OD46273,2019-01-02,1201,9.0,11187,Замороженный,Ясно,Зима,Ночь,Среда,0,6-10
...,...,...,...,...,...,...,...,...,...,...,...,...
9478,OD27481,2024-12-31,542,16.0,12083,Опасный,Метель,Зима,Ночь,Вторник,0,11-20
9479,OD20726,2024-12-31,902,8.0,10363,Обычный,Ясно,Зима,Вечер,Вторник,0,6-10
9480,DL37403,2024-12-31,1106,8.0,13567,Обычный,Ясно,Зима,Ночь,Вторник,0,6-10
9481,OD48127,2024-12-31,1978,32.0,27829,Обычный,Снег,Зима,Ночь,Вторник,4,30+


In [3]:
# 2. Preprocessing
df['Дата отправки'] = pd.to_datetime(
    df['Дата отправки'],
    dayfirst=True,
    errors='coerce'
)

df.dropna(subset=['Дата отправки','Задержка доставки (дней)'], inplace=True)

df['year']    = df['Дата отправки'].dt.year
df['month']   = df['Дата отправки'].dt.month
df['day']     = df['Дата отправки'].dt.day
df['weekday'] = df['Дата отправки'].dt.weekday
df['hour']    = df['Дата отправки'].dt.hour

In [4]:
# Ensure target is numeric and drop any rows where it's missing
df['Задержка доставки (дней)'] = pd.to_numeric(
    df['Задержка доставки (дней)'], errors='coerce'
)
df.dropna(subset=['Задержка доставки (дней)'], inplace=True)

In [5]:
# Drop ID and original date
df.drop(['ID заказа', 'Дата отправки'], axis=1, inplace=True)

In [6]:
# Separate features and target
y = df['Задержка доставки (дней)'].astype(np.float32)
X = df.drop('Задержка доставки (дней)', axis=1)

In [7]:
# Impute features
# Numeric
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
# Categorical
categorical_cols = X.select_dtypes(['object']).columns
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

In [8]:
# One-hot encode categorical features
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [9]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Scale numeric features
df_scaler = StandardScaler()
X_train[numeric_cols] = df_scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = df_scaler.transform(X_test[numeric_cols])

In [11]:
# 3. Model 1: Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [12]:
y_pred_rf = rf.predict(X_test)
print('Random Forest RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print('Random Forest R2:', r2_score(y_test, y_pred_rf))


Random Forest RMSE: 0.2805627310814894
Random Forest R2: 0.9653741597720084


In [13]:
# 4. Model 2: XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

In [14]:
y_pred_xgb = xgb.predict(X_test)
print('XGBoost RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print('XGBoost R2:', r2_score(y_test, y_pred_xgb))

XGBoost RMSE: 0.27875889804562753
XGBoost R2: 0.9658179879188538


In [15]:
# 5. Model 3: LightGBM Regressor
lgbm = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 3003, number of used features: 30
[LightGBM] [Info] Start training from score 0.964702


In [16]:
y_pred_lgbm = lgbm.predict(X_test)
print('LightGBM RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_lgbm)))
print('LightGBM R2:', r2_score(y_test, y_pred_lgbm))

LightGBM RMSE: 0.29808894788105217
LightGBM R2: 0.9609130207527949


In [17]:
# 6. Model 4: CatBoost Regressor
cat = CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=0, random_state=42)
cat.fit(X_train, y_train)


<catboost.core.CatBoostRegressor at 0x7a8783eae210>

In [18]:
y_pred_cat = cat.predict(X_test)
print('CatBoost RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_cat)))
print('CatBoost R2:', r2_score(y_test, y_pred_cat))


CatBoost RMSE: 0.30218570763447516
CatBoost R2: 0.9598312608902975


In [19]:
# 7. Prepare data for neural networks (CNN & LSTM)
X_train_nn = X_train.values.astype(np.float32).reshape(
    (X_train.shape[0], X_train.shape[1], 1)
)
X_test_nn = X_test.values.astype(np.float32).reshape(
    (X_test.shape[0],  X_test.shape[1], 1)
)

y_train_nn = y_train.values.astype(np.float32)
y_test_nn  = y_test.values.astype(np.float32)

# Early stopping callback
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [21]:
# 8. Model 5: 1D CNN
n_timesteps, n_features = X_train_nn.shape[1], X_train_nn.shape[2]

cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu',
           input_shape=(n_timesteps, n_features)),
    GlobalMaxPooling1D(),
    Dense(50, activation='relu'),
    Dense(1)
])
cnn.compile(optimizer='adam', loss='mse')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
history = cnn.fit(
    X_train_nn, y_train_nn,
    validation_data=(X_test_nn, y_test_nn),
    epochs=20, batch_size=32,
    callbacks=[es], verbose=2
)

Epoch 1/20
94/94 - 6s - 63ms/step - loss: 1.5989 - val_loss: 0.7284
Epoch 2/20
94/94 - 2s - 17ms/step - loss: 0.6241 - val_loss: 0.5638
Epoch 3/20
94/94 - 2s - 17ms/step - loss: 0.5336 - val_loss: 0.5100
Epoch 4/20
94/94 - 2s - 20ms/step - loss: 0.5062 - val_loss: 0.4977
Epoch 5/20
94/94 - 1s - 11ms/step - loss: 0.4870 - val_loss: 0.4732
Epoch 6/20
94/94 - 1s - 9ms/step - loss: 0.4777 - val_loss: 0.4557
Epoch 7/20
94/94 - 0s - 5ms/step - loss: 0.4678 - val_loss: 0.4595
Epoch 8/20
94/94 - 1s - 6ms/step - loss: 0.4544 - val_loss: 0.4358
Epoch 9/20
94/94 - 0s - 3ms/step - loss: 0.4518 - val_loss: 0.4378
Epoch 10/20
94/94 - 1s - 7ms/step - loss: 0.4456 - val_loss: 0.4282
Epoch 11/20
94/94 - 0s - 3ms/step - loss: 0.4387 - val_loss: 0.4188
Epoch 12/20
94/94 - 0s - 4ms/step - loss: 0.4314 - val_loss: 0.4268
Epoch 13/20
94/94 - 0s - 3ms/step - loss: 0.4227 - val_loss: 0.3966
Epoch 14/20
94/94 - 1s - 7ms/step - loss: 0.4123 - val_loss: 0.3953
Epoch 15/20
94/94 - 0s - 3ms/step - loss: 0.4180 - v

In [23]:
y_pred_cnn = cnn.predict(X_test_nn).ravel()
print('1D CNN RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_cnn)))
print('1D CNN R2:', r2_score(y_test, y_pred_cnn))

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
1D CNN RMSE: 0.6094501400128622
1D CNN R2: 0.8366132974624634


In [24]:
# 9. Model 6: LSTM
lstm = Sequential([
    LSTM(64, input_shape=(n_timesteps, n_features)),
    Dropout(0.3),
    Dense(50, activation='relu'),
    Dense(1)
])
lstm.compile(optimizer='adam', loss='mse')

  super().__init__(**kwargs)


In [25]:
lstm.fit(X_train_nn, y_train, validation_data=(X_test_nn, y_test), epochs=20, batch_size=32, callbacks=[es], verbose=2)

Epoch 1/20
94/94 - 6s - 61ms/step - loss: 1.3046 - val_loss: 0.7364
Epoch 2/20
94/94 - 2s - 21ms/step - loss: 0.6493 - val_loss: 0.4932
Epoch 3/20
94/94 - 2s - 25ms/step - loss: 0.4716 - val_loss: 0.3351
Epoch 4/20
94/94 - 3s - 27ms/step - loss: 0.3512 - val_loss: 0.4332
Epoch 5/20
94/94 - 2s - 17ms/step - loss: 0.3045 - val_loss: 0.1946
Epoch 6/20
94/94 - 2s - 23ms/step - loss: 0.2666 - val_loss: 0.2098
Epoch 7/20
94/94 - 2s - 19ms/step - loss: 0.2511 - val_loss: 0.1986
Epoch 8/20
94/94 - 1s - 16ms/step - loss: 0.2537 - val_loss: 0.1973
Epoch 9/20
94/94 - 3s - 27ms/step - loss: 0.2388 - val_loss: 0.2506
Epoch 10/20
94/94 - 2s - 17ms/step - loss: 0.2497 - val_loss: 0.1654
Epoch 11/20
94/94 - 2s - 26ms/step - loss: 0.2229 - val_loss: 0.2130
Epoch 12/20
94/94 - 2s - 18ms/step - loss: 0.2170 - val_loss: 0.1901
Epoch 13/20
94/94 - 3s - 27ms/step - loss: 0.2467 - val_loss: 0.2805
Epoch 14/20
94/94 - 2s - 24ms/step - loss: 0.2737 - val_loss: 0.2125
Epoch 15/20
94/94 - 3s - 27ms/step - loss: 

<keras.src.callbacks.history.History at 0x7a87845647d0>

In [28]:
y_pred_lstm = lstm.predict(X_test_nn).ravel()
print('LSTM RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_lstm)))
print('LSTM R2:', r2_score(y_test, y_pred_lstm))

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
LSTM RMSE: 0.4067199080073739
LSTM R2: 0.9272335171699524


In [30]:
# Create a dictionary to store the evaluation metrics
results = {
    'Model': ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', '1D CNN', 'LSTM'],
    'RMSE': [
        np.sqrt(mean_squared_error(y_test, y_pred_rf)),
        np.sqrt(mean_squared_error(y_test, y_pred_xgb)),
        np.sqrt(mean_squared_error(y_test, y_pred_lgbm)),
        np.sqrt(mean_squared_error(y_test, y_pred_cat)),
        np.sqrt(mean_squared_error(y_test, y_pred_cnn)),
        np.sqrt(mean_squared_error(y_test, y_pred_lstm))
    ],
    'R2': [
        r2_score(y_test, y_pred_rf),
        r2_score(y_test, y_pred_xgb),
        r2_score(y_test, y_pred_lgbm),
        r2_score(y_test, y_pred_cat),
        r2_score(y_test, y_pred_cnn),
        r2_score(y_test, y_pred_lstm)
    ]
}

# Create a pandas DataFrame from the dictionary
results_df = pd.DataFrame(results)

# Display the DataFrame as a comparison table
results_df


Unnamed: 0,Model,RMSE,R2
0,Random Forest,0.280563,0.965374
1,XGBoost,0.278759,0.965818
2,LightGBM,0.298089,0.960913
3,CatBoost,0.302186,0.959831
4,1D CNN,0.60945,0.836613
5,LSTM,0.40672,0.927234


In [31]:
import joblib

# Save the trained models
joblib.dump(rf, 'random_forest_model.pkl')
joblib.dump(xgb, 'xgboost_model.pkl')
joblib.dump(lgbm, 'lightgbm_model.pkl')
joblib.dump(cat, 'catboost_model.pkl')
cnn.save('cnn_model.h5')
lstm.save('lstm_model.h5')

# Save the scaler
joblib.dump(df_scaler, 'scaler.pkl')

# Download the models and scaler
from google.colab import files
files.download('random_forest_model.pkl')
files.download('xgboost_model.pkl')
files.download('lightgbm_model.pkl')
files.download('catboost_model.pkl')
files.download('cnn_model.h5')
files.download('lstm_model.h5')
files.download('scaler.pkl')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>