In [1]:
import pandas as pd
import numpy as np


In [2]:
# 1. Read the raw table and, in the same expression, discard the two columns
#    that are not used as features (the order ID and the raw order date).
df = pd.read_csv('types.csv').drop(columns=['ID заказа', 'Дата заказа'])

In [3]:
# 2. Keep only the rows whose target column is populated
has_target = df['Приоритет доставки'].notna()
df = df.loc[has_target]

In [4]:
# 3. Split the frame into the label series (y) and the feature matrix (X).
#    Work on a copy so `df` itself keeps the target column.
target = 'Приоритет доставки'
X = df.copy()
y = X.pop(target)

In [5]:
# 4. Missing-value treatment
# NOTE(review): the medians and modes below are computed on the full feature
# matrix, i.e. before the train/test split performed later in this notebook,
# so test rows contribute to the imputation statistics. Consider fitting the
# imputation on the training split only.

# Numeric features: replace NaN with the per-column median.
num_cols = X.select_dtypes(include=['int64','float64']).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# Categorical (object) features: replace NaN with the most frequent value.
cat_cols = X.select_dtypes(include=['object']).columns
for c in cat_cols:
    modes = X[c].mode()
    # mode() is empty when the column has no non-null value at all; indexing
    # its first element would raise, so such a column is left untouched.
    if not modes.empty:
        X[c] = X[c].fillna(modes.iloc[0])

In [6]:
# 5. Expand each categorical column into indicator columns; the first level of
#    every column is dropped (drop_first=True).
X = pd.get_dummies(
    X,
    drop_first=True,
    columns=cat_cols,
)

In [7]:
# 6. Hold out 20% of the rows for testing. The split is stratified on y and
#    uses a fixed seed so it is repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [8]:
# Drop features whose variance in the training split does not exceed the
# threshold (0.0), then apply the same column selection to the test split.
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=0.0)
vt.fit(X_train)

# Resolve the surviving column names once, before X_train is rebound, so the
# train and test frames are guaranteed to carry the same labels.
kept_cols = X_train.columns[vt.get_support()]

# The transformer returns a bare array. Carry the original row index over so
# the frames stay index-aligned with y_train / y_test instead of receiving a
# fresh 0..n-1 index.
X_train = pd.DataFrame(vt.transform(X_train), columns=kept_cols, index=X_train.index)
X_test  = pd.DataFrame(vt.transform(X_test), columns=kept_cols, index=X_test.index)

In [9]:
# 7. Standardise the original numeric columns only; every other column is
#    copied through unchanged.
from sklearn.preprocessing import StandardScaler

# Numeric columns that are still present in the training frame
cont_cols = [col for col in num_cols if col in X_train.columns]

# Learn the scaling statistics from the training split, then apply them to
# both splits.
scaler = StandardScaler()
scaler.fit(X_train[cont_cols])

X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[cont_cols] = scaler.transform(X_train[cont_cols])
X_test_scaled[cont_cols]  = scaler.transform(X_test[cont_cols])

In [10]:
# Replace +/-infinity and any remaining NaN with 0 in both scaled frames
X_train_scaled = X_train_scaled.replace([np.inf, -np.inf], 0).fillna(0)
X_test_scaled  = X_test_scaled.replace([np.inf, -np.inf], 0).fillna(0)

In [11]:
# 8. Turn each row into a float32 "sequence" of shape (n_features, 1) for the
#    CNN/LSTM inputs
n_features  = len(X_train_scaled.columns)
X_train_seq = X_train_scaled.to_numpy().astype(np.float32).reshape(-1, n_features, 1)
X_test_seq  = X_test_scaled.to_numpy().astype(np.float32).reshape(-1, n_features, 1)


In [12]:
# 9. Encode y once
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the label -> integer mapping on the training labels, then reuse it for
# the test labels.
le          = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

# Pin the one-hot width to the number of classes known to the encoder.
# Without num_classes, to_categorical infers the width from the largest id
# present in each array, so the train and test matrices could end up with
# different widths if the highest class were missing from one of them.
n_classes   = len(le.classes_)
y_train_cat = to_categorical(y_train_enc, num_classes=n_classes)
y_test_cat  = to_categorical(y_test_enc, num_classes=n_classes)

In [13]:
# 10. Utility for evaluation
from sklearn.metrics import classification_report, accuracy_score
def eval_model(name, y_true, y_pred):
    """Print the accuracy and a per-class classification report for one model.

    Args:
        name: Text shown in the ``--- name ---`` header line.
        y_true: True class labels.
        y_pred: Predicted class labels, in the same label space as ``y_true``.

    Returns:
        None. The results are printed.

    Note:
        Reads the notebook-level label encoder ``le`` for the list of classes,
        so ``le`` must be fitted before this function is called.
    """
    print(f'\n--- {name} ---')
    print('Accuracy:', accuracy_score(y_true, y_pred))
    # Pass the class list explicitly as `labels`. When only `target_names` is
    # given, the report infers the labels from the data and raises if a class
    # is missing from both y_true and y_pred (name count != label count).
    print(classification_report(
        y_true, y_pred, labels=le.classes_, target_names=le.classes_
    ))

In [14]:
# 11. Model 1: Random Forest with 100 trees and a fixed seed, fitted on the
#     unscaled training features
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf.fit(X=X_train, y=y_train)


In [15]:
# Predict on the held-out split and print the Random Forest report
y_pred_rf = rf.predict(X=X_test)
eval_model(name='Random Forest', y_true=y_test, y_pred=y_pred_rf)



--- Random Forest ---
Accuracy: 0.9985265225933202
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       1.00      1.00      1.00      1214

    accuracy                           1.00      6108
   macro avg       1.00      1.00      1.00      6108
weighted avg       1.00      1.00      1.00      6108



In [16]:
# 12. Model 2: XGBoost
!pip install xgboost --quiet
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
le = LabelEncoder()

# Fit the encoder to your training target variable
le.fit(y_train)

# Transform both training and testing target variables
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_encoded)



Parameters: { "use_label_encoder" } are not used.



In [17]:
# Predict integer class ids, map them back to the original labels, then report
y_pred_xgb = xgb.predict(X_test)
y_pred_xgb_original = le.inverse_transform(y_pred_xgb)
eval_model(name='XGBoost', y_true=y_test, y_pred=y_pred_xgb_original)


--- XGBoost ---
Accuracy: 0.9993451211525868
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       1.00      1.00      1.00      1214

    accuracy                           1.00      6108
   macro avg       1.00      1.00      1.00      6108
weighted avg       1.00      1.00      1.00      6108



In [18]:
# 13. Model 3: LightGBM
!pip install lightgbm --quiet
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1034
[LightGBM] [Info] Number of data points in the train set: 24432, number of used features: 11
[LightGBM] [Info] Start training from score -0.589876
[LightGBM] [Info] Start training from score -1.399147
[LightGBM] [Info] Start training from score -1.615473


In [19]:
# Predict on the held-out split and print the LightGBM report
y_pred_lgbm = lgbm.predict(X=X_test)
eval_model(name='LightGBM', y_true=y_test, y_pred=y_pred_lgbm)


--- LightGBM ---
Accuracy: 0.9993451211525868
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       1.00      1.00      1.00      1214

    accuracy                           1.00      6108
   macro avg       1.00      1.00      1.00      6108
weighted avg       1.00      1.00      1.00      6108



In [20]:
# 14. Model 4: MLP (Dense NN)
from sklearn.neural_network import MLPClassifier

# Two hidden layers (128 and 64 units), at most 200 iterations, fixed seed
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=200, random_state=42)

# Columns to impute. Kept under its own name: rebinding `num_cols` here would
# change which columns the scaling cell selects if that cell is re-run.
mlp_num_cols = X_train_scaled.select_dtypes(include=['int64', 'float64']).columns

# Convert all columns to float before filling NaNs
X_train_scaled = X_train_scaled.astype(float)
X_test_scaled = X_test_scaled.astype(float)

# Fill any remaining NaN with the training-split medians. The medians are
# computed once and applied to both splits, so the test split never
# contributes to the statistics.
train_medians = X_train_scaled[mlp_num_cols].median()
X_train_scaled[mlp_num_cols] = X_train_scaled[mlp_num_cols].fillna(train_medians)
X_test_scaled[mlp_num_cols] = X_test_scaled[mlp_num_cols].fillna(train_medians)


mlp.fit(X_train_scaled, y_train)

In [21]:
# Predict on the scaled held-out split and print the MLP report
y_pred_mlp = mlp.predict(X=X_test_scaled)
eval_model(name='MLPClassifier', y_true=y_test, y_pred=y_pred_mlp)


--- MLPClassifier ---
Accuracy: 0.9981990831696136
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       1.00      0.99      1.00      1214

    accuracy                           1.00      6108
   macro avg       1.00      1.00      1.00      6108
weighted avg       1.00      1.00      1.00      6108



In [22]:
# 15. Model 5: 1D-CNN
import tensorflow as tf
from tensorflow.keras.models   import Sequential
from tensorflow.keras.layers   import (
    Conv1D, MaxPooling1D, Flatten,
    Dense, Dropout, BatchNormalization
)

# Two Conv1D -> BatchNorm -> MaxPool stages over the (n_features, 1) input,
# followed by a dense head with a softmax over the classes.
cnn = Sequential([
    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape=` to the first layer makes Keras emit a warning that asks
    # for exactly this form.
    tf.keras.Input(shape=(n_features, 1)),
    Conv1D(64, 3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Conv1D(128,3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(n_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Adam optimizer, categorical cross-entropy loss (labels are one-hot), and
# accuracy as the tracked metric
cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Train for 20 epochs in mini-batches of 32, printing one line per epoch.
# The test arrays are passed as validation data, so the reported val_*
# metrics are measured on the test split.
cnn.fit(
    x=X_train_seq,
    y=y_train_cat,
    batch_size=32,
    epochs=20,
    verbose=2,
    validation_data=(X_test_seq, y_test_cat),
)

Epoch 1/20
764/764 - 11s - 15ms/step - accuracy: 0.9874 - loss: 0.0372 - val_accuracy: 0.9931 - val_loss: 0.0193
Epoch 2/20
764/764 - 10s - 13ms/step - accuracy: 0.9935 - loss: 0.0204 - val_accuracy: 0.9954 - val_loss: 0.0165
Epoch 3/20
764/764 - 10s - 13ms/step - accuracy: 0.9938 - loss: 0.0196 - val_accuracy: 0.9951 - val_loss: 0.0154
Epoch 4/20
764/764 - 8s - 10ms/step - accuracy: 0.9947 - loss: 0.0153 - val_accuracy: 0.9954 - val_loss: 0.0167
Epoch 5/20
764/764 - 6s - 8ms/step - accuracy: 0.9950 - loss: 0.0150 - val_accuracy: 0.9964 - val_loss: 0.0127
Epoch 6/20
764/764 - 5s - 6ms/step - accuracy: 0.9955 - loss: 0.0145 - val_accuracy: 0.9972 - val_loss: 0.0106
Epoch 7/20
764/764 - 5s - 6ms/step - accuracy: 0.9955 - loss: 0.0144 - val_accuracy: 0.9962 - val_loss: 0.0154
Epoch 8/20
764/764 - 5s - 6ms/step - accuracy: 0.9956 - loss: 0.0141 - val_accuracy: 0.9959 - val_loss: 0.0149
Epoch 9/20
764/764 - 5s - 7ms/step - accuracy: 0.9965 - loss: 0.0108 - val_accuracy: 0.9959 - val_loss: 0

<keras.src.callbacks.history.History at 0x7e30e05b40d0>

In [25]:
# Take the most probable class per row, map the ids back to the original
# labels, then print the report
cnn_class_ids = cnn.predict(X_test_seq).argmax(axis=1)
y_pred_cnn = le.inverse_transform(cnn_class_ids)
eval_model(name='1D-CNN', y_true=y_test, y_pred=y_pred_cnn)

[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

--- 1D-CNN ---
Accuracy: 0.9959070072036673
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       0.99      0.99      0.99      1214

    accuracy                           1.00      6108
   macro avg       0.99      1.00      0.99      6108
weighted avg       1.00      1.00      1.00      6108



In [26]:
# 16. Model 6: LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense

# build the model
lstm_model = Sequential([
    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape=` to the first layer makes Keras emit a warning that asks
    # for exactly this form.
    Input(shape=(n_features, 1)),
    # LSTM layer over the feature "sequence"
    LSTM(64),
    Dropout(0.2),               # lighter dropout to avoid nan loss
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(n_classes, activation='softmax')
])

  super().__init__(**kwargs)


In [27]:
# Configure training: Adam optimizer, categorical cross-entropy loss on the
# one-hot labels, accuracy as the tracked metric
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit on the sequence arrays and one-hot labels for 30 epochs in mini-batches
# of 32. The test arrays are passed as validation data, so the reported val_*
# metrics are measured on the test split.
lstm_model.fit(
    x=X_train_seq, y=y_train_cat,
    batch_size=32, epochs=30, verbose=2,
    validation_data=(X_test_seq, y_test_cat),
)

Epoch 1/30
764/764 - 9s - 12ms/step - accuracy: 0.9662 - loss: 0.0984 - val_accuracy: 0.9874 - val_loss: 0.0379
Epoch 2/30
764/764 - 7s - 9ms/step - accuracy: 0.9882 - loss: 0.0370 - val_accuracy: 0.9913 - val_loss: 0.0271
Epoch 3/30
764/764 - 6s - 8ms/step - accuracy: 0.9928 - loss: 0.0231 - val_accuracy: 0.9898 - val_loss: 0.0339
Epoch 4/30
764/764 - 10s - 13ms/step - accuracy: 0.9933 - loss: 0.0217 - val_accuracy: 0.9943 - val_loss: 0.0184
Epoch 5/30
764/764 - 8s - 10ms/step - accuracy: 0.9934 - loss: 0.0189 - val_accuracy: 0.9948 - val_loss: 0.0175
Epoch 6/30
764/764 - 9s - 11ms/step - accuracy: 0.9943 - loss: 0.0171 - val_accuracy: 0.9946 - val_loss: 0.0179
Epoch 7/30
764/764 - 7s - 9ms/step - accuracy: 0.9947 - loss: 0.0161 - val_accuracy: 0.9961 - val_loss: 0.0177
Epoch 8/30
764/764 - 10s - 13ms/step - accuracy: 0.9944 - loss: 0.0166 - val_accuracy: 0.9951 - val_loss: 0.0197
Epoch 9/30
764/764 - 6s - 8ms/step - accuracy: 0.9952 - loss: 0.0142 - val_accuracy: 0.9956 - val_loss: 0

<keras.src.callbacks.history.History at 0x7e30dedf8d90>

In [28]:
# Take the most probable class per row, map the ids back to the original
# labels, then print the report
lstm_class_ids = lstm_model.predict(X_test_seq).argmax(axis=1)
y_pred_lstm = le.inverse_transform(lstm_class_ids)
eval_model(name='LSTM', y_true=y_test, y_pred=y_pred_lstm)

[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- LSTM ---
Accuracy: 0.9968893254747871
              precision    recall  f1-score   support

     Высокий       1.00      1.00      1.00      3386
      Низкий       1.00      1.00      1.00      1508
     Средний       0.99      0.99      0.99      1214

    accuracy                           1.00      6108
   macro avg       1.00      1.00      1.00      6108
weighted avg       1.00      1.00      1.00      6108



In [30]:
# Collect the test accuracy of every model into one comparison table.
# `models` and `predictions` are parallel lists: entry i of one belongs to
# entry i of the other.
models = ['Random Forest', 'XGBoost', 'LightGBM', 'MLPClassifier', '1D-CNN', 'LSTM']
predictions = [y_pred_rf, y_pred_xgb_original, y_pred_lgbm, y_pred_mlp, y_pred_cnn, y_pred_lstm]

results = [
    {'Model': model_name, 'Accuracy': accuracy_score(y_test, model_preds)}
    for model_name, model_preds in zip(models, predictions)
]

accuracy_table = pd.DataFrame(results)
accuracy_table


Unnamed: 0,Model,Accuracy
0,Random Forest,0.998527
1,XGBoost,0.999345
2,LightGBM,0.999345
3,MLPClassifier,0.998199
4,1D-CNN,0.995907
5,LSTM,0.996889


In [31]:
import joblib

# NOTE(review): only the fitted models are persisted here. The preprocessing
# objects used above (vt, scaler, le) are not saved, so the files alone are
# presumably not enough to score new raw data -- confirm whether that is
# intended.

# Save the scikit-learn-style estimators with joblib (file name -> estimator)
joblib_models = {
    'random_forest_model.pkl': rf,
    'xgboost_model.pkl': xgb,
    'lightgbm_model.pkl': lgbm,
    'mlp_model.pkl': mlp,
}
for path, estimator in joblib_models.items():
    joblib.dump(estimator, path)

# Save Keras models differently (through their own save method)
keras_models = {
    'cnn_model.h5': cnn,
    'lstm_model.h5': lstm_model,
}
for path, net in keras_models.items():
    net.save(path)

# Download the files using the files utility in Google Colab. The import is
# guarded so that, outside Colab, the cell still finishes after saving
# instead of failing on the missing module.
try:
    from google.colab import files
except ImportError:
    files = None
    print('google.colab is not available; models were saved locally only.')

if files is not None:
    for path in [*joblib_models, *keras_models]:
        files.download(path)




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>