In [1]:
!pip install xgboost lightgbm catboost --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, Flatten, LSTM
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
# Load data
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv('transport.csv')

In [3]:
# Drop identifiers
# Reassign instead of mutating in place, and tolerate already-removed columns,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=['ID ТС', 'Дата записи'], errors='ignore')

In [4]:
# Split target and features
# y is the label column; X is every other column that remains in df.
y = df['Поломка в ближайший месяц']
X = df.drop(columns=['Поломка в ближайший месяц'])

In [5]:
# Handle missing values
# Impute numeric columns with their per-column median. numeric_only=True keeps
# DataFrame.median() from raising TypeError when X still contains text columns
# (pandas >= 2.0 no longer drops them silently); NaNs in non-numeric columns
# are left untouched.
X = X.fillna(X.median(numeric_only=True))

In [6]:
# Feature engineering, expressed as one chained assign().
# Entries are evaluated top to bottom, so the ratio features below already see
# the zero-guarded speed/load columns produced by the first two entries.
X = X.assign(**{
    # Replace exact zeros with 0.1 so they can safely be used as divisors
    'Средняя скорость (км/ч)': lambda d: d['Средняя скорость (км/ч)'].replace(0, 0.1),
    'Загрузка (%)': lambda d: d['Загрузка (%)'].replace(0, 0.1),

    # 1. Mileage per year (mileage / age; +0.1 avoids division by zero)
    'Пробег на год': lambda d: d['Пробег (км)'] / (d['Возраст ТС (лет)'] + 0.1),

    # 2. Engine load relative to vehicle load
    'Нагрузка двигателя на загрузку': lambda d: d['Средняя нагрузка двигателя (%)'] / d['Загрузка (%)'],

    # 3. Maintenance frequency (service count / age)
    'ТО на год': lambda d: d['Количество ТО за год'] / (d['Возраст ТС (лет)'] + 0.1),

    # 4. Speed per unit of load
    'Скорость на загрузку': lambda d: d['Средняя скорость (км/ч)'] / d['Загрузка (%)'],

    # 5. High mileage indicator
    'Пробег > 600k': lambda d: (d['Пробег (км)'] > 600_000).astype(int),

    # 6. Old vehicle indicator
    'ТС старше 15 лет': lambda d: (d['Возраст ТС (лет)'] > 15).astype(int),

    # 7. High load indicator
    'Загрузка > 90%': lambda d: (d['Загрузка (%)'] > 90).astype(int),
})

In [7]:

# One-hot encode categoricals
# Select object/category columns, then expand them into dummy columns;
# drop_first=True omits one level per column.
cat_cols = X.select_dtypes(include=['object', 'category']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [8]:
# Train-test split
# Hold out 20% of the rows, stratified on the target, with a fixed
# random_state so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
# Scale for neural nets
# The scaler is fitted on the training split only; the test split is
# transformed with the statistics learned from the training split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [10]:
# Evaluation helper
def evaluate_model(name, y_true, y_proba, threshold=0.5):
    """Print ROC AUC, accuracy and Brier score for one binary classifier.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Predicted scores for the positive class. Lists, Series and column
        vectors of shape (n, 1) are accepted and flattened to 1-D.
    threshold : float, default 0.5
        Scores greater than or equal to this value count as class 1 when
        accuracy is computed.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Coerce first: a plain list supports neither `>=` broadcasting nor
    # `.astype`, and an (n, 1) Keras output must be flattened for the metrics.
    y_proba = np.asarray(y_proba).ravel()
    y_pred = (y_proba >= threshold).astype(int)
    print(f'=== {name} ===')
    print('ROC AUC:', roc_auc_score(y_true, y_proba))
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Brier score:', brier_score_loss(y_true, y_proba))
    print()


In [11]:
# Random Forest
# 200 trees with a fixed seed, trained on the unscaled training features.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)


In [12]:
# Use column 1 of predict_proba as the score passed to the metrics.
rf_proba = rf.predict_proba(X_test)[:, 1]
evaluate_model('Random Forest', y_test, rf_proba)

=== Random Forest ===
ROC AUC: 0.9079625237748609
Accuracy: 0.8270874424720579
Brier score: 0.1015431295200526



In [13]:
# XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only emits a "Parameters: { "use_label_encoder" } are not used." warning.
xgb = XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)



Parameters: { "use_label_encoder" } are not used.



In [14]:
# Use column 1 of predict_proba as the score passed to the metrics.
xgb_proba = xgb.predict_proba(X_test)[:, 1]
evaluate_model('XGBoost', y_test, xgb_proba)

=== XGBoost ===
ROC AUC: 0.9003475238922677
Accuracy: 0.814595660749507
Brier score: 0.12921999116016797



In [15]:
# LightGBM
# 200 boosting rounds with a fixed seed, trained on the unscaled training features.
lgbm = LGBMClassifier(n_estimators=200, random_state=42)
lgbm.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 1482, number of negative: 4602
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 6084, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243590 -> initscore=-1.133098
[LightGBM] [Info] Start training from score -1.133098


In [16]:
# Use column 1 of predict_proba as the score passed to the metrics.
lgbm_proba = lgbm.predict_proba(X_test)[:, 1]
evaluate_model('LightGBM', y_test, lgbm_proba)

=== LightGBM ===
ROC AUC: 0.9046187803789889
Accuracy: 0.8165680473372781
Brier score: 0.11205498174965786



In [17]:
# CatBoost
# 200 iterations with a fixed seed; verbose=False silences per-iteration logging.
cat = CatBoostClassifier(iterations=200, eval_metric='AUC', verbose=False, random_state=42)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7c356ea8a0d0>

In [18]:
# Use column 1 of predict_proba as the score passed to the metrics.
cat_proba = cat.predict_proba(X_test)[:, 1]
evaluate_model('CatBoost', y_test, cat_proba)

=== CatBoost ===
ROC AUC: 0.9125272970624839
Accuracy: 0.8284023668639053
Brier score: 0.10097848202791122



In [19]:
# MLP Neural Net
# Seed the RNGs so weight initialisation, dropout masks and batch shuffling are
# repeatable when the notebook is re-run.
tf.keras.utils.set_random_seed(42)
mlp = Sequential([
    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first layer makes recent Keras versions emit a warning.
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
mlp.compile(loss='binary_crossentropy', optimizer='adam')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Train for at most 50 epochs with 10% of the training data used for validation;
# stop after 5 epochs without improvement and restore the best weights seen.
mlp.fit(X_train_scaled, y_train, validation_split=0.1,
        epochs=50, batch_size=32,
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=2)

Epoch 1/50
172/172 - 6s - 36ms/step - loss: 0.4433 - val_loss: 0.3798
Epoch 2/50
172/172 - 2s - 9ms/step - loss: 0.3648 - val_loss: 0.3538
Epoch 3/50
172/172 - 1s - 8ms/step - loss: 0.3513 - val_loss: 0.3446
Epoch 4/50
172/172 - 1s - 7ms/step - loss: 0.3360 - val_loss: 0.3344
Epoch 5/50
172/172 - 1s - 5ms/step - loss: 0.3356 - val_loss: 0.3341
Epoch 6/50
172/172 - 1s - 7ms/step - loss: 0.3323 - val_loss: 0.3321
Epoch 7/50
172/172 - 1s - 5ms/step - loss: 0.3270 - val_loss: 0.3255
Epoch 8/50
172/172 - 1s - 4ms/step - loss: 0.3261 - val_loss: 0.3270
Epoch 9/50
172/172 - 1s - 4ms/step - loss: 0.3212 - val_loss: 0.3263
Epoch 10/50
172/172 - 1s - 4ms/step - loss: 0.3198 - val_loss: 0.3237
Epoch 11/50
172/172 - 1s - 4ms/step - loss: 0.3206 - val_loss: 0.3216
Epoch 12/50
172/172 - 1s - 8ms/step - loss: 0.3169 - val_loss: 0.3232
Epoch 13/50
172/172 - 1s - 6ms/step - loss: 0.3127 - val_loss: 0.3201
Epoch 14/50
172/172 - 1s - 3ms/step - loss: 0.3135 - val_loss: 0.3198
Epoch 15/50
172/172 - 0s - 3

<keras.src.callbacks.history.History at 0x7c365061b6d0>

In [21]:
# ravel() flattens the model output to 1-D before it is scored.
mlp_proba = mlp.predict(X_test_scaled).ravel()
evaluate_model('MLP Neural Net', y_test, mlp_proba)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
=== MLP Neural Net ===
ROC AUC: 0.9036677859440676
Accuracy: 0.8198553583168968
Brier score: 0.10532620140069328



In [22]:
# 1D CNN (features as “sequence”)
# Add a trailing channel axis: (samples, n_features) -> (samples, n_features, 1)
X_train_cnn = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
X_test_cnn  = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

# Seed the RNGs so this model's initialisation and training are repeatable
# regardless of which cells ran before it.
tf.keras.utils.set_random_seed(42)
cnn = Sequential([
    # Explicit Input object instead of `input_shape=` on the first layer,
    # which makes recent Keras versions emit a warning.
    tf.keras.Input(shape=(X_train_cnn.shape[1], 1)),
    Conv1D(64, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Compile with binary cross-entropy and Adam, then train for at most 50 epochs;
# early stopping waits 5 epochs without improvement and restores the best weights.
cnn.compile(loss='binary_crossentropy', optimizer='adam')
cnn.fit(X_train_cnn, y_train, validation_split=0.1,
        epochs=50, batch_size=32,
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=2)

Epoch 1/50
172/172 - 6s - 32ms/step - loss: 0.4080 - val_loss: 0.3723
Epoch 2/50
172/172 - 2s - 9ms/step - loss: 0.3527 - val_loss: 0.3559
Epoch 3/50
172/172 - 1s - 8ms/step - loss: 0.3398 - val_loss: 0.3523
Epoch 4/50
172/172 - 2s - 14ms/step - loss: 0.3323 - val_loss: 0.3423
Epoch 5/50
172/172 - 1s - 8ms/step - loss: 0.3314 - val_loss: 0.3401
Epoch 6/50
172/172 - 1s - 6ms/step - loss: 0.3262 - val_loss: 0.3310
Epoch 7/50
172/172 - 1s - 7ms/step - loss: 0.3214 - val_loss: 0.3309
Epoch 8/50
172/172 - 1s - 5ms/step - loss: 0.3200 - val_loss: 0.3462
Epoch 9/50
172/172 - 1s - 5ms/step - loss: 0.3155 - val_loss: 0.3324
Epoch 10/50
172/172 - 1s - 8ms/step - loss: 0.3178 - val_loss: 0.3319
Epoch 11/50
172/172 - 1s - 5ms/step - loss: 0.3121 - val_loss: 0.3318
Epoch 12/50
172/172 - 1s - 4ms/step - loss: 0.3119 - val_loss: 0.3273
Epoch 13/50
172/172 - 1s - 3ms/step - loss: 0.3072 - val_loss: 0.3169
Epoch 14/50
172/172 - 1s - 4ms/step - loss: 0.3058 - val_loss: 0.3301
Epoch 15/50
172/172 - 1s - 

<keras.src.callbacks.history.History at 0x7c356c112d10>

In [24]:
# ravel() flattens the model output to 1-D before it is scored.
cnn_proba = cnn.predict(X_test_cnn).ravel()
evaluate_model('1D CNN', y_test, cnn_proba)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
=== 1D CNN ===
ROC AUC: 0.9029022941273157
Accuracy: 0.8198553583168968
Brier score: 0.1062427028756396



In [25]:
# LSTM
# Reuses the (samples, n_features, 1) tensors prepared for the CNN.
# Seed the RNGs so this model's initialisation and training are repeatable
# regardless of which cells ran before it.
tf.keras.utils.set_random_seed(42)
lstm = Sequential([
    # Explicit Input object instead of `input_shape=` on the first layer,
    # which makes recent Keras versions emit a warning.
    tf.keras.Input(shape=(X_train_cnn.shape[1], 1)),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
lstm.compile(loss='binary_crossentropy', optimizer='adam')

  super().__init__(**kwargs)


In [26]:
# Train for at most 50 epochs with 10% of the training data used for validation;
# stop after 5 epochs without improvement and restore the best weights seen.
lstm.fit(X_train_cnn, y_train, validation_split=0.1,
         epochs=50, batch_size=32,
         callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
         verbose=2)

Epoch 1/50
172/172 - 4s - 22ms/step - loss: 0.5076 - val_loss: 0.4701
Epoch 2/50
172/172 - 3s - 17ms/step - loss: 0.4197 - val_loss: 0.4057
Epoch 3/50
172/172 - 2s - 12ms/step - loss: 0.3877 - val_loss: 0.4352
Epoch 4/50
172/172 - 2s - 9ms/step - loss: 0.3769 - val_loss: 0.3888
Epoch 5/50
172/172 - 2s - 9ms/step - loss: 0.3747 - val_loss: 0.3912
Epoch 6/50
172/172 - 2s - 14ms/step - loss: 0.3651 - val_loss: 0.3932
Epoch 7/50
172/172 - 1s - 9ms/step - loss: 0.3593 - val_loss: 0.3913
Epoch 8/50
172/172 - 2s - 9ms/step - loss: 0.3549 - val_loss: 0.3765
Epoch 9/50
172/172 - 3s - 16ms/step - loss: 0.3500 - val_loss: 0.3635
Epoch 10/50
172/172 - 2s - 14ms/step - loss: 0.3520 - val_loss: 0.3876
Epoch 11/50
172/172 - 2s - 14ms/step - loss: 0.3472 - val_loss: 0.3634
Epoch 12/50
172/172 - 3s - 15ms/step - loss: 0.3410 - val_loss: 0.3601
Epoch 13/50
172/172 - 4s - 25ms/step - loss: 0.3389 - val_loss: 0.3505
Epoch 14/50
172/172 - 2s - 12ms/step - loss: 0.3395 - val_loss: 0.3627
Epoch 15/50
172/172

<keras.src.callbacks.history.History at 0x7c3564e63990>

In [27]:
# ravel() flattens the model output to 1-D before it is scored.
lstm_proba = lstm.predict(X_test_cnn).ravel()
evaluate_model('LSTM', y_test, lstm_proba)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
=== LSTM ===
ROC AUC: 0.8988846361565737
Accuracy: 0.8080210387902695
Brier score: 0.11064143791027999



In [28]:
# Summary table: one row of held-out metrics per model.
# Every column starts empty and is filled by update_results(), so a model label
# can never get out of step with its scores and all columns stay equal in length.
results = {
    'Model': [],
    'ROC AUC': [],
    'Accuracy': [],
    'Brier Score': []
}

def update_results(name, y_true, y_proba):
    """Append one model's metrics to the module-level ``results`` dict.

    Parameters
    ----------
    name : str
        Model label stored in the 'Model' column.
    y_true : array-like
        Ground-truth binary labels.
    y_proba : numpy.ndarray
        Predicted positive-class scores; thresholded at 0.5 for accuracy.
    """
    y_pred = (y_proba >= 0.5).astype(int)
    results['Model'].append(name)
    results['ROC AUC'].append(roc_auc_score(y_true, y_proba))
    results['Accuracy'].append(accuracy_score(y_true, y_pred))
    results['Brier Score'].append(brier_score_loss(y_true, y_proba))

update_results('Random Forest', y_test, rf_proba)
update_results('XGBoost', y_test, xgb_proba)
update_results('LightGBM', y_test, lgbm_proba)
update_results('CatBoost', y_test, cat_proba)
update_results('MLP Neural Net', y_test, mlp_proba)
update_results('1D CNN', y_test, cnn_proba)
update_results('LSTM', y_test, lstm_proba)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,ROC AUC,Accuracy,Brier Score
0,Random Forest,0.907963,0.827087,0.101543
1,XGBoost,0.900348,0.814596,0.12922
2,LightGBM,0.904619,0.816568,0.112055
3,CatBoost,0.912527,0.828402,0.100978
4,MLP Neural Net,0.903668,0.819855,0.105326
5,1D CNN,0.902902,0.819855,0.106243
6,LSTM,0.898885,0.808021,0.110641


In [29]:
import joblib

# Persist the tree-based estimators with joblib
for estimator, pkl_path in (
    (rf, 'random_forest_model.pkl'),
    (xgb, 'xgboost_model.pkl'),
    (lgbm, 'lightgbm_model.pkl'),
    (cat, 'catboost_model.pkl'),
):
    joblib.dump(estimator, pkl_path)

# Persist the Keras networks to .h5 files
for network, h5_path in (
    (mlp, 'mlp_model.h5'),
    (cnn, 'cnn_model.h5'),
    (lstm, 'lstm_model.h5'),
):
    network.save(h5_path)

# Download every saved artifact through the Colab helper, in the order saved
from google.colab import files
for artifact_path in (
    'random_forest_model.pkl',
    'xgboost_model.pkl',
    'lightgbm_model.pkl',
    'catboost_model.pkl',
    'mlp_model.h5',
    'cnn_model.h5',
    'lstm_model.h5',
):
    files.download(artifact_path)




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# TRANSFORMER MODEL

# 1) Reload the raw data and strip the ID/date columns in one chained expression
df_tr = pd.read_csv('transport.csv').drop(columns=['ID ТС', 'Дата записи'])


In [31]:
# 2) Split target vs features
# y_tr is the label column; X_tr is every other column of df_tr.
y_tr = df_tr['Поломка в ближайший месяц']
X_tr = df_tr.drop(columns=['Поломка в ближайший месяц'])

In [32]:
# 3) Identify categorical & numeric columns
# Both are plain lists of column names. Note that `cat_cols` rebinds the name
# used earlier for the one-hot encoding step.
cat_cols = X_tr.select_dtypes(include=['object','category']).columns.tolist()
num_cols = X_tr.select_dtypes(include=['number']).columns.tolist()

In [33]:
# 4) Fill missing
# Categorical gaps become the literal token 'missing'; numeric gaps take the
# column median.
# NOTE(review): the median is computed over all rows of X_tr, i.e. before the
# train/test split performed later in this section.
for c in cat_cols:
    X_tr[c] = X_tr[c].fillna('missing')
for c in num_cols:
    X_tr[c] = X_tr[c].fillna(X_tr[c].median())

In [34]:
# 5) Label-encode categoricals
from sklearn.preprocessing import LabelEncoder
# Fit one encoder per categorical column and keep it in `encoders` ...
encoders = {col: LabelEncoder().fit(X_tr[col]) for col in cat_cols}
# ... then overwrite each column with its integer codes.
for col, fitted_encoder in encoders.items():
    X_tr[col] = fitted_encoder.transform(X_tr[col])


In [35]:
# 6) Scale numerics
from sklearn.preprocessing import StandardScaler
scaler_num = StandardScaler()
X_tr[num_cols] = scaler_num.fit_transform(X_tr[num_cols])

In [36]:
# 7) Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr)

In [52]:
# 8) Prepare Keras inputs dict
def make_inputs(df, cat_columns=None, num_columns=None):
    """Build the ``{input_name: values}`` dict fed to the multi-input Keras model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the encoded categorical and numeric feature columns.
    cat_columns : list of str, optional
        Categorical column names. Defaults to the notebook-level ``cat_cols``.
    num_columns : list of str, optional
        Numeric column names. Defaults to the notebook-level ``num_cols``.

    Returns
    -------
    dict
        Categorical entries first, keyed ``'cat_<column>'``, then numeric
        entries keyed ``'num_<column>'`` with every '/' replaced by '_'.
        Each value is the column's ``.values`` array.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # columns explicitly; this keeps existing `make_inputs(df)` calls working.
    if cat_columns is None:
        cat_columns = cat_cols
    if num_columns is None:
        num_columns = num_cols

    inputs = {f'cat_{c}': df[c].values for c in cat_columns}
    inputs.update({f'num_{c.replace("/", "_")}': df[c].values for c in num_columns})
    return inputs

# Build the named-input dicts for the training and test splits.
train_inputs = make_inputs(X_train)
test_inputs  = make_inputs(X_test)

In [53]:
# 9) Transformer encoder block
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Dropout, LayerNormalization,
    MultiHeadAttention, GlobalAveragePooling1D, Reshape, Lambda
)
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

def transformer_encoder(x, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm encoder block: self-attention, then feed-forward.

    Each sub-layer normalises its input first and adds its output back onto
    that input (residual connection).

    Parameters
    ----------
    x : tensor
        Input sequence; its last dimension sets the block's output width.
    head_size : int
        ``key_dim`` passed to MultiHeadAttention.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Width of the hidden feed-forward Dense layer.
    dropout : float, default 0
        Rate used inside attention and after each sub-layer.

    Returns
    -------
    tensor
        Result of the block, with the same last dimension as ``x``.
    """
    # --- self-attention sub-layer ---
    normed = LayerNormalization(epsilon=1e-6)(x)
    attended = MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(normed, normed)
    after_attention = Dropout(dropout)(attended) + x

    # --- position-wise feed-forward sub-layer ---
    hidden = LayerNormalization(epsilon=1e-6)(after_attention)
    hidden = Dense(ff_dim, activation='relu')(hidden)
    hidden = Dropout(dropout)(hidden)
    # Project back to the input width so the residual add is shape-compatible
    projected = Dense(x.shape[-1])(hidden)
    return after_attention + projected

In [54]:
# 10) Build TabTransformer model
# Every feature gets its own scalar Input and is mapped to an embed_dim-wide
# vector. Input names must match the keys produced by make_inputs().
embed_dim = 32
inputs, embeddings = [], []

# Categorical embeddings
for c in cat_cols:
    inp = Input(shape=(1,), name=f'cat_{c}') # Add a prefix 'cat_' to categorical input names
    # One more row than the number of distinct codes seen in X_tr
    # NOTE(review): presumably headroom for an unseen code; confirm intent.
    vocab_size = X_tr[c].nunique() + 1
    emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inp)
    # Collapse the length-1 axis left by the Embedding lookup
    emb = Reshape((embed_dim,))(emb)
    inputs.append(inp)
    embeddings.append(emb)

# Numeric “embeddings” via dense projection
for c in num_cols:
    inp = Input(shape=(1,), name=f'num_{c.replace("/", "_")}') # Add a prefix 'num_' to numerical input names
    # A separate Dense layer per numeric feature projects the scalar to embed_dim
    proj = Dense(embed_dim)(inp)
    inputs.append(inp)
    embeddings.append(proj)

In [55]:
# Stack into sequence: (batch, seq_len, embed_dim)
# Each (batch, embed_dim) embedding gets a length-1 sequence axis and the pieces
# are concatenated along it, which is the same result as stacking on axis 1.
# Built-in layers are used instead of a Lambda wrapping a Python function so the
# saved model does not depend on serialized Python bytecode when it is reloaded.
x = tf.keras.layers.Concatenate(axis=1)(
    [Reshape((1, embed_dim))(emb_vec) for emb_vec in embeddings]
)

# Two Transformer blocks
x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=128, dropout=0.1)
x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=128, dropout=0.1)

# Pool over the feature axis & fully connect to a sigmoid output
x = GlobalAveragePooling1D()(x)
x = Dropout(0.3)(x)
out = Dense(1, activation='sigmoid')(x)




In [56]:
# Wire the per-feature inputs to the sigmoid output and compile with
# binary cross-entropy and Adam.
model = Model(inputs, out)
model.compile(optimizer='adam', loss='binary_crossentropy')

In [57]:
# 11) Train with early stopping
# At most 50 epochs with 10% of the training data used for validation; stop
# after 5 epochs without improvement and restore the best weights seen.
model.fit(
    train_inputs, y_train,
    validation_split=0.1,
    epochs=50, batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(
        patience=5, restore_best_weights=True)],
    verbose=2
)

Epoch 1/50
172/172 - 15s - 87ms/step - loss: 0.4557 - val_loss: 0.3876
Epoch 2/50
172/172 - 4s - 22ms/step - loss: 0.3800 - val_loss: 0.3619
Epoch 3/50
172/172 - 4s - 22ms/step - loss: 0.3698 - val_loss: 0.3477
Epoch 4/50
172/172 - 6s - 32ms/step - loss: 0.3540 - val_loss: 0.3344
Epoch 5/50
172/172 - 4s - 22ms/step - loss: 0.3520 - val_loss: 0.3218
Epoch 6/50
172/172 - 8s - 45ms/step - loss: 0.3485 - val_loss: 0.3335
Epoch 7/50
172/172 - 7s - 44ms/step - loss: 0.3384 - val_loss: 0.3284
Epoch 8/50
172/172 - 5s - 29ms/step - loss: 0.3309 - val_loss: 0.3077
Epoch 9/50
172/172 - 4s - 22ms/step - loss: 0.3286 - val_loss: 0.3220
Epoch 10/50
172/172 - 5s - 30ms/step - loss: 0.3320 - val_loss: 0.3094
Epoch 11/50
172/172 - 5s - 31ms/step - loss: 0.3337 - val_loss: 0.3285
Epoch 12/50
172/172 - 5s - 28ms/step - loss: 0.3323 - val_loss: 0.3209
Epoch 13/50
172/172 - 5s - 30ms/step - loss: 0.3271 - val_loss: 0.3115


<keras.src.callbacks.history.History at 0x7c355f2de350>

In [58]:
# 12) Evaluate probabilities
# ravel() flattens the model output to 1-D before it is scored.
proba = model.predict(test_inputs).ravel()
evaluate_model('Tabular Transformer', y_test, proba)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step
=== Tabular Transformer ===
ROC AUC: 0.9009556907037359
Accuracy: 0.8270874424720579
Brier score: 0.10872395323606868



In [60]:
# Save the trained Tabular Transformer to an .h5 file.
model.save('tabular_transformer_model.h5')

# Download the Tabular Transformer model
# `files` is the google.colab helper imported in the earlier model-saving cell,
# so that cell must have been run first.
files.download('tabular_transformer_model.h5')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>