In [60]:
import pandas as pd
import re

import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model


In [61]:
# Read the pre-split feature matrices and their targets from the processed-data folder.
X_train, X_val = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_val.csv'),
)

y_train, y_val = (
    pd.read_csv('../data/processed/y_train.csv'),
    pd.read_csv('../data/processed/y_val.csv'),
)

In [62]:
# Strip the characters '[', ']', '<' and '>' from every column name in both splits
# (presumably because XGBoost refuses feature names containing them - TODO confirm).
for frame in (X_train, X_val):
    frame.columns = [re.sub(r"[\[\]<>]", "", name) for name in frame.columns]

#### 1. Model XGBoost 

In [63]:
# Flatten the target frames into 1-D NumPy arrays, the shape the estimator
# and the metric functions below are given.
y_train, y_val = (frame.to_numpy().ravel() for frame in (y_train, y_val))

In [64]:
# Configure the gradient-boosted tree classifier and fit it on the training split.
# `use_label_encoder` is deliberately not passed: the installed XGBoost no longer
# uses it and only answers with a "Parameters: { "use_label_encoder" } are not
# used." warning.
model = xgb.XGBClassifier(
    n_estimators=100,        # number of boosting rounds
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,           # fraction of rows sampled for each tree
    colsample_bytree=0.8,    # fraction of columns sampled for each tree
    random_state=42,         # fixed seed so the row/column sampling is repeatable
    eval_metric='logloss'
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [65]:
# Score the validation split: probability of the positive class (second
# column of predict_proba) and the hard class labels.
y_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)


In [66]:
# Evaluate the XGBoost predictions on the validation split.
# Accuracy and F1 are computed from the hard class labels; ROC AUC and log loss
# are computed from the predicted positive-class probabilities.
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_proba)
logloss = log_loss(y_val, y_proba)

# Report the metrics. The header is a plain string (nothing is interpolated)
# and its spelling is corrected from "REsults".
print("XGBoost Results:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"Log Loss: {logloss:.4f}")

XGBoost REsults:
Accuracy: 0.9835
F1 Score: 0.7027
ROC AUC: 0.9748
Log Loss: 0.0520


Celem modelu było przewidzenie, czy wystąpi awaria maszyny (Machine failure = 1) na podstawie danych procesowych.


| Metryka | Wartość | Interpretacja |
|---|---|---|
| Accuracy | 0.9835 | Model ogólnie bardzo dobrze przewiduje, czy wystąpi awaria – poprawny wynik w ~98% przypadków. Uwaga: awarie są rzadkie (klasy niezbalansowane), więc wysoka accuracy w dużej mierze odzwierciedla przewagę klasy „brak awarii” – przy ocenie modelu ważniejsze są F1 i ROC AUC. |
| F1 Score | 0.7027 | Skuteczność wykrywania awarii – uwzględnia precyzję i czułość. Wynik 0.70 to dobry balans między fałszywymi alarmami a pominięciami. |
| ROC AUC | 0.9748 | Model bardzo dobrze rozróżnia przypadki awarii i normalnej pracy (blisko 1.0 = idealny). |
| Log Loss | 0.0520 | Błąd w przewidywaniu prawdopodobieństw – niska wartość = model przewiduje z dużą pewnością i trafnością. |

#### 2. Neural Network and Embedding

In [67]:
# Re-read the processed splits into separate variables for the neural-network
# experiment, so these frames keep the column names exactly as stored on disk.
X_train_nn, X_val_nn = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_val.csv'),
)
y_train_nn, y_val_nn = (
    pd.read_csv('../data/processed/y_train.csv'),
    pd.read_csv('../data/processed/y_val.csv'),
)

In [68]:
# Columns that are fed through embedding layers; every remaining column is
# treated as a numeric input (original column order is kept).
categorical_cols = ['Product ID', 'Type']
numeric_cols = list(filter(lambda name: name not in categorical_cols, X_train_nn.columns))

In [69]:
# Cast the categorical columns to int in case they were not read as integers.
for frame in (X_train_nn, X_val_nn):
    frame[categorical_cols] = frame[categorical_cols].astype(int)

# One 1-D NumPy array per categorical column, in the order of categorical_cols
X_train_cat = [X_train_nn[name].to_numpy() for name in categorical_cols]
X_val_cat = [X_val_nn[name].to_numpy() for name in categorical_cols]

# Single 2-D NumPy array holding all numeric columns
X_train_num = X_train_nn[numeric_cols].to_numpy()
X_val_num = X_val_nn[numeric_cols].to_numpy()

# Targets flattened to 1-D (this rebinds y_train / y_val used by the XGBoost cells)
y_train = y_train_nn.to_numpy().ravel()
y_val = y_val_nn.to_numpy().ravel()

In [70]:
# Build one (Input -> Embedding -> Flatten) branch per categorical feature.
# inputs_cat collects the Keras inputs, embedding collects the flattened
# embedding tensors, both in the order of categorical_cols.
inputs_cat = []
embedding = []

for col in categorical_cols:
    col_clean = col.replace(" ", "_").lower()  # e.g. 'Product ID' -> 'product_id'
    # Size the lookup table from BOTH splits. Using the training maximum alone
    # leaves any larger index that occurs only in validation outside the table,
    # which makes the lookup fail (or silently yield zeros, depending on device).
    # Assumes the category codes are non-negative integers - TODO confirm.
    vocab_size = int(max(X_train_nn[col].max(), X_val_nn[col].max())) + 1
    input_cat = Input(shape=(1,), name=f'{col_clean}_input')
    embed = Embedding(input_dim=vocab_size, output_dim=4, name=f'{col_clean}_embed')(input_cat)
    embed = Flatten()(embed)  # (batch, 1, 4) -> (batch, 4) so it can be concatenated
    inputs_cat.append(input_cat)
    embedding.append(embed)

In [71]:
# Input for the numeric features: one slot per column of X_train_num.
input_num = Input(shape=X_train_num.shape[1:], name='num_input')

In [72]:
# Join the flattened embeddings with the numeric input into one feature tensor.
x = Concatenate()([*embedding, input_num])

In [73]:
# Two ReLU hidden layers (64 then 32 units) followed by a single sigmoid unit.
for units in (64, 32):
    x = Dense(units, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

In [74]:
# Assemble the functional model (categorical inputs first, numeric input last),
# compile it for binary classification and print the layer overview.
# Note: this rebinds `model`, which previously referred to the XGBoost classifier.
model = Model(inputs=[*inputs_cat, input_num], outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [75]:
# Inputs are passed in the same order as the model's inputs:
# [Product ID, Type, numeric features].
train_inputs = X_train_cat + [X_train_num]
val_inputs = X_val_cat + [X_val_num]

history = model.fit(
    x=train_inputs,
    y=y_train,                              # target array loaded from y_train.csv
    validation_data=(val_inputs, y_val),
    epochs=10,                              # can be raised, e.g. to 20
    batch_size=128,
    verbose=1
)

Epoch 1/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9659 - loss: 0.3827 - val_accuracy: 0.9660 - val_loss: 0.1954
Epoch 2/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9664 - loss: 0.1773 - val_accuracy: 0.9660 - val_loss: 0.1547
Epoch 3/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9634 - loss: 0.1609 - val_accuracy: 0.9660 - val_loss: 0.1293
Epoch 4/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9662 - loss: 0.1199 - val_accuracy: 0.9665 - val_loss: 0.1099
Epoch 5/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9688 - loss: 0.0950 - val_accuracy: 0.9710 - val_loss: 0.1016
Epoch 6/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9741 - loss: 0.0851 - val_accuracy: 0.9710 - val_loss: 0.0993
Epoch 7/10
[1m47/47[0m [32m━━━━━━━━━━

In [76]:
# Predicted probabilities for the validation split, flattened to 1-D
val_inputs = [*X_val_cat, X_val_num]
y_pred_proba = model.predict(val_inputs).ravel()

# Hard class labels (0/1) using a 0.5 decision threshold
above_threshold = y_pred_proba > 0.5
y_pred = above_threshold.astype(int)

# Metrics: label-based (accuracy, F1) and probability-based (AUC, log loss)
acc_nn = accuracy_score(y_val, y_pred)
f1_nn = f1_score(y_val, y_pred)
auc_nn = roc_auc_score(y_val, y_pred_proba)
loss_nn = log_loss(y_val, y_pred_proba)

# Print the results
print("NN + Embedding Results:")
for line in (
    f"  Accuracy : {acc_nn:.4f}",
    f"  F1 Score : {f1_nn:.4f}",
    f"  AUC      : {auc_nn:.4f}",
    f"  LogLoss  : {loss_nn:.4f}",
):
    print(line)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
NN + Embedding Results:
  Accuracy : 0.9540
  F1 Score : 0.3947
  AUC      : 0.9205
  LogLoss  : 0.1140


In [77]:
# Side-by-side comparison of both models; each list is ordered [XGBoost, NN].
results_summary = {
    "Model": ["XGBoost", "NN + Embedding"],
    "Accuracy": [acc, acc_nn],
    "F1 Score": [f1, f1_nn],
    "AUC": [roc_auc, auc_nn],
    "LogLoss": [logloss, loss_nn],
}

# Convert to a DataFrame and round to four decimals
df_summary = pd.DataFrame(data=results_summary)
df_summary = df_summary.round(decimals=4)

# Display (last expression of the cell)
df_summary

Unnamed: 0,Model,Accuracy,F1 Score,AUC,LogLoss
0,XGBoost,0.9835,0.7027,0.9748,0.052
1,NN + Embedding,0.954,0.3947,0.9205,0.114
