In [5]:
import pandas as pd
import re

import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model


In [6]:
# Read the train/validation feature tables first, then the matching target tables.
X_train, X_val = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/X_train.csv', '../data/processed/X_val.csv')
)
y_train, y_val = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/y_train.csv', '../data/processed/y_val.csv')
)

In [7]:
# Drop the characters '[', ']', '<' and '>' from every column name in both splits
# (presumably because XGBoost refuses feature names containing them — confirm).
X_train.columns = X_train.columns.str.replace(r"[\[\]<>]", "", regex=True)
X_val.columns = X_val.columns.str.replace(r"[\[\]<>]", "", regex=True)

#### 1. Model XGBoost 

In [8]:
# Convert the target tables to flat 1-D NumPy arrays, the format used for XGBoost
# fitting and the sklearn metrics below.
# Wrapping in pd.DataFrame first keeps this cell re-runnable: after one execution the
# names already hold ndarrays, which have no `.values` attribute, so the previous
# `y_train.values.ravel()` form failed on a second run.
y_train = pd.DataFrame(y_train).to_numpy().ravel()
y_val = pd.DataFrame(y_val).to_numpy().ravel()

In [9]:
# Configure the XGBoost classifier and fit it on the training split.
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores it and
# only reports 'Parameters: { "use_label_encoder" } are not used.' during fit.
model = xgb.XGBClassifier(
    n_estimators=100,        # number of boosting rounds
    max_depth=5,             # maximum depth of each tree
    learning_rate=0.1,
    subsample=0.8,           # fraction of rows sampled per tree
    colsample_bytree=0.8,    # fraction of columns sampled per tree
    random_state=42,         # fixed seed so the sampling above is repeatable
    eval_metric='logloss'
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
# Validation-set outputs: probabilities from column index 1 of predict_proba
# (presumably the failure class, label 1 — confirm) and the hard class labels.
y_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)


In [12]:
# Label-based metrics are computed from y_pred, probability-based ones from y_proba.
acc, f1 = accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)
roc_auc, logloss = roc_auc_score(y_val, y_proba), log_loss(y_val, y_proba)

# Emit the report, one metric per line, each rounded to four decimals.
print(
    "XGBoost REsults:",
    f"Accuracy: {acc:.4f}",
    f"F1 Score: {f1:.4f}",
    f"ROC AUC: {roc_auc:.4f}",
    f"Log Loss: {logloss:.4f}",
    sep="\n",
)

XGBoost REsults:
Accuracy: 0.9835
F1 Score: 0.7027
ROC AUC: 0.9748
Log Loss: 0.0520


Celem modelu było przewidzenie, czy wystąpi awaria maszyny (Machine failure = 1) na podstawie danych procesowych.


| Metryka | Wartość | Interpretacja |
|---|---|---|
| Accuracy | 0.9835 | Model ogólnie bardzo dobrze przewiduje, czy wystąpi awaria – poprawny wynik w ~98% przypadków. Uwaga: jeśli awarie są rzadkie, wysoka accuracy może wynikać głównie z poprawnego rozpoznawania normalnej pracy, dlatego warto patrzeć przede wszystkim na F1 i ROC AUC. |
| F1 Score | 0.7027 | Skuteczność wykrywania awarii – uwzględnia precyzję i czułość. Wynik 0.70 to dobry balans między fałszywymi alarmami a pominięciami. |
| ROC AUC | 0.9748 | Model bardzo dobrze rozróżnia przypadki awarii i normalnej pracy (blisko 1.0 = idealny). |
| Log Loss | 0.0520 | Błąd w przewidywaniu prawdopodobieństw – niska wartość = model przewiduje z dużą pewnością i trafnością. |

#### 2. Neural Network and embedding

In [6]:
# Fresh copies of the processed splits for the neural-network section:
# features first, then targets, each as (train, validation).
X_train_nn, X_val_nn = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/X_train.csv', '../data/processed/X_val.csv')
)
y_train_nn, y_val_nn = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/y_train.csv', '../data/processed/y_val.csv')
)

In [7]:
# Names of the categorical features; every other column of the training frame is
# treated as numeric, keeping the frame's column order.
categorical_cols = ['Product ID', 'Type']
numeric_cols = X_train_nn.columns[~X_train_nn.columns.isin(categorical_cols)].tolist()

In [8]:
# In case the categorical columns were not loaded as ints, cast them in both splits.
for frame in (X_train_nn, X_val_nn):
    for col in categorical_cols:
        frame[col] = frame[col].astype(int)

# One NumPy array per categorical column, in the order given by categorical_cols.
X_train_cat = [X_train_nn[name].to_numpy() for name in categorical_cols]
X_val_cat = [X_val_nn[name].to_numpy() for name in categorical_cols]

# All numeric columns together as a single 2-D array per split.
X_train_num = X_train_nn[numeric_cols].to_numpy()
X_val_num = X_val_nn[numeric_cols].to_numpy()

# Targets flattened to 1-D arrays.
y_train = y_train_nn.to_numpy().ravel()
y_val = y_val_nn.to_numpy().ravel()

In [9]:
# Build one Input -> Embedding -> Flatten branch per categorical feature.
# inputs_cat collects the Input tensors, embedding collects the flattened outputs.
inputs_cat = []
embedding = []

for col in categorical_cols:
    # Embedding's input_dim must be larger than every index it will look up, so size
    # the table from BOTH splits: an index that occurs only in the validation data
    # would otherwise be out of range for a table sized from training data alone.
    vocab_size = int(max(X_train_nn[col].max(), X_val_nn[col].max())) + 1
    # NOTE(review): layer names are built from column names that contain spaces
    # (e.g. 'Product ID'); some Keras versions may reject such names — confirm.
    input_cat = Input(shape=(1,), name=f'{col}_input')
    embed = Embedding(input_dim=vocab_size, output_dim=4, name=f'{col}_embed')(input_cat)
    embed = Flatten()(embed)  # (batch, 1, 4) -> (batch, 4)
    inputs_cat.append(input_cat)
    embedding.append(embed)

In [10]:
# One input tensor for the numeric features; its width is the number of columns
# in X_train_num.
n_numeric_features = X_train_num.shape[1]
input_num = Input(shape=(n_numeric_features,), name='num_input')

In [12]:
# Join the flattened embedding branches and the numeric input into one tensor,
# embeddings first.
x = Concatenate()([*embedding, input_num])

In [13]:
# Two ReLU hidden layers (64 then 32 units) followed by a single sigmoid output unit.
for units in (64, 32):
    x = Dense(units, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

In [15]:
# Assemble the functional model (categorical inputs first, numeric input last),
# compile it for binary classification and print the layer summary.
model = Model(inputs=[*inputs_cat, input_num], outputs=output)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()