In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Window size identifier; it is interpolated into the label CSV filenames
# (train/test "..._windows_{window_size}_llm.csv") read by the loading cell.
window_size = 2240
# Component count; it is interpolated into the transformed-feature CSV filenames
# ("transformed_*_data_comp_{n_components}.csv") read by the loading cell.
# NOTE(review): presumably the number of components kept by an upstream
# dimensionality reduction — confirm against the script that produced the files.
n_components = 30

In [3]:
# Training features: header-less numeric CSV.
X_train = np.loadtxt(
    f'../data/v8.4/transformed_train_data_comp_{n_components}.csv',
    delimiter=',',
)

# Training labels: CSV with a header row. The "row" column is only an index,
# so it is discarded right after loading.
y_train = pd.read_csv(
    f'../data/train_value_min_label_windows_{window_size}_llm.csv'
).drop(columns=['row'])

# Test split, same layout: header-less features, labels with a header.
X_test = np.loadtxt(
    f'../data/v8.4/transformed_test_data_comp_{n_components}.csv',
    delimiter=',',
)
y_test = pd.read_csv(
    f'../data/test_value_min_label_windows_{window_size}_llm.csv'
).drop(columns=['row'])


In [4]:
# Standardize the features. The scaling statistics are estimated on the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

print(X_train_scaled.shape, X_test_scaled.shape)

(1193, 30) (534, 30)


In [5]:
def generate_class(row, appliance):
    """Return the state label of one appliance for a single label row.

    Args:
        row: pandas Series holding one on/off flag per appliance.
        appliance: label (key of ``row``) of the appliance being classified.

    Returns:
        'off' / 'on' when every other appliance flag in the row is falsy,
        'off w int' / 'on w int' when at least one other flag is truthy
        ("with interference"), or None when the appliance flag is neither
        0 nor 1.
    """
    own_state = row[appliance]
    # True as soon as any of the remaining appliances has a truthy flag.
    interference = any(row.drop(appliance).values)

    if own_state == 0:
        return 'off w int' if interference else 'off'
    if own_state == 1:
        return 'on w int' if interference else 'on'
    return None

In [6]:
from xgboost import XGBClassifier

# Train and evaluate one XGBoost classifier per appliance column of the label
# frame. Each classifier is multi-class (up to four states per appliance:
# 'off', 'on', 'off w int', 'on w int'), not binary.
for col in y_train.columns:
    # Build the per-row state label of the current appliance.
    y_train_bin = y_train.apply(lambda row: generate_class(row, col), axis=1)
    y_test_bin = y_test.apply(lambda row: generate_class(row, col), axis=1)

    # Encode the string labels as integers 0..k-1, where k is the number of
    # distinct classes present in the training labels.
    label_encoder = LabelEncoder()
    y_train_bin_encoded = label_encoder.fit_transform(y_train_bin)
    y_test_bin_encoded = label_encoder.transform(y_test_bin)

    # ROC AUC is undefined when the TEST labels hold a single class, so the
    # appliance is skipped explicitly (and before paying for a model fit).
    test_classes = np.unique(y_test_bin_encoded)
    if len(test_classes) < 2:
        print(f"Resultados para la clase {col}: omitidos (el conjunto de test contiene una sola clase)")
        continue

    # Handle class imbalance with 'balanced' per-sample weights.
    unique_classes = np.unique(y_train_bin_encoded)
    class_weights = compute_class_weight('balanced', classes=unique_classes, y=y_train_bin_encoded)
    # unique_classes is exactly 0..k-1 (LabelEncoder output), so the encoded
    # labels can index the weight array directly.
    sample_weight = class_weights[y_train_bin_encoded]

    # Train XGBoost with the per-sample weights.
    xgb_clf = XGBClassifier(
        eval_metric='mlogloss'
    )
    xgb_clf.fit(X_train_scaled, y_train_bin_encoded, sample_weight=sample_weight)

    # Make predictions.
    y_pred_bin = xgb_clf.predict(X_test_scaled)
    y_pred_proba = xgb_clf.predict_proba(X_test_scaled)

    # Calculate metrics.
    accuracy = accuracy_score(y_test_bin_encoded, y_pred_bin)
    f1_macro = f1_score(y_test_bin_encoded, y_pred_bin, average="macro")
    f1_weighted = f1_score(y_test_bin_encoded, y_pred_bin, average="weighted")

    # predict_proba has one column per TRAINING class. roc_auc_score needs the
    # score columns to match the classes present in y_true, so when the test
    # split lacks some training classes, keep only the columns of the classes
    # that do occur and renormalize each row to sum to 1. When every training
    # class occurs in the test split the probabilities are used untouched.
    proba_present = y_pred_proba
    if len(test_classes) != y_pred_proba.shape[1]:
        proba_present = y_pred_proba[:, test_classes]
        proba_present = proba_present / proba_present.sum(axis=1, keepdims=True)

    if len(test_classes) > 2:
        auc_macro = roc_auc_score(
            y_test_bin_encoded, proba_present,
            multi_class="ovr", average="macro", labels=test_classes,
        )
        auc_weighted = roc_auc_score(
            y_test_bin_encoded, proba_present,
            multi_class="ovr", average="weighted", labels=test_classes,
        )
    else:
        # Two classes: roc_auc_score treats the larger label as positive, which
        # is the second column of proba_present (columns follow sorted labels).
        auc_macro = roc_auc_score(y_test_bin_encoded, proba_present[:, 1])
        auc_weighted = auc_macro

    # Report every class that occurs in the test labels OR in the predictions.
    # This is the same label set f1_score and confusion_matrix use by default,
    # so a class that is present in the test set but never predicted still gets
    # its (zero) row and the report averages agree with the F1 values above.
    report_labels = np.union1d(test_classes, np.unique(y_pred_bin))
    report = classification_report(
        y_test_bin_encoded,
        y_pred_bin,
        labels=report_labels,
        target_names=label_encoder.classes_[report_labels].tolist(),
    )
    # Rows/columns follow report_labels, i.e. the order of the report rows.
    cm = confusion_matrix(y_test_bin_encoded, y_pred_bin, labels=report_labels)

    # Show results.
    print(f"Resultados para la clase {col}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (macro): {f1_macro:.4f}")
    print(f"F1 Score (weighted): {f1_weighted:.4f}")
    print(f"AUC (macro): {auc_macro:.4f}")
    print(f"AUC (weighted): {auc_weighted:.4f}")
    print("Classification Report:\n", report)
    print(f"Confusion Matrix:\n{cm}")

Resultados para la clase Sockets01:
Accuracy: 0.9139
F1 Score (macro): 0.6151
F1 Score (weighted): 0.9100
AUC (macro): 0.8404
AUC (weighted): 0.9305
Classification Report:
               precision    recall  f1-score   support

         off       0.88      0.99      0.93       243
   off w int       0.96      0.87      0.91       284
    on w int       0.00      0.00      0.00         7

    accuracy                           0.91       534
   macro avg       0.61      0.62      0.62       534
weighted avg       0.91      0.91      0.91       534

Confusion Matrix:
[[241   2   0]
 [ 34 247   3]
 [  0   7   0]]
Resultados para la clase Sockets02:
Accuracy: 0.9326
F1 Score (macro): 0.9325
F1 Score (weighted): 0.9327
AUC (macro): 0.9398
AUC (weighted): 0.9398
Classification Report:
               precision    recall  f1-score   support

         off       0.88      0.99      0.93       243
   off w int       0.99      0.88      0.93       291

    accuracy                           0.93  