# MSL z pyCLAD

### Wczytanie danych

In [1]:
import os
import numpy as np
import pandas as pd

train_data_dir = os.path.join("data/train")
test_data_dir = os.path.join("data/test")

metadata = pd.read_csv("labeled_anomalies.csv")
msl_channels = metadata[metadata['spacecraft'] == 'MSL']['chan_id'].tolist()

# Wczytanie danych treningowych
train_data = {}
for channel in msl_channels:
    npy_path = os.path.join(train_data_dir, f"{channel}.npy")
    if os.path.exists(npy_path):
        arr = np.load(npy_path)
        train_data[channel] = arr[:, 0] 
    else:
        print(f"Brak pliku: {npy_path}")

# Wczytanie danych testowych
test_data = {}
for channel in msl_channels:
    npy_path = os.path.join(test_data_dir, f"{channel}.npy")
    if os.path.exists(npy_path):
        arr = np.load(npy_path)
        test_data[channel] = arr[:, 0] 
    else:
        print(f"Brak pliku: {npy_path}")

# Normalizacja danych do 0-1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for channel in list(train_data.keys()):
    train_data[channel] = scaler.fit_transform(train_data[channel].reshape(-1, 1))
    test_data[channel] = scaler.transform(test_data[channel].reshape(-1, 1))

# Tworzenie etykiet
test_labels = {}
for channel in msl_channels:
    values = test_data[channel]
    labels = np.zeros(len(values), dtype=int) 

    channel_meta = metadata[metadata['chan_id'] == channel]
    if not channel_meta.empty:
        seq_str = channel_meta.iloc[0]['anomaly_sequences']
        if pd.notnull(seq_str) and seq_str != '[]':
            anomaly_intervals = eval(seq_str)
            for start, end in anomaly_intervals:
                start_idx = max(0, start)
                end_idx = min(len(values)-1, end)
                labels[start_idx:end_idx+1] = 1

    test_labels[channel] = labels

del arr, channel, npy_path, train_data_dir, test_data_dir, values, labels, start, end, start_idx, end_idx, seq_str, channel_meta, anomaly_intervals


### Funkcja tworzenia okien czasowych

In [2]:
def create_windows(data, window_size=10, step_size=1):
    arr = np.asarray(data)
    if arr.ndim > 1:
        arr = arr.reshape(-1)
    windows = []
    for start in range(0, len(arr) - window_size + 1, step_size):
        end = start + window_size
        windows.append(arr[start:end].astype(float))
    return np.array(windows)

In [3]:
def create_window_labels(labels, window_size=10, step_size=1):
    window_labels = []
    for start in range(0, len(labels) - window_size + 1, step_size):
        end = start + window_size
        window_labels.append(int(labels[start:end].any()))  # jeśli choć jedna próbka jest anomalna, etykieta okna mówi o anomalii
    return np.array(window_labels)


### Tworzenie okien czasowych

In [None]:
# Parametry okna czasowego
window_size = 10
step_size = 1

# Tworzenie okien dla wszystkich kanałów
train_windows = {}
test_windows = {}

for channel in train_data.keys():
    train_windows[channel] = create_windows(train_data[channel], window_size, step_size)
    test_windows[channel] = create_windows(test_data[channel], window_size, step_size)
    test_labels[channel] = create_window_labels(test_labels[channel], window_size, step_size)



### Tworzenie Konceptów dla pyCLAD

In [5]:
from pyclad.data.concept import Concept

train_concepts = []
test_concepts = []

for channel in msl_channels:
    train_concept = Concept(name=channel, data=train_windows[channel])
    train_concepts.append(train_concept)

    test_concept = Concept(name=channel, data=test_windows[channel], labels=test_labels[channel])
    test_concepts.append(test_concept)

### Definicja datasetu

In [6]:
from pyclad.data.datasets.concepts_dataset import ConceptsDataset

dataset = ConceptsDataset(
    name="MSL_widows_dataset",
    train_concepts=train_concepts,
    test_concepts=test_concepts
)

### Definicja modelu

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from pyclad.models.model import Model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class AE(nn.Module):
    def __init__(self, input_dim):
        super(AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

class AEp(Model):
    def __init__(self, input_dim, lr=1e-3, percentile=95, epochs=80):
        super().__init__()
        self.input_dim = input_dim
        self.module = AE(self.input_dim).to(device)
        self.lr = lr
        self.epochs = epochs
        self.percentile = percentile
        self.threshold = None  # zostanie ustawiony po treningu

    def fit(self, data: np.ndarray):
        dataset = TensorDataset(torch.Tensor(data))
        loader = DataLoader(dataset, batch_size=128, shuffle=True)
        optimizer = torch.optim.Adam(self.module.parameters(), lr=self.lr)
        loss_fn = nn.MSELoss()
        self.module.train()
        for epoch in range(self.epochs):
            total_loss = 0.0
            for batch in loader:
                x = batch[0].to(device)
                optimizer.zero_grad()
                x_hat = self.module(x)
                loss = loss_fn(x_hat, x)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

        self.module.eval()
        with torch.no_grad():
            x = torch.Tensor(data).to(device)
            x_hat = self.module(x)
            rec_error = ((data - x_hat.cpu().numpy()) ** 2).mean(axis=1)
            self.threshold = np.percentile(rec_error, self.percentile)

    def predict(self, data: np.ndarray):
        self.module.eval()
        with torch.no_grad():
            x = torch.Tensor(data).to(device)
            x_hat = self.module(x)
            rec_error = ((data - x_hat.cpu().numpy()) ** 2).mean(axis=1)
            binary_preds = (rec_error > self.threshold).astype(int)
        return binary_preds, rec_error

    def name(self):
        return "AEp"



Using device: cuda


In [8]:
model = AEp(input_dim=window_size, lr=1e-3, epochs=80, percentile=99)

### Definicja strategii

In [None]:
from pyclad.strategies.baselines.cumulative import CumulativeStrategy
from pyclad.strategies.baselines.naive import NaiveStrategy
from pyclad.strategies.replay.replay import ReplayEnhancedStrategy
from pyclad.strategies.baselines.mste import MSTE
from pyclad.strategies.replay.buffers.adaptive_balanced import AdaptiveBalancedReplayBuffer

from pyclad.strategies.replay.selection.random import RandomSelection

replay_buffer = AdaptiveBalancedReplayBuffer(selection_method=RandomSelection(), max_size=1000)

#strategy = NaiveStrategy(model)
#strategy = CumulativeStrategy(model)
#strategy = ReplayEnhancedStrategy(model, replay_buffer=replay_buffer)

strategy = MSTE(lambda: AEp(input_dim=window_size, epochs=80, percentile=99, lr=1e-3))

### Callbacki

In [10]:
from pyclad.callbacks.evaluation.concept_metric_evaluation import ConceptMetricCallback
from pyclad.callbacks.evaluation.memory_usage import MemoryUsageCallback
from pyclad.callbacks.evaluation.time_evaluation import TimeEvaluationCallback

from pyclad.metrics.base.roc_auc import RocAuc
from pyclad.metrics.continual.average_continual import ContinualAverage
from pyclad.metrics.continual.backward_transfer import BackwardTransfer
from pyclad.metrics.continual.forward_transfer import ForwardTransfer

from pyclad.callbacks.callback import Callback
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

class F1ScoreCallback(Callback):
    def __init__(self):
        super().__init__()
        self.scores = {} 

    def after_evaluation(self, *args, **kwargs):
        y_true = kwargs.get("y_true")
        y_pred = kwargs.get("y_pred")
        concept = kwargs.get("evaluated_concept")
        if y_true is not None and y_pred is not None:
            f1 = f1_score(y_true, y_pred)
            concept_name = concept.name if concept else f"concept_{len(self.scores)+1}"
            self.scores[concept_name] = f1

    def info(self):
        return {"F1Score": self.scores}
    
class ClassificationReportCallback(Callback):
    def __init__(self):
        super().__init__()
        self.reports = {}  
        
    def after_evaluation(self, *args, **kwargs):
        y_true = kwargs.get("y_true")
        y_pred = kwargs.get("y_pred")
        concept = kwargs.get("evaluated_concept")

        if y_true is not None and y_pred is not None:
            report = classification_report(
                y_true, 
                y_pred, 
                target_names=["Normal", "Anomaly"], 
                output_dict=True,
                #zero_division=0
            )
            concept_name = concept.name if concept else f"concept_{len(self.reports)+1}"
            self.reports[concept_name] = report

    def info(self):
        return {"ClassificationReports": self.reports}


callbacks = [
    ConceptMetricCallback(
        base_metric=RocAuc(),
        metrics=[ContinualAverage(), BackwardTransfer(), ForwardTransfer()]
    ),
    TimeEvaluationCallback(),
    MemoryUsageCallback(),
    F1ScoreCallback(),
    ClassificationReportCallback()
]

### Uruchomienie scenariusza

In [11]:

from pyclad.output.json_writer import JsonOutputWriter
from pyclad.scenarios.concept_aware import ConceptAwareScenario
from pyclad.scenarios.concept_agnostic import ConceptAgnosticScenario
import pathlib


scenario = ConceptAwareScenario(
    dataset=dataset,
    strategy=strategy,
    callbacks=callbacks
)

# Uruchomienie eksperymentu
scenario.run()

output_writer = JsonOutputWriter(pathlib.Path("output-strategy2.json"))
output_writer.write([model, dataset, strategy, *callbacks])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [12]:
import json
import pandas as pd

def extract_classification_reports(json_path, output_csv):
    with open(json_path, "r") as f:
        data = json.load(f)

    reports = data.get("ClassificationReports", {})

    rows = []
    for concept, metrics in reports.items():
        row = {
            "concept": concept,
            "accuracy": metrics.get("accuracy", None),
            "f1_norm": metrics.get("Normal", {}).get("f1-score", None),
            "f1_anom": metrics.get("Anomaly", {}).get("f1-score", None),
            "prec_norm": metrics.get("Normal", {}).get("precision", None),
            "prec_anom": metrics.get("Anomaly", {}).get("precision", None),
            "recall_norm": metrics.get("Normal", {}).get("recall", None),
            "recall_anom": metrics.get("Anomaly", {}).get("recall", None),
        }
        rows.append(row)

    df = pd.DataFrame(rows)

    df.to_csv(output_csv, index=False)
    print(f" Zapisano wyniki do {output_csv}")
    return df


if __name__ == "__main__":
    input_json = "output-strategy2.json"
    output_csv = "pyclad_classifiaction_report.csv"
    df = extract_classification_reports(input_json, output_csv)
    print(df.head())

 Zapisano wyniki do pyclad_classifiaction_report.csv
  concept  accuracy   f1_norm   f1_anom  prec_norm  prec_anom  recall_norm  \
0     M-6  0.569231  0.686567  0.311475   1.000000   0.184466     0.522727   
1     M-1  0.344037  0.511945  0.000000   0.392670   0.000000     0.735294   
2     M-2  0.862385  0.846939  0.875000   0.882979   0.846774     0.813725   
3     S-2  0.994220  0.996923  0.952381   0.993865   1.000000     1.000000   
4    P-10  0.768719  0.863859  0.232044   0.995485   0.132911     0.762976   

   recall_anom  
0     1.000000  
1     0.000000  
2     0.905172  
3     0.909091  
4     0.913043  
