In [2]:
!pip install --quiet wandb

In [1]:
import wandb
print(wandb.__version__)
# from wandb.keras import WandbCallback


0.19.9


In [3]:
import torch
import torch.nn as nn
import pytorch_lightning as pl

class LSTMIDS(pl.LightningModule):
    def __init__(self, input_size, hidden_size, dense_units, num_classes, learning_rate):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.batch_norm = nn.BatchNorm1d(hidden_size)

        layers = []
        prev_units = hidden_size
        for units in dense_units:
            layers.extend([
                nn.Linear(prev_units, units),
                nn.ReLU(),
                nn.BatchNorm1d(units),
                nn.Dropout(0.3)
            ])
            prev_units = units

        self.dense_block = nn.Sequential(*layers)
        self.output = nn.Linear(prev_units, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]
        x = self.batch_norm(x)
        x = self.dropout(x)
        x = self.dense_block(x)
        return self.output(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log("train_loss", loss)
        self.log("train_acc", acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log("val_loss", loss)
        self.log("val_acc", acc, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)


In [4]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

class IDSDataModule(pl.LightningDataModule):
    def __init__(self, X, y, sequence_length, batch_size):
        super().__init__()
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.X, self.y = X, y

    def setup(self, stage=None):
        X_train, X_val, y_train, y_val = train_test_split(self.X, self.y, test_size=0.2, stratify=self.y)
        self.train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                           torch.tensor(y_train, dtype=torch.long))
        self.val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                         torch.tensor(y_val, dtype=torch.long))

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)


In [1]:
import pandas as pd
df = pd.read_parquet('/kaggle/input/cic-ids-2017-parquet/cic_ids_2017.parquet')
df.head()
# df.columns_type

262024980

In [2]:
df.columns


Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count',
       'ECE Flag Cnt', 'Dow

In [3]:
df.shape

(3119345, 84)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import pickle
import warnings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings('ignore')

# Configuration
config = OmegaConf.create({
  "wandb": {
    "project": "DL-NIDS-2--cic-ids-2017",
    "entity": "mohammad-fleity-lebanese-university",
    "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
    "notes": "LSTM model for network intrusion detection with PyTorch Lightning"
  },
  "model": {
    "name": "lstm",
    "hidden_size": 80,
    "num_layers": 1,
    "dropout": 0.3,
    "dense_units": [80],
    "learning_rate": 0.001,
    "weight_decay": 1e-5
  },
  "training": {
    "sequence_length": 3,
    "batch_size": 64,
    "max_epochs": 10,
    "early_stopping_patience": 5,
    "gpus": 1 if torch.cuda.is_available() else 0
  },
  "data": {
    "raw": "cic_ids_2017.parquet",
    "test_size": 0.2,
    "num_workers": 4
  }
})

class LSTMModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.config = config
        self.save_hyperparameters()
        
        # LSTM Layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            dropout=config.model.dropout if config.model.num_layers > 1 else 0
        )
        
        # Batch normalization after LSTM
        self.lstm_bn = nn.BatchNorm1d(config.model.hidden_size)
        
        # Dropout layer
        self.dropout = nn.Dropout(config.model.dropout)
        
        # Dense layers
        self.dense_layers = nn.ModuleList()
        prev_units = config.model.hidden_size
        for units in config.model.dense_units:
            self.dense_layers.append(nn.Linear(prev_units, units))
            self.dense_layers.append(nn.BatchNorm1d(units))
            self.dense_layers.append(nn.ReLU())
            self.dense_layers.append(nn.Dropout(config.model.dropout))
            prev_units = units
        
        # Output layer
        self.output = nn.Linear(prev_units, num_classes)
        
        # Loss function
        self.criterion = nn.CrossEntropyLoss()
    
    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        # We only need the last timestep's output for classification
        lstm_out = lstm_out[:, -1, :]
        
        # Batch norm
        lstm_out = self.lstm_bn(lstm_out)
        
        # Dense layers
        x = lstm_out
        for layer in self.dense_layers:
            x = layer(x)
        
        # Output
        return self.output(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        # Log training metrics
        self.log('train_loss', loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        # Calculate accuracy
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Log validation metrics
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': acc}
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Log test metrics
        self.log('test_loss', loss)
        self.log('test_acc', acc)
        return {'test_loss': loss, 'test_acc': acc, 'preds': preds, 'targets': y}
    
    def configure_optimizers(self):
        optimizer = optim.Adam(
            self.parameters(),
            lr=self.config.model.learning_rate,
            weight_decay=self.config.model.weight_decay
        )
        return optimizer

# class NIDSDataModule(pl.LightningDataModule):
#     def __init__(self, config):
#         super().__init__()
#         self.config = config
#         self.batch_size = config.training.batch_size
#         self.sequence_length = config.training.sequence_length
#         self.num_workers = config.data.num_workers
        
#     def prepare_data(self):
#         # Load and preprocess data
#         df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
#         # Clean data
#         df.replace([np.inf, -np.inf], np.nan, inplace=True)
#         df.dropna(inplace=True)
        
#         # Remove duplicates
#         df.drop_duplicates(inplace=True)
        
#         # Encode labels
#         self.label_encoder = LabelEncoder()
#         df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
#         self.classes = self.label_encoder.classes_
        
#         # Split data
#         train_df, test_df = train_test_split(
#             df, test_size=self.config.data.test_size, 
#             random_state=42, 
#             stratify=df['Label_Num']
#         )
        
#         # Prepare features and labels
#         X_train = train_df.drop(['Label', 'Label_Num', 'Timestamp', 'Flow ID'], axis=1)
#         y_train = train_df['Label_Num']
#         X_test = test_df.drop(['Label', 'Label_Num', 'Timestamp', 'Flow ID'], axis=1)
#         y_test = test_df['Label_Num']
        
#         # Standardize features
#         self.scaler = StandardScaler()
#         X_train = self.scaler.fit_transform(X_train)
#         X_test = self.scaler.transform(X_test)
        
#         # Create sequences
#         self.X_train_seq, self.y_train_seq = self.create_sequences(X_train, y_train)
#         self.X_test_seq, self.y_test_seq = self.create_sequences(X_test, y_test)

class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        
    def prepare_data(self):
        # Load and preprocess data
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        
        # Remove duplicates
        df.drop_duplicates(inplace=True)
        # Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
        # Identify non-numeric columns to exclude
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 'Src Port','Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
        
        # Split data
        train_df, test_df = train_test_split(
            df, test_size=self.config.data.test_size, 
            random_state=42, 
            stratify=df['Label_Num']
        )
        
        # Prepare features and labels
        X_train = train_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_train = train_df['Label_Num']
        X_test = test_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_test = test_df['Label_Num']
        
        # Standardize features
        self.scaler = StandardScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        
        # Create sequences
        self.X_train_seq, self.y_train_seq = self.create_sequences(X_train, y_train)
        self.X_test_seq, self.y_test_seq = self.create_sequences(X_test, y_test)        
    
    def create_sequences(self, X, y):
        X_sequences = []
        y_sequences = []
        
        for i in range(len(X) - self.sequence_length):
            X_sequences.append(X[i:i+self.sequence_length])
            y_sequences.append(y.iloc[i+self.sequence_length-1])
            
        return np.array(X_sequences), np.array(y_sequences)
    
    def setup(self, stage=None):
        # Convert to tensors
        self.train_dataset = TensorDataset(
            torch.FloatTensor(self.X_train_seq),
            torch.LongTensor(self.y_train_seq)
        )
        self.test_dataset = TensorDataset(
            torch.FloatTensor(self.X_test_seq),
            torch.LongTensor(self.y_test_seq)
        )
    
    def train_dataloader(self):
        # return DataLoader(
        #     self.train_dataset,
        #     batch_size=self.batch_size,
        #     shuffle=True,
        #     num_workers=self.num_workers,
        #     pin_memory=True
        # )
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            persistent_workers=True
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,  # Using test set for validation in this example
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )

def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    wandb_logger = WandbLogger(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        log_model='all'
    )
    
    return wandb_logger

def main():
    # Initialize wandb
    wandb_logger = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    # Log dataset info to wandb
    wandb.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "sequence_length": config.training.sequence_length,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    # Initialize model
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
    
    # Initialize trainer
    trainer = pl.Trainer(
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        # gpus=config.training.gpus,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Log confusion matrix
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=data_module.classes.tolist())
    })
    
    # Log classification report
    report = classification_report(
        all_targets, all_preds, 
        target_names=data_module.classes.tolist(),
        output_dict=True
    )
    
    wandb.log({
        "classification_report": report,
        "test_accuracy": accuracy_score(all_targets, all_preds)
    })
    
    # Finish wandb run
    wandb.finish()

if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammad-fleity[0m ([33mmohammad-fleity-lebanese-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import pickle
import warnings
from kaggle_secrets import UserSecretsClient
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')

# Configuration
config = OmegaConf.create({
  "wandb": {
    "project": "DL-NIDS-2--cic-ids-2017",
    "entity": "mohammad-fleity-lebanese-university",
    "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
    "notes": "LSTM model for network intrusion detection with PyTorch Lightning"
  },
  "model": {
    "name": "lstm",
    "hidden_size": 80,
    "num_layers": 2,
    "dropout": 0.3,
    "dense_units": [160],
    "learning_rate": 0.0001,
    "weight_decay": 1e-5
  },
  "training": {
    "sequence_length": 3,
    "batch_size": 64,
    "max_epochs": 8,
    "early_stopping_patience": 5,
    "oversample": True,
    "gpus": 1 if torch.cuda.is_available() else 0
  },
  "data": {
    "raw": "cic_ids_2017.parquet",
    "test_size": 0.2,
    "num_workers": 4
  }
})

class LSTMModel(pl.LightningModule):
    # def __init__(self, input_size, num_classes, config):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.config = config
        self.save_hyperparameters()
        
        # LSTM Layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            # dropout=config.model.dropout if config.model.num_layers > 1 else 0
            dropout=0.5 if config.model.num_layers > 1 else 0.3
        )
        
        # Batch normalization after LSTM
        self.lstm_bn = nn.BatchNorm1d(config.model.hidden_size)
        
        # Dropout layer
        self.dropout = nn.Dropout(config.model.dropout)
        
        # Dense layers
        self.dense_layers = nn.ModuleList()
        prev_units = config.model.hidden_size
        for units in config.model.dense_units:
            self.dense_layers.append(nn.Linear(prev_units, units))
            self.dense_layers.append(nn.BatchNorm1d(units))
            self.dense_layers.append(nn.ReLU())
            self.dense_layers.append(nn.Dropout(config.model.dropout))
            prev_units = units
        
        # Output layer
        self.output = nn.Linear(prev_units, num_classes)
        
        # Loss function
        self.criterion = nn.CrossEntropyLoss()
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Calculate F1 score
        f1 = f1_score(y.cpu().numpy(), preds.cpu().numpy(), average='weighted')
        
        # Log metrics
        self.log('train_loss_epoch', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_acc_epoch', acc*100, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_f1_score', torch.tensor(f1), on_step=False, on_epoch=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Calculate F1 score
        f1 = f1_score(y.cpu().numpy(), preds.cpu().numpy(), average='weighted')
        
        # Log metrics
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc*100, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_f1_score', torch.tensor(f1), on_step=False, on_epoch=True)
        
        return {'val_loss': loss, 'val_acc': acc, 'val_f1_score': f1, 'preds': preds, 'targets': y}
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Calculate F1 score
        f1 = f1_score(y.cpu().numpy(), preds.cpu().numpy(), average='weighted')
        
        # Log metrics
        self.log('test_loss', loss, on_step=False, on_epoch=True)
        self.log('test_acc', acc*100, on_step=False, on_epoch=True)
        self.log('test_f1', torch.tensor(f1), on_step=False, on_epoch=True)
        
        return {'test_loss': loss, 'test_acc': acc, 'test_f1': f1, 'preds': preds, 'targets': y}
    
    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        # We only need the last timestep's output for classification
        lstm_out = lstm_out[:, -1, :]
        
        # Batch norm
        lstm_out = self.lstm_bn(lstm_out)
        
        # Dense layers
        x = lstm_out
        for layer in self.dense_layers:
            x = layer(x)
        
        # Output
        return self.output(x)
    
 
    def configure_optimizers(self):
        optimizer = optim.Adam(
            self.parameters(),
            lr=self.config.model.learning_rate,
            weight_decay=self.config.model.weight_decay
        )
        return optimizer


class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        self.oversample=config.training.oversample
        self.alpha = 0.5 
        
    def prepare_data(self):
        # Load and preprocess data
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        
        # Remove duplicates
        df.drop_duplicates(inplace=True)
        # Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
        # Identify non-numeric columns to exclude
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 'Src Port','Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
        
        # Split data
        train_df, test_df = train_test_split(
            df, test_size=self.config.data.test_size, 
            random_state=42, 
            stratify=df['Label_Num']
        )
        
        # Prepare features and labels
        X_train = train_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_train = train_df['Label_Num']
        X_test = test_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_test = test_df['Label_Num']
        
        # Standardize features
        self.scaler = StandardScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        
        # Create sequences
        self.X_train_seq, self.y_train_seq = self.create_sequences(X_train, y_train)
        self.X_test_seq, self.y_test_seq = self.create_sequences(X_test, y_test)        
    
    def create_sequences(self, X, y):
        X_sequences = []
        y_sequences = []
        
        for i in range(len(X) - self.sequence_length):
            X_sequences.append(X[i:i+self.sequence_length])
            y_sequences.append(y.iloc[i+self.sequence_length-1])
            
        return np.array(X_sequences), np.array(y_sequences)
    
    def setup(self, stage=None):
        # Convert to tensors
        self.train_dataset = TensorDataset(
            torch.FloatTensor(self.X_train_seq),
            torch.LongTensor(self.y_train_seq)
        )
        self.test_dataset = TensorDataset(
            torch.FloatTensor(self.X_test_seq),
            torch.LongTensor(self.y_test_seq)
        )
    
    def train_dataloader(self):
        if self.oversample:
            # Compute sample weights based on class frequency
            y_train_np = self.y_train_seq  # Use the already processed sequences
            class_counts = np.bincount(y_train_np)
            inv_freq = 1.0 / class_counts
            class_weights = inv_freq ** self.alpha
            sample_weights = class_weights[y_train_np]
            
            # sampler = WeightedRandomSampler(
            #     weights=sample_weights,
            #     num_samples=len(sample_weights),  # Maintain full dataset size
            #     replacement=True
            # )
            sampler = WeightedRandomSampler(
                weights=sample_weights,
                num_samples=8000,  # FIXED number of samples per epoch
                replacement=True
            )
        else:
            sampler = RandomSampler(
                self.train_dataset,
                num_samples=8000,  # Same fixed size for consistency
                replacement=True
            )
        # else:
        #     sampler = RandomSampler(
        #         self.train_dataset,
        #         replacement=False
        #     )
        return DataLoader(
            dataset=self.train_dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=self.num_workers,
            persistent_workers=True,
            pin_memory=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,  # Using test set for validation in this example
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )


def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    
    # Initialize the run first
    run = wandb.init(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        config={
            "input_size": None,  # Will be updated later
            "num_classes": None,
            "sequence_length": config.training.sequence_length,
            "train_samples": None,
            "test_samples": None,
            "model_config": dict(config.model),
            "training_config": dict(config.training)
        }
    )
    
    # Then create the logger
    wandb_logger = WandbLogger(
        experiment=run,
        log_model='all'
    )
    
    return wandb_logger, run

def main():
    # Initialize wandb - now returns both logger and run
    wandb_logger, run = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    
    # Update the config with actual values
    run.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    # checkpoint_callback = ModelCheckpoint(
    #     monitor='val_f1',  # Now monitoring F1 score
    #     mode='max',
    #     save_top_k=1,
    #     dirpath='checkpoints',
    #     filename='best_model'
    # )
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
 
    # Initialize trainer
    trainer = pl.Trainer(
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        gradient_clip_val=1.0,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Collect all predictions and targets for final evaluation
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    # Final metrics calculation
    test_acc = accuracy_score(all_targets, all_preds)
    test_f1 = f1_score(all_targets, all_preds, average='weighted')
    
    # Log final test metrics
    wandb.log({
        'test_acc': test_acc,
        'test_f1': test_f1,
        'test_loss': test_results[0]['test_loss']
    })
    
    # Confusion matrix and classification report
    class_names = data_module.classes.tolist()
    
    # Confusion Matrix
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Confusion Matrix"
        )
    })
    
    # Classification Report
    report = classification_report(
        all_targets, all_preds, 
        target_names=class_names,
        output_dict=True
    )
    
    # Create a wandb Table for the classification report
    report_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score", "Support"])
    for class_name in class_names:
        report_table.add_data(
            class_name,
            report[class_name]["precision"],
            report[class_name]["recall"],
            report[class_name]["f1-score"],
            report[class_name]["support"]
        )
    
    # Add weighted averages
    report_table.add_data(
        "Weighted Avg",
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["weighted avg"]["f1-score"],
        report["weighted avg"]["support"]
    )
    
    wandb.log({"classification_report": report_table})
    
    # Finish wandb run
    wandb.finish()
if __name__ == "__main__":
    main()



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import pickle
import warnings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings('ignore')

# Configuration
config = OmegaConf.create({
  "wandb": {
    "project": "DL-NIDS-2--cic-ids-2017",
    "entity": "mohammad-fleity-lebanese-university",
    "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
    "notes": "LSTM model for network intrusion detection with PyTorch Lightning"
  },
  "model": {
    "name": "lstm",
    "hidden_size": 80,
    "num_layers": 1,
    "dropout": 0.3,
    "dense_units": [80],
    "learning_rate": 0.001,
    "weight_decay": 1e-5
  },
  "training": {
    "sequence_length": 3,
    "batch_size": 64,
    "max_epochs": 10,
    "early_stopping_patience": 5,
    "gpus": 1 if torch.cuda.is_available() else 0
  },
  "data": {
    "raw": "cic_ids_2017.parquet",
    "test_size": 0.2,
    "num_workers": 4
  }
})

class LSTMModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.config = config
        self.save_hyperparameters()
        
        # LSTM Layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            dropout=config.model.dropout if config.model.num_layers > 1 else 0
        )
        
        # Batch normalization after LSTM
        self.lstm_bn = nn.BatchNorm1d(config.model.hidden_size)
        
        # Dropout layer
        self.dropout = nn.Dropout(config.model.dropout)
        
        # Dense layers
        self.dense_layers = nn.ModuleList()
        prev_units = config.model.hidden_size
        for units in config.model.dense_units:
            self.dense_layers.append(nn.Linear(prev_units, units))
            self.dense_layers.append(nn.BatchNorm1d(units))
            self.dense_layers.append(nn.ReLU())
            self.dense_layers.append(nn.Dropout(config.model.dropout))
            prev_units = units
        
        # Output layer
        self.output = nn.Linear(prev_units, num_classes)
        
        # Loss function
        self.criterion = nn.CrossEntropyLoss()
    
    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        # We only need the last timestep's output for classification
        lstm_out = lstm_out[:, -1, :]
        
        # Batch norm
        lstm_out = self.lstm_bn(lstm_out)
        
        # Dense layers
        x = lstm_out
        for layer in self.dense_layers:
            x = layer(x)
        
        # Output
        return self.output(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        # Log training metrics
        self.log('train_loss', loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        # Calculate accuracy
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Log validation metrics
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': acc}
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        
        preds = torch.argmax(y_hat, dim=1)
        acc = (preds == y).float().mean()
        
        # Log test metrics
        self.log('test_loss', loss)
        self.log('test_acc', acc)
        return {'test_loss': loss, 'test_acc': acc, 'preds': preds, 'targets': y}
    
    def configure_optimizers(self):
        optimizer = optim.Adam(
            self.parameters(),
            lr=self.config.model.learning_rate,
            weight_decay=self.config.model.weight_decay
        )
        return optimizer

# class NIDSDataModule(pl.LightningDataModule):
#     def __init__(self, config):
#         super().__init__()
#         self.config = config
#         self.batch_size = config.training.batch_size
#         self.sequence_length = config.training.sequence_length
#         self.num_workers = config.data.num_workers
        
#     def prepare_data(self):
#         # Load and preprocess data
#         df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
#         # Clean data
#         df.replace([np.inf, -np.inf], np.nan, inplace=True)
#         df.dropna(inplace=True)
        
#         # Remove duplicates
#         df.drop_duplicates(inplace=True)
        
#         # Encode labels
#         self.label_encoder = LabelEncoder()
#         df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
#         self.classes = self.label_encoder.classes_
        
#         # Split data
#         train_df, test_df = train_test_split(
#             df, test_size=self.config.data.test_size, 
#             random_state=42, 
#             stratify=df['Label_Num']
#         )
        
#         # Prepare features and labels
#         X_train = train_df.drop(['Label', 'Label_Num', 'Timestamp', 'Flow ID'], axis=1)
#         y_train = train_df['Label_Num']
#         X_test = test_df.drop(['Label', 'Label_Num', 'Timestamp', 'Flow ID'], axis=1)
#         y_test = test_df['Label_Num']
        
#         # Standardize features
#         self.scaler = StandardScaler()
#         X_train = self.scaler.fit_transform(X_train)
#         X_test = self.scaler.transform(X_test)
        
#         # Create sequences
#         self.X_train_seq, self.y_train_seq = self.create_sequences(X_train, y_train)
#         self.X_test_seq, self.y_test_seq = self.create_sequences(X_test, y_test)

class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        
    def prepare_data(self):
        # Load and preprocess data
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        
        # Remove duplicates
        df.drop_duplicates(inplace=True)
        # Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
        # Identify non-numeric columns to exclude
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 'Src Port','Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
        
        # Split data
        train_df, test_df = train_test_split(
            df, test_size=self.config.data.test_size, 
            random_state=42, 
            stratify=df['Label_Num']
        )
        print(len(train_df))
        print(len(test_df))
        # max_rows = 90_000
        # if len(train_df) > max_rows:
        #     train_df = train_df.sample(n=max_rows, random_state=42)

        # Prepare features and labels
        X_train = train_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_train = train_df['Label_Num']
        X_test = test_df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y_test = test_df['Label_Num']
        
        # Standardize features
        self.scaler = StandardScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        
        # Create sequences
        self.X_train_seq, self.y_train_seq = self.create_sequences(X_train, y_train)
        self.X_test_seq, self.y_test_seq = self.create_sequences(X_test, y_test)        
    
    def create_sequences(self, X, y):
        X_sequences = []
        y_sequences = []
        
        for i in range(len(X) - self.sequence_length):
            X_sequences.append(X[i:i+self.sequence_length])
            y_sequences.append(y.iloc[i+self.sequence_length-1])
            
        return np.array(X_sequences), np.array(y_sequences)
    
    def setup(self, stage=None):
        # Convert to tensors
        self.train_dataset = TensorDataset(
            torch.FloatTensor(self.X_train_seq),
            torch.LongTensor(self.y_train_seq)
        )
        self.test_dataset = TensorDataset(
            torch.FloatTensor(self.X_test_seq),
            torch.LongTensor(self.y_test_seq)
        )
    
    def train_dataloader(self):
        # return DataLoader(
        #     self.train_dataset,
        #     batch_size=self.batch_size,
        #     shuffle=True,
        #     num_workers=self.num_workers,
        #     pin_memory=True
        # )
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            persistent_workers=True
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,  # Using test set for validation in this example
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )

# def init_wandb():
#     user_secrets = UserSecretsClient()
#     wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
#     wandb.login(key=wandb_api_key)
#     wandb_logger = WandbLogger(
#         project=config.wandb.project,
#         entity=config.wandb.entity,
#         tags=config.wandb.tags,
#         notes=config.wandb.notes,
#         log_model='all'
#     )
    
#     return wandb_logger

def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    
    # Initialize the run first
    run = wandb.init(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        config={
            "input_size": None,  # Will be updated later
            "num_classes": None,
            "sequence_length": config.training.sequence_length,
            "train_samples": None,
            "test_samples": None,
            "model_config": dict(config.model),
            "training_config": dict(config.training)
        }
    )
    
    # Then create the logger
    wandb_logger = WandbLogger(
        experiment=run,
        log_model='all'
    )
    
    return wandb_logger, run

def main():
    # Initialize wandb - now returns both logger and run
    wandb_logger, run = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    
    # Update the config with actual values
    run.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    # Initialize model
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
    
    # Initialize trainer
    trainer = pl.Trainer(
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Collect all predictions and targets
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)
    
    # Enhanced Confusion Matrix Logging
    class_names = data_module.classes.tolist()
    
    # Create a more detailed confusion matrix
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Confusion Matrix",
            normalize=None
        ),
        "normalized_confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Normalized Confusion Matrix",
            normalize="all"
        )
    })
    
    # Log classification report
    report = classification_report(
        all_targets, all_preds, 
        target_names=class_names,
        output_dict=True
    )
    
    # Convert report to wandb.Table for better visualization
    report_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score", "Support"])
    for class_name in class_names:
        report_table.add_data(
            class_name,
            report[class_name]["precision"],
            report[class_name]["recall"],
            report[class_name]["f1-score"],
            report[class_name]["support"]
        )
    
    # Add overall metrics
    report_table.add_data(
        "Weighted Avg",
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["weighted avg"]["f1-score"],
        report["weighted avg"]["support"]
    )
    
    wandb.log({
        "classification_report": report_table,
        "test_accuracy": accuracy_score(all_targets, all_preds),
        "test_precision": report["weighted avg"]["precision"],
        "test_recall": report["weighted avg"]["recall"],
        "test_f1": report["weighted avg"]["f1-score"]
    })
    
    # Save the confusion matrix as an image as well
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(all_targets, all_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    wandb.log({"confusion_matrix_image": wandb.Image(plt)})
    plt.close()
    
    # Finish wandb run
    wandb.finish()

if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammad-fleity[0m ([33mmohammad-fleity-lebanese-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, RandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import warnings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings('ignore')

# Optimized Configuration
config = OmegaConf.create({
    "wandb": {
        "project": "DL-NIDS-2--cic-ids-2017",
        "entity": "mohammad-fleity-lebanese-university",
        "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
        "notes": "Optimized LSTM for network intrusion detection"
    },
    "model": {
        "hidden_size": 128,          # Increased capacity
        "num_layers": 2,             # Deeper network
        "dropout": 0.4,              # Stronger regularization
        "dense_units": [128, 64],    # Better feature extraction
        "learning_rate": 0.0001,     # Slower learning
        "weight_decay": 1e-4         # Stronger L2 regularization
    },
    "training": {
        "sequence_length": 5,        # Longer temporal context
        "batch_size": 128,           # Larger batches
        "max_epochs": 10,            # More training time
        "early_stopping_patience": 7,# More patience
        "oversample": True,          # Class balancing
        "gpus": 1 if torch.cuda.is_available() else 0,
        "train_size": 0.7,           # Proper train/val split
        "val_size": 0.15             # 70/15/15 split
    },
    "data": {
        "raw": "cic_ids_2017.parquet",
        "num_workers": 4
    }
})

class LSTMModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.save_hyperparameters()
        
        # Enhanced LSTM with layer normalization
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            dropout=config.model.dropout if config.model.num_layers > 1 else 0
        )
        
        self.lstm_ln = nn.LayerNorm(config.model.hidden_size)  # Better for sequences
        
        # Improved dense layers
        self.dense = nn.Sequential(
            nn.Linear(config.model.hidden_size, config.model.dense_units[0]),
            nn.LayerNorm(config.model.dense_units[0]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout),
            nn.Linear(config.model.dense_units[0], config.model.dense_units[1]),
            nn.LayerNorm(config.model.dense_units[1]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout)
        )
        
        self.output = nn.Linear(config.model.dense_units[1], num_classes)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Regularization

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Last timestep
        lstm_out = self.lstm_ln(lstm_out)
        features = self.dense(lstm_out)
        return self.output(features)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('test_loss', loss)
        self.log('test_acc', acc)
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.config.model.learning_rate,
                                weight_decay=self.hparams.config.model.weight_decay)
        return optimizer

class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        self.oversample = config.training.oversample
        self.alpha = 0.5

    def prepare_data(self):
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
    
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
    
        # Identify non-numeric columns
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
                                 'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
    
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
    
        # Initialize scaler
        self.scaler = StandardScaler()
    
        # Train/Val/Test split
        train_df, test_df = train_test_split(
            df,
            test_size=1 - self.config.training.train_size,
            random_state=42,
            stratify=df['Label_Num']
        )
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=42,
            stratify=test_df['Label_Num']
        )
        print(len(train_df))
        print(len(test_df))
        # max_rows = 90_000
        # if len(train_df) > max_rows:
        #     train_df = train_df.sample(n=max_rows, random_state=42)

        # Use `fit=True` only for training data
        self.X_train, self.y_train = self._prepare_features(train_df, fit=True)
        self.X_val, self.y_val = self._prepare_features(val_df, fit=False)
        self.X_test, self.y_test = self._prepare_features(test_df, fit=False)

    
    # def prepare_data(self):
    #     df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
    #     # Clean data
    #     df.replace([np.inf, -np.inf], np.nan, inplace=True)
    #     df.dropna(inplace=True)
    #     df.drop_duplicates(inplace=True)
        
    #     # Identify non-numeric columns
    #     self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
    #                            'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
    #     self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
    #     # Encode labels
    #     self.label_encoder = LabelEncoder()
    #     df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
    #     self.classes = self.label_encoder.classes_
    #     self.scaler = StandardScaler()

    #     # Proper train/val/test split
    #     train_df, test_df = train_test_split(
    #         df, 
    #         test_size=1 - self.config.training.train_size,
    #         random_state=42,
    #         stratify=df['Label_Num']
    #     )
    #     val_df, test_df = train_test_split(
    #         test_df,
    #         test_size=0.5,  # Splits remaining 30% into 15% val, 15% test
    #         random_state=42,
    #         stratify=test_df['Label_Num']
    #     )
        
    #     # Process each split
    #     self.X_train, self.y_train = self._prepare_features(train_df)
    #     self.X_val, self.y_val = self._prepare_features(val_df)
    #     self.X_test, self.y_test = self._prepare_features(test_df)
 
    # def _prepare_features(self, df):
    #     X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
    #     y = df['Label_Num']
    #     X = self.scaler.transform(X) if hasattr(self, 'scaler') else self.scaler.fit_transform(X)
    #     return self.create_sequences(X, y)
    
    def _prepare_features(self, df, fit=False):
        X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y = df['Label_Num']
        if fit:
            X = self.scaler.fit_transform(X)
        else:
            X = self.scaler.transform(X)
        return self.create_sequences(X, y)

    def create_sequences(self, X, y):
        sequences = []
        labels = []
        for i in range(len(X) - self.sequence_length):
            sequences.append(X[i:i+self.sequence_length])
            labels.append(y.iloc[i+self.sequence_length-1])
        return np.array(sequences), np.array(labels)
    
    def setup(self, stage=None):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_train.reshape(-1, self.X_train.shape[-1]))
        
        self.X_train = self.scaler.transform(self.X_train.reshape(-1, self.X_train.shape[-1])).reshape(self.X_train.shape)
        self.X_val = self.scaler.transform(self.X_val.reshape(-1, self.X_val.shape[-1])).reshape(self.X_val.shape)
        self.X_test = self.scaler.transform(self.X_test.reshape(-1, self.X_test.shape[-1])).reshape(self.X_test.shape)
        
        self.train_dataset = TensorDataset(torch.FloatTensor(self.X_train), torch.LongTensor(self.y_train))
        self.val_dataset = TensorDataset(torch.FloatTensor(self.X_val), torch.LongTensor(self.y_val))
        self.test_dataset = TensorDataset(torch.FloatTensor(self.X_test), torch.LongTensor(self.y_test))
        print("the model will be trained on: ",len(self.train_dataset)," samples.")
        print("the model will be validated on: ",len(self.val_dataset)," samples.")
        print("the model will be tested on: ",len(self.test_dataset)," samples.")
    
    def train_dataloader(self):
        if self.oversample:
            class_counts = np.bincount(self.y_train)
            weights = 1. / class_counts[self.y_train]
            sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
        else:
            sampler = RandomSampler(self.train_dataset)
            
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=self.num_workers,
            persistent_workers=True,
            pin_memory=True
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )


def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    
    # Initialize the run first
    run = wandb.init(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        config={
            "input_size": None,  # Will be updated later
            "num_classes": None,
            "sequence_length": config.training.sequence_length,
            "train_samples": None,
            "test_samples": None,
            "model_config": dict(config.model),
            "training_config": dict(config.training)
        }
    )
    
    # Then create the logger
    wandb_logger = WandbLogger(
        experiment=run,
        log_model='all'
    )
    
    return wandb_logger, run

def main():
    # Initialize wandb - now returns both logger and run
    wandb_logger, run = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    
    # Update the config with actual values
    run.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    # checkpoint_callback = ModelCheckpoint(
    #     monitor='val_f1',  # Now monitoring F1 score
    #     mode='max',
    #     save_top_k=1,
    #     dirpath='checkpoints',
    #     filename='best_model'
    # )
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
 
    # Initialize trainer
    trainer = pl.Trainer(
        precision=16,
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        gradient_clip_val=1.0,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Collect all predictions and targets for final evaluation
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    # Final metrics calculation
    test_acc = accuracy_score(all_targets, all_preds)
    test_f1 = f1_score(all_targets, all_preds, average='weighted')
    
    # Log final test metrics
    wandb.log({
        'test_acc': test_acc,
        'test_f1': test_f1,
        'test_loss': test_results[0]['test_loss']
    })
    
    # Confusion matrix and classification report
    class_names = data_module.classes.tolist()
    
    # Confusion Matrix
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Confusion Matrix"
        )
    })
    
    # Classification Report
    report = classification_report(
        all_targets, all_preds, 
        target_names=class_names,
        output_dict=True
    )
    
    # Create a wandb Table for the classification report
    report_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score", "Support"])
    for class_name in class_names:
        report_table.add_data(
            class_name,
            report[class_name]["precision"],
            report[class_name]["recall"],
            report[class_name]["f1-score"],
            report[class_name]["support"]
        )
    
    # Add weighted averages
    report_table.add_data(
        "Weighted Avg",
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["weighted avg"]["f1-score"],
        report["weighted avg"]["support"]
    )
    
    wandb.log({"classification_report": report_table})
    
    # Finish wandb run
    wandb.finish()
if __name__ == "__main__":
    main()

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, RandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import warnings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings('ignore')

# Optimized Configuration
config = OmegaConf.create({
    "wandb": {
        "project": "DL-NIDS-2--cic-ids-2017",
        "entity": "mohammad-fleity-lebanese-university",
        "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
        "notes": "Optimized LSTM for network intrusion detection"
    },
    "model": {
        "hidden_size": 128,          # Increased capacity
        "num_layers": 2,             # Deeper network
        "dropout": 0.4,              # Stronger regularization
        "dense_units": [128, 64],    # Better feature extraction
        "learning_rate": 0.0001,     # Slower learning
        "weight_decay": 1e-4         # Stronger L2 regularization
    },
    "training": {
        "sequence_length": 5,        # Longer temporal context
        "batch_size": 128,           # Larger batches
        "max_epochs": 7,            # More training time
        "early_stopping_patience": 7,# More patience
        "oversample": True,          # Class balancing
        "gpus": 1 if torch.cuda.is_available() else 0,
        "train_size": 0.7,           # Proper train/val split
        "val_size": 0.15             # 70/15/15 split
    },
    "data": {
        "raw": "cic_ids_2017.parquet",
        "num_workers": 4
    }
})

class LSTMModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.save_hyperparameters()
        
        # Enhanced LSTM with layer normalization
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            dropout=config.model.dropout if config.model.num_layers > 1 else 0
        )
        
        self.lstm_ln = nn.LayerNorm(config.model.hidden_size)  # Better for sequences
        
        # Improved dense layers
        self.dense = nn.Sequential(
            nn.Linear(config.model.hidden_size, config.model.dense_units[0]),
            nn.LayerNorm(config.model.dense_units[0]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout),
            nn.Linear(config.model.dense_units[0], config.model.dense_units[1]),
            nn.LayerNorm(config.model.dense_units[1]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout)
        )
        
        self.output = nn.Linear(config.model.dense_units[1], num_classes)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Regularization

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Last timestep
        lstm_out = self.lstm_ln(lstm_out)
        features = self.dense(lstm_out)
        return self.output(features)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('train_loss_epoch', loss, prog_bar=True)
        self.log('train_acc_epoch', acc*100, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc*100, prog_bar=True)
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('test_loss', loss)
        self.log('test_acc', acc*100)
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.config.model.learning_rate,
                                weight_decay=self.hparams.config.model.weight_decay)
        return optimizer

class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        self.oversample = config.training.oversample
        self.alpha = 0.5

    def prepare_data(self):
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
    
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
    
        # Identify non-numeric columns
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
                                 'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
    
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
    
        # Initialize scaler
        self.scaler = StandardScaler()
    
        # Train/Val/Test split
        train_df, test_df = train_test_split(
            df,
            test_size=1 - self.config.training.train_size,
            random_state=42,
            stratify=df['Label_Num']
        )
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=42,
            stratify=test_df['Label_Num']
        )
        print(len(train_df))
        print(len(test_df))
        # max_rows = 90_000
        # if len(train_df) > max_rows:
        #     train_df = train_df.sample(n=max_rows, random_state=42)

        # Use `fit=True` only for training data
        self.X_train, self.y_train = self._prepare_features(train_df, fit=True)
        self.X_val, self.y_val = self._prepare_features(val_df, fit=False)
        self.X_test, self.y_test = self._prepare_features(test_df, fit=False)

    
    # def prepare_data(self):
    #     df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
    #     # Clean data
    #     df.replace([np.inf, -np.inf], np.nan, inplace=True)
    #     df.dropna(inplace=True)
    #     df.drop_duplicates(inplace=True)
        
    #     # Identify non-numeric columns
    #     self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
    #                            'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
    #     self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
    #     # Encode labels
    #     self.label_encoder = LabelEncoder()
    #     df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
    #     self.classes = self.label_encoder.classes_
    #     self.scaler = StandardScaler()

    #     # Proper train/val/test split
    #     train_df, test_df = train_test_split(
    #         df, 
    #         test_size=1 - self.config.training.train_size,
    #         random_state=42,
    #         stratify=df['Label_Num']
    #     )
    #     val_df, test_df = train_test_split(
    #         test_df,
    #         test_size=0.5,  # Splits remaining 30% into 15% val, 15% test
    #         random_state=42,
    #         stratify=test_df['Label_Num']
    #     )
        
    #     # Process each split
    #     self.X_train, self.y_train = self._prepare_features(train_df)
    #     self.X_val, self.y_val = self._prepare_features(val_df)
    #     self.X_test, self.y_test = self._prepare_features(test_df)
 
    # def _prepare_features(self, df):
    #     X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
    #     y = df['Label_Num']
    #     X = self.scaler.transform(X) if hasattr(self, 'scaler') else self.scaler.fit_transform(X)
    #     return self.create_sequences(X, y)
    
    def _prepare_features(self, df, fit=False):
        X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y = df['Label_Num']
        if fit:
            X = self.scaler.fit_transform(X)
        else:
            X = self.scaler.transform(X)
        return self.create_sequences(X, y)

    def create_sequences(self, X, y):
        sequences = []
        labels = []
        for i in range(len(X) - self.sequence_length):
            sequences.append(X[i:i+self.sequence_length])
            labels.append(y.iloc[i+self.sequence_length-1])
        return np.array(sequences), np.array(labels)
    
    def setup(self, stage=None):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_train.reshape(-1, self.X_train.shape[-1]))
        
        self.X_train = self.scaler.transform(self.X_train.reshape(-1, self.X_train.shape[-1])).reshape(self.X_train.shape)
        self.X_val = self.scaler.transform(self.X_val.reshape(-1, self.X_val.shape[-1])).reshape(self.X_val.shape)
        self.X_test = self.scaler.transform(self.X_test.reshape(-1, self.X_test.shape[-1])).reshape(self.X_test.shape)
        
        self.train_dataset = TensorDataset(torch.FloatTensor(self.X_train), torch.LongTensor(self.y_train))
        self.val_dataset = TensorDataset(torch.FloatTensor(self.X_val), torch.LongTensor(self.y_val))
        self.test_dataset = TensorDataset(torch.FloatTensor(self.X_test), torch.LongTensor(self.y_test))
        print("the model will be trained on: ",len(self.train_dataset)," samples.")
        print("the model will be validated on: ",len(self.val_dataset)," samples.")
        print("the model will be tested on: ",len(self.test_dataset)," samples.")
    
    def train_dataloader(self):
        if self.oversample:
            class_counts = np.bincount(self.y_train)
            weights = 1. / class_counts[self.y_train]
            sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
        else:
            sampler = RandomSampler(self.train_dataset)
            
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=self.num_workers,
            persistent_workers=True,
            pin_memory=True
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )


def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    
    # Initialize the run first
    run = wandb.init(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        config={
            "input_size": None,  # Will be updated later
            "num_classes": None,
            "sequence_length": config.training.sequence_length,
            "train_samples": None,
            "test_samples": None,
            "model_config": dict(config.model),
            "training_config": dict(config.training)
        }
    )
    
    # Then create the logger
    wandb_logger = WandbLogger(
        experiment=run,
        log_model='all'
    )
    
    return wandb_logger, run

def main():
    # Initialize wandb - now returns both logger and run
    wandb_logger, run = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    
    # Update the config with actual values
    run.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    # checkpoint_callback = ModelCheckpoint(
    #     monitor='val_f1',  # Now monitoring F1 score
    #     mode='max',
    #     save_top_k=1,
    #     dirpath='checkpoints',
    #     filename='best_model'
    # )
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
 
    # Initialize trainer
    trainer = pl.Trainer(
        precision=16,
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        gradient_clip_val=1.0,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Collect all predictions and targets for final evaluation
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    # Final metrics calculation
    test_acc = accuracy_score(all_targets, all_preds)
    test_f1 = f1_score(all_targets, all_preds, average='weighted')
    
    # Log final test metrics
    wandb.log({
        'test_acc': test_acc,
        'test_f1': test_f1,
        'test_loss': test_results[0]['test_loss']
    })
    
    # Confusion matrix and classification report
    class_names = data_module.classes.tolist()
    
    # Confusion Matrix
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Confusion Matrix"
        )
    })
    
    # Classification Report
    report = classification_report(
        all_targets, all_preds, 
        target_names=class_names,
        output_dict=True
    )
    
    # Create a wandb Table for the classification report
    report_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score", "Support"])
    for class_name in class_names:
        report_table.add_data(
            class_name,
            report[class_name]["precision"],
            report[class_name]["recall"],
            report[class_name]["f1-score"],
            report[class_name]["support"]
        )
    
    # Add weighted averages
    report_table.add_data(
        "Weighted Avg",
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["weighted avg"]["f1-score"],
        report["weighted avg"]["support"]
    )
    
    wandb.log({"classification_report": report_table})
    
    # Finish wandb run
    wandb.finish()
if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammad-fleity[0m ([33mmohammad-fleity-lebanese-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


1979373
424152
the model will be trained on:  1979368  samples.
the model will be validated on:  424147  samples.
the model will be tested on:  424147  samples.
1979373
424152
the model will be trained on:  1979368  samples.
the model will be validated on:  424147  samples.
the model will be tested on:  424147  samples.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

<h2>second dataset</h2>

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, RandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import wandb
from omegaconf import OmegaConf
import os
import warnings
from kaggle_secrets import UserSecretsClient

warnings.filterwarnings('ignore')

# Optimized Configuration
config = OmegaConf.create({
    "wandb": {
        "project": "DL-NIDS-2--cic-ton-iot",
        "entity": "mohammad-fleity-lebanese-university",
        "tags": ["LSTM", "CIC-IDS-2017", "PyTorch"],
        "notes": "Optimized LSTM for network intrusion detection"
    },
    "model": {
        "hidden_size": 128,          # Increased capacity
        "num_layers": 2,             # Deeper network
        "dropout": 0.4,              # Stronger regularization
        "dense_units": [128, 64],    # Better feature extraction
        "learning_rate": 0.0001,     # Slower learning
        "weight_decay": 1e-4         # Stronger L2 regularization
    },
    "training": {
        "sequence_length": 5,        # Longer temporal context
        "batch_size": 128,           # Larger batches
        "max_epochs": 25,            # More training time
        "early_stopping_patience": 7,# More patience
        "oversample": True,          # Class balancing
        "gpus": 1 if torch.cuda.is_available() else 0,
        "train_size": 0.7,           # Proper train/val split
        "val_size": 0.15             # 70/15/15 split
    },
    "data": {
        "raw": "cic_ton_iot.parquet",
        "num_workers": 4
    }
})

class LSTMModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, config):
        super().__init__()
        self.save_hyperparameters()
        
        # Enhanced LSTM with layer normalization
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=config.model.hidden_size,
            num_layers=config.model.num_layers,
            batch_first=True,
            dropout=config.model.dropout if config.model.num_layers > 1 else 0
        )
        
        self.lstm_ln = nn.LayerNorm(config.model.hidden_size)  # Better for sequences
        
        # Improved dense layers
        self.dense = nn.Sequential(
            nn.Linear(config.model.hidden_size, config.model.dense_units[0]),
            nn.LayerNorm(config.model.dense_units[0]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout),
            nn.Linear(config.model.dense_units[0], config.model.dense_units[1]),
            nn.LayerNorm(config.model.dense_units[1]),
            nn.ReLU(),
            nn.Dropout(config.model.dropout)
        )
        
        self.output = nn.Linear(config.model.dense_units[1], num_classes)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Regularization

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Last timestep
        lstm_out = self.lstm_ln(lstm_out)
        features = self.dense(lstm_out)
        return self.output(features)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('train_loss_epoch', loss, prog_bar=True)
        self.log('train_acc_epoch', acc*100, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc*100, prog_bar=True)
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('test_loss', loss)
        self.log('test_acc', acc*100)
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.config.model.learning_rate,
                                weight_decay=self.hparams.config.model.weight_decay)
        return optimizer

class NIDSDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.batch_size = config.training.batch_size
        self.sequence_length = config.training.sequence_length
        self.num_workers = config.data.num_workers
        self.oversample = config.training.oversample
        self.alpha = 0.5

    def prepare_data(self):
        # /kaggle/input/cic-ton-iot-parquet
        df = pd.read_parquet(os.path.join('/kaggle/input/cic-ton-iot-parquet', self.config.data.raw))
    
        # Clean data
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
    
        # Identify non-numeric columns
        self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
                                 'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
        self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
    
        # Encode labels
        self.label_encoder = LabelEncoder()
        df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
        self.classes = self.label_encoder.classes_
    
        # Initialize scaler
        self.scaler = StandardScaler()
    
        # Train/Val/Test split
        train_df, test_df = train_test_split(
            df,
            test_size=1 - self.config.training.train_size,
            random_state=42,
            stratify=df['Label_Num']
        )
        val_df, test_df = train_test_split(
            test_df,
            test_size=0.5,
            random_state=42,
            stratify=test_df['Label_Num']
        )
        print(len(train_df))
        print(len(test_df))
        train_max_rows = 300000
        if len(train_df) > train_max_rows:
            train_df = train_df.sample(n=train_max_rows, random_state=42)
        val_max_rows = 45000
        if len(val_df) > val_max_rows:
            val_df = val_df.sample(n=val_max_rows, random_state=42)

        # max_rows = 90_000
        # if len(train_df) > max_rows:
        #     train_df = train_df.sample(n=max_rows, random_state=42)

        # Use `fit=True` only for training data
        self.X_train, self.y_train = self._prepare_features(train_df, fit=True)
        self.X_val, self.y_val = self._prepare_features(val_df, fit=False)
        self.X_test, self.y_test = self._prepare_features(test_df, fit=False)

    
    # def prepare_data(self):
    #     df = pd.read_parquet(os.path.join('/kaggle/input/cic-ids-2017-parquet', self.config.data.raw))
        
    #     # Clean data
    #     df.replace([np.inf, -np.inf], np.nan, inplace=True)
    #     df.dropna(inplace=True)
    #     df.drop_duplicates(inplace=True)
        
    #     # Identify non-numeric columns
    #     self.non_numeric_cols = ['Label', 'Timestamp', 'Flow ID', 'Src IP', 
    #                            'Src Port', 'Attack', 'Dst IP', 'Dst Port', 'Protocol']
    #     self.non_numeric_cols = [col for col in self.non_numeric_cols if col in df.columns]
        
    #     # Encode labels
    #     self.label_encoder = LabelEncoder()
    #     df['Label_Num'] = self.label_encoder.fit_transform(df['Label'])
    #     self.classes = self.label_encoder.classes_
    #     self.scaler = StandardScaler()

    #     # Proper train/val/test split
    #     train_df, test_df = train_test_split(
    #         df, 
    #         test_size=1 - self.config.training.train_size,
    #         random_state=42,
    #         stratify=df['Label_Num']
    #     )
    #     val_df, test_df = train_test_split(
    #         test_df,
    #         test_size=0.5,  # Splits remaining 30% into 15% val, 15% test
    #         random_state=42,
    #         stratify=test_df['Label_Num']
    #     )
        
    #     # Process each split
    #     self.X_train, self.y_train = self._prepare_features(train_df)
    #     self.X_val, self.y_val = self._prepare_features(val_df)
    #     self.X_test, self.y_test = self._prepare_features(test_df)
 
    # def _prepare_features(self, df):
    #     X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
    #     y = df['Label_Num']
    #     X = self.scaler.transform(X) if hasattr(self, 'scaler') else self.scaler.fit_transform(X)
    #     return self.create_sequences(X, y)
    
    def _prepare_features(self, df, fit=False):
        X = df.drop(['Label_Num'] + self.non_numeric_cols, axis=1)
        y = df['Label_Num']
        if fit:
            X = self.scaler.fit_transform(X)
        else:
            X = self.scaler.transform(X)
        return self.create_sequences(X, y)

    def create_sequences(self, X, y):
        sequences = []
        labels = []
        for i in range(len(X) - self.sequence_length):
            sequences.append(X[i:i+self.sequence_length])
            labels.append(y.iloc[i+self.sequence_length-1])
        return np.array(sequences), np.array(labels)
    
    def setup(self, stage=None):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_train.reshape(-1, self.X_train.shape[-1]))
        
        self.X_train = self.scaler.transform(self.X_train.reshape(-1, self.X_train.shape[-1])).reshape(self.X_train.shape)
        self.X_val = self.scaler.transform(self.X_val.reshape(-1, self.X_val.shape[-1])).reshape(self.X_val.shape)
        self.X_test = self.scaler.transform(self.X_test.reshape(-1, self.X_test.shape[-1])).reshape(self.X_test.shape)
        
        self.train_dataset = TensorDataset(torch.FloatTensor(self.X_train), torch.LongTensor(self.y_train))
        self.val_dataset = TensorDataset(torch.FloatTensor(self.X_val), torch.LongTensor(self.y_val))
        self.test_dataset = TensorDataset(torch.FloatTensor(self.X_test), torch.LongTensor(self.y_test))
        print("the model will be trained on: ",len(self.train_dataset)," samples.")
        print("the model will be validated on: ",len(self.val_dataset)," samples.")
        print("the model will be tested on: ",len(self.test_dataset)," samples.")
    
    def train_dataloader(self):
        if self.oversample:
            class_counts = np.bincount(self.y_train)
            weights = 1. / class_counts[self.y_train]
            sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
        else:
            sampler = RandomSampler(self.train_dataset)
            
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=self.num_workers,
            persistent_workers=True,
            pin_memory=True
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True
        )


def init_wandb():
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("mohammad_wandb_secret")
    wandb.login(key=wandb_api_key)
    
    # Initialize the run first
    run = wandb.init(
        project=config.wandb.project,
        entity=config.wandb.entity,
        tags=config.wandb.tags,
        notes=config.wandb.notes,
        config={
            "input_size": None,  # Will be updated later
            "num_classes": None,
            "sequence_length": config.training.sequence_length,
            "train_samples": None,
            "test_samples": None,
            "model_config": dict(config.model),
            "training_config": dict(config.training)
        }
    )
    
    # Then create the logger
    wandb_logger = WandbLogger(
        experiment=run,
        log_model='all'
    )
    
    return wandb_logger, run

def main():
    # Initialize wandb - now returns both logger and run
    wandb_logger, run = init_wandb()
    
    # Initialize data module
    data_module = NIDSDataModule(config)
    data_module.prepare_data()
    data_module.setup()
    
    # Get input size from data
    sample_x, _ = next(iter(data_module.train_dataloader()))
    input_size = sample_x.shape[2]
    num_classes = len(data_module.classes)
    
    # Update the config with actual values
    run.config.update({
        "input_size": input_size,
        "num_classes": num_classes,
        "train_samples": len(data_module.train_dataset),
        "test_samples": len(data_module.test_dataset)
    })
    
    model = LSTMModel(input_size, num_classes, config)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.training.early_stopping_patience,
        mode='min'
    )
    
    # checkpoint_callback = ModelCheckpoint(
    #     monitor='val_f1',  # Now monitoring F1 score
    #     mode='max',
    #     save_top_k=1,
    #     dirpath='checkpoints',
    #     filename='best_model'
    # )
    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=1,
        dirpath='checkpoints',
        filename='best_model'
    )
 
    # Initialize trainer
    trainer = pl.Trainer(
        precision=16,
        logger=wandb_logger,
        max_epochs=config.training.max_epochs,
        callbacks=[early_stopping, checkpoint_callback],
        deterministic=True,
        gradient_clip_val=1.0,
        enable_progress_bar=True,
        log_every_n_steps=1000
    )
    
    # Train model
    trainer.fit(model, datamodule=data_module)
    
    # Test model
    test_results = trainer.test(model, datamodule=data_module)
    
    # Collect all predictions and targets for final evaluation
    test_loader = data_module.test_dataloader()
    all_preds = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            preds = torch.argmax(y_hat, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())
    
    # Final metrics calculation
    test_acc = accuracy_score(all_targets, all_preds)
    test_f1 = f1_score(all_targets, all_preds, average='weighted')
    
    # Log final test metrics
    wandb.log({
        'test_acc': test_acc,
        'test_f1': test_f1,
        'test_loss': test_results[0]['test_loss']
    })
    
    # Confusion matrix and classification report
    class_names = data_module.classes.tolist()
    
    # Confusion Matrix
    wandb.log({
        "confusion_matrix": wandb.plot.confusion_matrix(
            y_true=all_targets,
            preds=all_preds,
            class_names=class_names,
            title="Confusion Matrix"
        )
    })
    
    # Classification Report
    report = classification_report(
        all_targets, all_preds, 
        target_names=class_names,
        output_dict=True
    )
    
    # Create a wandb Table for the classification report
    report_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score", "Support"])
    for class_name in class_names:
        report_table.add_data(
            class_name,
            report[class_name]["precision"],
            report[class_name]["recall"],
            report[class_name]["f1-score"],
            report[class_name]["support"]
        )
    
    # Add weighted averages
    report_table.add_data(
        "Weighted Avg",
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["weighted avg"]["f1-score"],
        report["weighted avg"]["support"]
    )
    
    wandb.log({"classification_report": report_table})
    
    # Finish wandb run
    wandb.finish()
if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammad-fleity[0m ([33mmohammad-fleity-lebanese-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


3745408
802588
the model will be trained on:  299995  samples.
the model will be validated on:  44995  samples.
the model will be tested on:  802583  samples.
3745408
802588
the model will be trained on:  299995  samples.
the model will be validated on:  44995  samples.
the model will be tested on:  802583  samples.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

3745408
802588
the model will be trained on:  299995  samples.
the model will be validated on:  44995  samples.
the model will be tested on:  802583  samples.


Testing: |          | 0/? [00:00<?, ?it/s]

TypeError: Data row contained incompatible types:
{'Class': 'Weighted Avg', 'Precision': 0.9803419318795807, 'Recall': 0.9803135625848043, 'F1-Score': 0.9803077994903139, 'Support': 802583} of type {'Class': String, 'Precision': Number, 'Recall': Number, 'F1-Score': Number, 'Support': Number} is not assignable to {'Class': None or Number, 'Precision': None or Number, 'Recall': None or Number, 'F1-Score': None or Number, 'Support': None or Number}
Key 'Class':
	String not assignable to None or Number
		String not assignable to None
	and
		String not assignable to Number