In [47]:
import os
import numpy as np
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
hf_token = os.getenv("HF_TOKEN")

In [49]:
df_xsum = pd.read_parquet("gemma_att_kl_0_7000_df.parquet")
df_cnn = pd.read_parquet("cnn_att_kl_0_1000_df.parquet")

In [50]:
df_xsum.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', 'label', 'dataset'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

: 

In [None]:
datasets = [
    {
        "train_name": "xsum",
        "train": df_xsum,
        "test": df_cnn
    },
    {
        "train_name": "cnn",
        "train": df_cnn,
        "test": df_xsum
    }
]

results = []

for dataset in datasets:
    # kfold validation over the dataset
    df_train = dataset['train']
    df_test = dataset['test']

    X_train = df_train.drop(columns=['label', 'dataset'])
    y_train = df_train['label']

    X_test = df_test.drop(columns=['label', 'dataset'])
    y_test = df_test['label']

    models = {
        'LogisticRegression': LogisticRegression(max_iter=10000, class_weight='balanced', random_state=42),
        'LGBMClassifier': LGBMClassifier(
            n_estimators=25,
            learning_rate=0.005,
            max_depth=5,
            num_leaves=7,
            class_weight='balanced',
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            n_jobs=-1,
            silent=True,
            verbose=-1
        )
    }

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]


        results.append({
            'dataset': dataset['train_name'],
            'model': model_name,
            'train_auc': roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
            'roc_auc': roc_auc_score(y_test, y_pred_proba)
        })

In [None]:

df_results = pd.DataFrame(results)
df_results

Unnamed: 0,dataset,model,train_auc,roc_auc
0,xsum,LogisticRegression,0.709079,0.637637
1,xsum,LGBMClassifier,0.631963,0.587371
2,cnn,LogisticRegression,0.754208,0.603219
3,cnn,LGBMClassifier,0.769693,0.544042


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import lightning as L
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchmetrics.classification import AUROC
from lightning.pytorch.callbacks import EarlyStopping

# 1. Create a Synthetic Dataset
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data.iloc[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


# 2. Define the LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        x = x.unsqueeze(1)  # (batch_size, seq_len, input_dim)
        _, (hn, _) = self.lstm(x)  # hn is (num_layers, batch_size, hidden_dim)
        out = self.fc(hn[-1])  # Use the last layer's hidden state
        return out

# 3. Create the Lightning Module
class LSTMClassifier(L.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim, lr=5e-4, num_layers=1):
        super().__init__()
        self.model = LSTMModel(input_dim, hidden_dim, output_dim, num_layers=num_layers)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr

        # AUROC metric for binary classification
        self.train_auc = AUROC(task="binary")
        self.val_auc = AUROC(task="binary")

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        data, labels = batch
        preds = self(data)
        loss = self.criterion(preds, labels)
        # Calculate AUC during training
        prob = torch.softmax(preds, dim=1)[:, 1]  # Take probabilities for class 1
        auc = self.train_auc(prob, labels)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_auc", auc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        data, labels = batch
        preds = self(data)
        loss = self.criterion(preds, labels)
        # Calculate AUC during validation
        prob = torch.softmax(preds, dim=1)[:, 1]  # Take probabilities for class 1
        auc = self.val_auc(prob, labels)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_auc", auc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)




In [None]:
X_train = df_cnn.drop(columns=['label', 'dataset'])
y_train = df_cnn['label']

X_val = df_xsum.drop(columns=['label', 'dataset'])
y_val = df_xsum['label']

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# 4. Train the Model
input_dim = X_train.shape[1]
hidden_dim = 64
output_dim = 2


model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, lr=1e-5, num_layers=2)

# 1. Set up EarlyStopping callback
early_stop_callback = EarlyStopping(
    monitor="val_loss",  # The metric to monitor
    patience=4,          # Number of epochs with no improvement after which training will stop
    verbose=True,        # Display a message when stopping
    mode="min",          # Minimize the monitored metric (for loss)
)

# 2. Initialize the Trainer with the early stopping callback
trainer = L.Trainer(
    max_epochs=100,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    log_every_n_steps=10,
    callbacks=[early_stop_callback],  # Add the early stopping callback here
)

# 3. Start the training
trainer.fit(model, train_loader, val_loader)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | LSTMModel        | 60.8 K | train
1 | criterion | CrossEntropyLoss | 0      | train
2 | train_auc | BinaryAUROC      | 0      | train
3 | val_auc   | BinaryAUROC      | 0      | train
-------------------------------------------------------
60.8 K    Trainable params
0         Non-trainable params
60.8 K    Total params
0.243     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/net/tscratch/people/plgkonkie311/miniconda3/envs/hallu/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


                                                                           

/net/tscratch/people/plgkonkie311/miniconda3/envs/hallu/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/244 [00:00<?, ?it/s] 



Epoch 0: 100%|██████████| 244/244 [00:03<00:00, 75.17it/s, v_num=23, val_loss=0.644, val_auc=0.543, train_loss=0.667, train_auc=0.291]

Metric val_loss improved. New best score: 0.644


Epoch 1: 100%|██████████| 244/244 [00:05<00:00, 43.56it/s, v_num=23, val_loss=0.544, val_auc=0.510, train_loss=0.578, train_auc=0.337]

Metric val_loss improved by 0.100 >= min_delta = 0.0. New best score: 0.544


Epoch 2: 100%|██████████| 244/244 [00:07<00:00, 33.64it/s, v_num=23, val_loss=0.446, val_auc=0.523, train_loss=0.436, train_auc=0.317]

Metric val_loss improved by 0.098 >= min_delta = 0.0. New best score: 0.446


Epoch 3: 100%|██████████| 244/244 [00:09<00:00, 26.26it/s, v_num=23, val_loss=0.389, val_auc=0.523, train_loss=0.333, train_auc=0.294]

Metric val_loss improved by 0.057 >= min_delta = 0.0. New best score: 0.389


Epoch 4: 100%|██████████| 244/244 [00:11<00:00, 21.87it/s, v_num=23, val_loss=0.360, val_auc=0.521, train_loss=0.263, train_auc=0.329]

Metric val_loss improved by 0.029 >= min_delta = 0.0. New best score: 0.360


Epoch 5: 100%|██████████| 244/244 [00:13<00:00, 18.37it/s, v_num=23, val_loss=0.346, val_auc=0.522, train_loss=0.222, train_auc=0.318]

Metric val_loss improved by 0.015 >= min_delta = 0.0. New best score: 0.346


Epoch 6: 100%|██████████| 244/244 [00:14<00:00, 16.53it/s, v_num=23, val_loss=0.339, val_auc=0.521, train_loss=0.194, train_auc=0.336]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.339


Epoch 7: 100%|██████████| 244/244 [00:17<00:00, 14.19it/s, v_num=23, val_loss=0.337, val_auc=0.523, train_loss=0.172, train_auc=0.325]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.337


Epoch 11: 100%|██████████| 244/244 [00:26<00:00,  9.32it/s, v_num=23, val_loss=0.358, val_auc=0.525, train_loss=0.133, train_auc=0.318]

Monitored metric val_loss did not improve in the last 4 records. Best score: 0.337. Signaling Trainer to stop.


Epoch 11: 100%|██████████| 244/244 [00:26<00:00,  9.31it/s, v_num=23, val_loss=0.358, val_auc=0.525, train_loss=0.133, train_auc=0.318]
