In [9]:
import os
import numpy as np
from dotenv import load_dotenv
import pandas as pd


load_dotenv()
hf_token = os.getenv("HF_TOKEN")

In [10]:
df_gemma = pd.read_parquet("../data/gemma_att_kl_0_7000_df.parquet")
df_cnn = pd.read_parquet("../data/cnn_att_kl_0_1000_df.parquet")

In [11]:
# df_gemma = pd.DataFrame()

# for file in os.listdir("./"):
#     if file.startswith("gemma_att_diff_mean") and file.endswith(".parquet"):
#         df_gemma = pd.concat([df_gemma, pd.read_parquet(f"{file}")])

# df_gemma.to_parquet("data/gemma_att_diff_mean_df.parquet")
# df_gemma.shape

In [12]:
# df_cnn = pd.DataFrame()

# for file in os.listdir("./"):
#     if file.startswith("cnn_att_diff_mean") and file.endswith(".parquet"):
#         df_cnn = pd.concat([df_cnn, pd.read_parquet(f"{file}")])

# df_cnn.to_parquet("data/cnn_att_diff_mean_df.parquet")
# df_cnn.shape

In [13]:
# from ydata_profiling import ProfileReport

# df_gemma_eda = df_gemma.drop(['dataset'], axis=1)
# # Create a profile report
# profile = ProfileReport(
#     df_gemma_eda, 
#     title="Gemma DataFrame Profiling Report",
#     correlations={
#         "chi_squared": {"calculate": False},
#     }

# )
# # To display the report in a Jupyter Notebook
# profile.to_notebook_iframe()

In [14]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

In [15]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import IsolationForest
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from imblearn.under_sampling import RandomUnderSampler


datasets = [
    # {
    #     "train_name": "xsum_cnn",
    #     "train": df_xsum,
    #     "test": df_cnn
    # },
    # {
    #     "train_name": "cnn_xsum",
    #     "train": df_cnn,
    #     "test": df_xsum
    # },
    # {
    #     "train_name": "gemma_xsum",
    #     "train": df_gemma,
    #     "test": df_xsum
    # },
    {
        "train_name": "gemma_cnn",
        "train": df_gemma,
        "test": df_cnn
    },
    {
        "train_name": "cnn_gemma",
        "train": df_cnn,
        "test": df_gemma
    }
]

results = []

for dataset in datasets:
    # kfold validation over the dataset
    df_train = dataset['train']
    df_test = dataset['test']

    X_train = df_train.drop(columns=['label', 'dataset'])
    y_train = df_train['label']

    X_test = df_test.drop(columns=['label', 'dataset'])
    y_test = df_test['label']

    models = {
        'LogisticRegression': LogisticRegression(max_iter=10000, class_weight='balanced', random_state=42),
        # 'LGBMClassifier': LGBMClassifier(
        #     n_estimators=50,
        #     learning_rate=0.001,
        #     max_depth=10,
        #     num_leaves=15,
        #     class_weight='balanced',
        #     reg_alpha=0.1,
        #     reg_lambda=0.1,
        #     random_state=42,
        #     n_jobs=-1,
        #     silent=True,
        #     verbose=-1
        # )
    }

    for model_name, model in models.items():

        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Scale the data
            ('undersampler', RandomUnderSampler(random_state=42)),  # Undersample the majority class
            ('model', model)  # Fit the model
        ])
        pipeline.fit(X_train, y_train)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]


        results.append({
            'dataset': dataset['train_name'],
            'model': model_name,
            'train_auc': roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1]),
            'roc_auc': roc_auc_score(y_test, y_pred_proba)
        })

In [16]:
from golemai.ml.wandb_trainer import WandbTrainer

trainer = WandbTrainer(
    project_name="hallu-sklearn",
)


trainer.train_model_and_evaluate_kfold(
    pipeline,
    df_gemma,
    df_cnn,
    description="gemma_cnn",
)


Train datasets: dataset      label
bioask       0        4098
             1         236
hotpotqa_en  0        2710
             1         411
hotpotqa_pl  0        1927
             1         281
nq           0        7058
             1         972
polqa        0        1492
             1         377
poquad_v2    0        5822
             1         425
dtype: int64
Test datasets: dataset  label
cnndm    0        7586
         1         207
dtype: int64


0,1
test_auc,▁
test_auc_std,▁
test_auprc,▁
test_auprc_std,▁
train_auc,▁
train_auc_std,▁
train_auprc,▁
train_auprc_std,▁
validation_auc,▁
validation_auc_std,▁

0,1
test_auc,0.46127
test_auc_std,0.01079
test_auprc,0.02433
test_auprc_std,0.00069
train_auc,0.73945
train_auc_std,0.00493
train_auprc,0.25718
train_auprc_std,0.00705
validation_auc,0.73327
validation_auc_std,0.02014


{'train_auc': 0.7394528113731648,
 'train_auprc': 0.2571783988492045,
 'train_auc_std': 0.0049277856751590915,
 'train_auprc_std': 0.007053974685373899,
 'validation_auc': 0.7332722181974926,
 'validation_auprc': 0.25436180868907865,
 'validation_auc_std': 0.020140528786518636,
 'validation_auprc_std': 0.024595163860917705,
 'test_auc': 0.4612734365746206,
 'test_auprc': 0.024332753137335943,
 'test_auc_std': 0.010790846190100635,
 'test_auprc_std': 0.00068860373097293}

In [None]:

df_results = pd.DataFrame(results)
df_results

NameError: name 'pd' is not defined

In [90]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import lightning as L
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchmetrics.classification import AUROC
from lightning.pytorch.callbacks import EarlyStopping

# 1. Create a Synthetic Dataset
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


# 2. Define the LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        x = x.unsqueeze(1)  # (batch_size, seq_len, input_dim)
        _, (hn, _) = self.lstm(x)  # hn is (num_layers, batch_size, hidden_dim)
        out = self.fc(hn[-1])  # Use the last layer's hidden state
        return out

# 3. Create the Lightning Module
class LSTMClassifier(L.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim, lr=5e-4, num_layers=1):
        super().__init__()
        self.model = LSTMModel(input_dim, hidden_dim, output_dim, num_layers=num_layers)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr

        # AUROC metric for binary classification
        self.train_auc = AUROC(task="binary")
        self.val_auc = AUROC(task="binary")

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        data, labels = batch
        preds = self(data)
        loss = self.criterion(preds, labels)
        # Calculate AUC during training
        prob = torch.softmax(preds, dim=1)[:, 1]  # Take probabilities for class 1
        auc = self.train_auc(prob, labels)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_auc", auc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        data, labels = batch
        preds = self(data)
        loss = self.criterion(preds, labels)
        # Calculate AUC during validation
        prob = torch.softmax(preds, dim=1)[:, 1]  # Take probabilities for class 1
        auc = self.val_auc(prob, labels)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_auc", auc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)




In [None]:
from sklearn.preprocessing import RobustScaler


X_train = df_gemma.drop(columns=['label', 'dataset'])
y_train = df_gemma['label']

X_val = df_cnn.drop(columns=['label', 'dataset'])
y_val = df_cnn['label']

scaler = RobustScaler()

X_train_normalized = scaler.fit_transform(X_train)

X_val_normalized = scaler.transform(X_val)

train_dataset = CustomDataset(X_train_normalized, y_train)
val_dataset = CustomDataset(X_val_normalized, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# 4. Train the Model
input_dim = X_train.shape[1]
hidden_dim = 512
output_dim = 2


model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, lr=5e-4, num_layers=3)

# 1. Set up EarlyStopping callback
early_stop_callback = EarlyStopping(
    monitor="val_loss",  # The metric to monitor
    patience=3,          # Number of epochs with no improvement after which training will stop
    verbose=True,        # Display a message when stopping
    mode="min",          # Minimize the monitored metric (for loss)
)

# 2. Initialize the Trainer with the early stopping callback
trainer = L.Trainer(
    max_epochs=100,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    log_every_n_steps=10,
    callbacks=[early_stop_callback],  # Add the early stopping callback here
)

# 3. Start the training
trainer.fit(model, train_loader, val_loader)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | LSTMModel        | 5.3 M  | train
1 | criterion | CrossEntropyLoss | 0      | train
2 | train_auc | BinaryAUROC      | 0      | train
3 | val_auc   | BinaryAUROC      | 0      | train
-------------------------------------------------------
5.3 M     Trainable params
0         Non-trainable params
5.3 M     Total params
21.361    Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                            

/net/tscratch/people/plgkonkie311/miniconda3/envs/hallu/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.
/net/tscratch/people/plgkonkie311/miniconda3/envs/hallu/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


Epoch 0:   8%|▊         | 68/807 [00:00<00:04, 152.73it/s, v_num=35]



Epoch 2:  61%|██████▏   | 124/202 [01:31<00:57,  1.35it/s, v_num=33, val_loss=0.163, val_auc=0.283, train_loss=0.304, train_auc=0.717]
Epoch 0: 100%|██████████| 807/807 [00:13<00:00, 58.42it/s, v_num=35, val_loss=0.129, val_auc=0.339, train_loss=0.317, train_auc=0.673]

Metric val_loss improved. New best score: 0.129


Epoch 1: 100%|██████████| 807/807 [00:31<00:00, 25.53it/s, v_num=35, val_loss=0.124, val_auc=0.379, train_loss=0.305, train_auc=0.699]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.124


Epoch 4: 100%|██████████| 807/807 [01:29<00:00,  8.99it/s, v_num=35, val_loss=0.134, val_auc=0.304, train_loss=0.295, train_auc=0.727]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.124. Signaling Trainer to stop.


Epoch 4: 100%|██████████| 807/807 [01:29<00:00,  8.98it/s, v_num=35, val_loss=0.134, val_auc=0.304, train_loss=0.295, train_auc=0.727]
