In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

# Configuration: every value a reader might want to tune lives here.
SEED = 42  # fixed so weight initialisation and batch shuffling repeat across runs
MAX_LEN = 200  # per-story token budget; the tokenizer pads/truncates to this length
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-05
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed NumPy and PyTorch up front so a Restart & Run All is repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Column layout shared by both CSV files: the free-text column first,
# followed by one column per emotion label.
TEXT_COLUMN = "story"
LABEL_COLUMNS = ["anger", "fear", "sadness", "calmness", "disgust", "pleasantness", "eagerness", "joy"]

# Load the training and validation CSVs, keeping only the text and label columns
# (in that order, so the text is always the first column).
train_df = pd.read_csv("data/Training set (Clinical_dataset).csv")[[TEXT_COLUMN, *LABEL_COLUMNS]]
val_df = pd.read_csv("data/Validation set (Clinical_dataset).csv")[[TEXT_COLUMN, *LABEL_COLUMNS]]

# Initialize the Clinical-Longformer tokenizer
tokenizer = AutoTokenizer.from_pretrained('yikuan8/Clinical-Longformer')

# Create a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one story and returns its label vector.

    Args:
        dataframe: pandas DataFrame with a ``story`` column. Every column after
            the first one is treated as a label column, so the text column is
            expected to come first.
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.story
        # All columns except the first are used as targets.
        self.targets = self.data[list(dataframe.columns[1:])]
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (stories) in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded story and float label vector at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor, one entry per label column).
        """
        # Positional lookup, matching the `.iloc` used for the targets below.
        # A plain `series[index]` is label-based and breaks (or silently picks a
        # different row) when the frame's index is not 0..n-1, e.g. after filtering.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace (newlines, tabs, ...) into single spaces.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # Force a numeric array: a row taken across mixed-dtype label columns can
        # come back as dtype=object, which torch.tensor cannot convert.
        target_row = self.targets.iloc[index].to_numpy(dtype="float32")

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target_row, dtype=torch.float)
        }




Downloading tokenizer_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

# New Section

In [None]:
# Create datasets and data loaders for both training and validation
training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
validation_set = CustomDataset(val_df, tokenizer, MAX_LEN)

# Only the training batches are shuffled; validation keeps file order.
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
valid_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **valid_params)

# Define the Clinical-Longformer model with one output logit per label column.
# The first column of train_df is the text; every remaining column is a label.
NUM_LABELS = len(train_df.columns) - 1
model = AutoModelForSequenceClassification.from_pretrained('yikuan8/Clinical-Longformer', num_labels=NUM_LABELS)
model.to(device)

# Define the optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers implementation
# used by default, because torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

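# NOTE: an earlier version of loss_fn passed the model output object straight to
# BCEWithLogitsLoss:
#     def loss_fn(outputs, targets):
#         return torch.nn.BCEWithLogitsLoss()(outputs, targets)
# The loss needs the logits tensor, so the version below extracts `outputs.logits` first.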

# Multi-label loss: binary cross-entropy computed directly on the raw logits
def loss_fn(outputs, targets):
    """Return the mean BCE-with-logits loss between ``outputs.logits`` and ``targets``.

    Args:
        outputs: model output object exposing a ``logits`` tensor.
        targets: float tensor with the same shape as ``outputs.logits``.

    Returns:
        Scalar tensor holding the loss averaged over all elements.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs.logits, targets)

# Training loop: one full pass over training_loader per epoch, reporting the
# mean of the per-batch losses at the end of each epoch.
for epoch in range(EPOCHS):
    model.train()
    train_losses = []

    for data in training_loader:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        # Pass the tensors by keyword. Longformer's forward() takes
        # `global_attention_mask` as its third positional parameter, so a
        # positional `token_type_ids` would be read as an all-zero global
        # attention mask and disable the default global attention on the
        # first (CLS) token.
        outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}')


Downloading config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/595M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.3248
Epoch 2, Training Loss: 0.2213
Epoch 3, Training Loss: 0.1817
Epoch 4, Training Loss: 0.1587
Epoch 5, Training Loss: 0.1382


In [None]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def validate_model():
    """Evaluate the global ``model`` on ``validation_loader`` and print multi-label metrics.

    Prints the Hamming loss and exact-match ratio over all labels, then
    accuracy, precision, recall and F1 for each label column of ``val_df``.
    Relies on the module-level ``model``, ``validation_loader``, ``loss_fn``,
    ``device`` and ``val_df``.

    Returns:
        float: mean of the per-batch validation losses.
    """
    model.eval()
    val_losses = []

    val_targets = []
    val_probabilities = []

    with torch.no_grad():
        for data in validation_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            # Keyword arguments: Longformer's third positional parameter is
            # `global_attention_mask`, not `token_type_ids`.
            outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
            loss = loss_fn(outputs, targets)
            val_losses.append(loss.item())

            # The model emits raw logits; map them to probabilities so that the
            # 0.5 cut-off below is a probability threshold (a 0.5 cut-off on
            # logits would correspond to a probability of about 0.62).
            val_probabilities.extend(torch.sigmoid(outputs.logits).cpu().numpy())
            val_targets.extend(targets.cpu().numpy())

    # Convert the collected per-sample rows to 2-D NumPy arrays
    val_targets = np.array(val_targets)
    val_probabilities = np.array(val_probabilities)

    # Apply threshold of 0.5 to the probabilities to get binary predictions
    val_predictions = (val_probabilities >= 0.5).astype(int)

    # Calculate Hamming Loss
    hamming_loss_value = hamming_loss(val_targets, val_predictions)

    # Calculate Exact Match Ratio (all labels of a sample must match)
    exact_match_ratio = accuracy_score(val_targets, val_predictions)

    print(f'Hamming Loss: {hamming_loss_value:.4f}')
    print(f'Exact Match Ratio: {exact_match_ratio:.4f}')

    # Calculate and print accuracy, precision, recall, and F1-score for each class
    for i, target_name in enumerate(val_df.columns[1:]):
        target_labels = val_targets[:, i]
        # Already binary, so no second threshold is needed.
        predicted_labels = val_predictions[:, i]

        accuracy = accuracy_score(target_labels, predicted_labels)
        # zero_division=0 on every ratio metric so a label with no positive
        # predictions/targets yields 0 consistently instead of a warning.
        precision = precision_score(target_labels, predicted_labels, zero_division=0)
        recall = recall_score(target_labels, predicted_labels, zero_division=0)
        f1 = f1_score(target_labels, predicted_labels, zero_division=0)

        print(f'Class: {target_name}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1-Score: {f1:.4f}')

    # Return the average validation loss
    return sum(val_losses) / len(val_losses)

In [None]:
# Validate the model: prints the overall and per-class metrics; the returned
# average validation loss is shown as the cell's output because the call is
# the last expression in the cell.
validate_model()


Hamming Loss: 0.0682
Exact Match Ratio: 0.7258
Class: anger
Accuracy: 0.9339
Precision: 0.8052
Recall: 0.6681
F1-Score: 0.7303
Class: fear
Accuracy: 0.9180
Precision: 0.8784
Recall: 0.8267
F1-Score: 0.8518
Class: sadness
Accuracy: 0.9180
Precision: 0.9247
Recall: 0.8661
F1-Score: 0.8944
Class: calmness
Accuracy: 0.9359
Precision: 0.9304
Recall: 0.8577
F1-Score: 0.8925
Class: disgust
Accuracy: 0.9394
Precision: 0.9181
Recall: 0.7825
F1-Score: 0.8449
Class: pleasantness
Accuracy: 0.9463
Precision: 0.8545
Recall: 0.6168
F1-Score: 0.7165
Class: eagerness
Accuracy: 0.9221
Precision: 0.9033
Recall: 0.8163
F1-Score: 0.8576
Class: joy
Accuracy: 0.9408
Precision: 0.9253
Recall: 0.9260
F1-Score: 0.9256


0.1860846448582404

In [None]:
# Save the trained model into the 'LongFormer_trained_model' directory
# (relative to the current working directory). Only the model is saved here;
# this cell does not save the tokenizer.
model.save_pretrained('LongFormer_trained_model')