In [1]:
import json

import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the raw labelled utterances from the CSV file.
df = pd.read_csv("../Data/medical_sentiment_intent_dataset.csv")

# Fail fast if any column the rest of the notebook relies on is missing.
for required_column in ("text", "sentiment", "intent"):
    assert required_column in df.columns


In [10]:
# Preview the first rows to eyeball the text / sentiment / intent columns.
df.head()

Unnamed: 0,text,sentiment,intent
0,"The pain is mostly gone now, just mild discomf...",reassured,reporting_symptoms
1,Is it fine to stop the medication soon?,reassured,asking_question
2,Do I need to worry about this getting worse?,anxious,seeking_reassurance
3,Everything seems to be healing well.,reassured,reporting_symptoms
4,I still have a bit of neck pain from time to t...,neutral,reporting_symptoms


In [13]:
# Build deterministic label -> integer id maps: class names are sorted so the
# same data always yields the same ids.
sentiment_classes = sorted(df["sentiment"].unique())
intent_classes = sorted(df["intent"].unique())
sentiment_labels = {name: idx for idx, name in enumerate(sentiment_classes)}
intent_labels = {name: idx for idx, name in enumerate(intent_classes)}

# Attach the integer targets, keep only the columns needed for training, and
# reset to a fresh 0..n-1 index before handing the frame to `datasets`.
df = (
    df.assign(
        sentiment_id=df["sentiment"].map(sentiment_labels),
        intent_id=df["intent"].map(intent_labels),
    )[["text", "sentiment_id", "intent_id"]]
    .reset_index(drop=True)
)


In [14]:
# Convert the frame to a Hugging Face Dataset. The pandas index carries no
# information here, so drop it explicitly instead of risking an
# `__index_level_0__` column being materialised in the dataset.
dataset = Dataset.from_pandas(df, preserve_index=False)


In [15]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for the given texts.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded


# Encode the whole dataset in batches; only the raw text column is dropped,
# the label id columns stay in place.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])


Map: 100%|██████████| 600/600 [00:01<00:00, 411.61 examples/s]


In [16]:
# Drop the leftover pandas index column if the conversion created one.
index_column = "__index_level_0__"
if index_column in dataset.column_names:
    dataset = dataset.remove_columns(index_column)


In [17]:
# Hold out 15% of the examples for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split (and therefore the same metrics).
dataset = dataset.train_test_split(test_size=0.15, seed=42)


In [18]:
# Have the dataset return these columns as torch tensors when indexed.
torch_columns = ["input_ids", "attention_mask", "sentiment_id", "intent_id"]
dataset.set_format(type="torch", columns=torch_columns)

# final check: list the columns present in the training split
print(dataset["train"].column_names)


['sentiment_id', 'intent_id', 'input_ids', 'attention_mask']


In [19]:
class MultiTaskDistilBERT(nn.Module):
    """DistilBERT encoder shared by two linear classification heads.

    One head scores sentiment classes and the other scores intent classes;
    both read the encoder's hidden state at sequence position 0.
    """

    def __init__(self, num_sentiment, num_intent):
        """Load the pretrained encoder and create one linear head per task.

        Args:
            num_sentiment: Number of sentiment classes (sentiment head width).
            num_intent: Number of intent classes (intent head width).
        """
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        hidden = self.bert.config.dim

        self.sentiment_head = nn.Linear(hidden, num_sentiment)
        self.intent_head = nn.Linear(hidden, num_intent)

    def forward(self, input_ids, attention_mask, sentiment_id=None, intent_id=None):
        """Encode the inputs and score both tasks.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            sentiment_id: Optional sentiment targets.
            intent_id: Optional intent targets.

        Returns:
            Dict with ``"loss"``, ``"sentiment_logits"`` and
            ``"intent_logits"``. ``"loss"`` is the sum of the two
            cross-entropy losses when both targets are supplied, else ``None``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Sequence position 0 is used as the sentence-level representation.
        first_token_state = encoded.last_hidden_state[:, 0]

        result = {
            "loss": None,
            "sentiment_logits": self.sentiment_head(first_token_state),
            "intent_logits": self.intent_head(first_token_state),
        }

        # Without both sets of targets there is nothing to optimise.
        if sentiment_id is None or intent_id is None:
            return result

        criterion = nn.CrossEntropyLoss()
        sentiment_loss = criterion(result["sentiment_logits"], sentiment_id)
        intent_loss = criterion(result["intent_logits"], intent_id)
        result["loss"] = sentiment_loss + intent_loss
        return result


In [20]:
def collate(batch):
    """Assemble a list of per-example dicts into one batch dict of tensors.

    Args:
        batch: Sequence of examples, each providing ``input_ids``,
            ``attention_mask``, ``sentiment_id`` and ``intent_id``.

    Returns:
        Dict with the same four keys; the token-level tensors are stacked
        along a new leading dimension and the label ids are gathered into
        one tensor per task.
    """
    collated = {
        key: torch.stack([example[key] for example in batch])
        for key in ("input_ids", "attention_mask")
    }
    for key in ("sentiment_id", "intent_id"):
        collated[key] = torch.tensor([example[key] for example in batch])
    return collated


In [21]:
# Hyper-parameters for the fine-tuning run, collected in one place so they are
# easy to tweak; external experiment reporting is switched off.
training_config = dict(
    output_dir="./results",
    num_train_epochs=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=50,
    report_to="none",
)
args = TrainingArguments(**training_config)


In [22]:
# Size each classification head from the number of distinct labels.
model = MultiTaskDistilBERT(len(sentiment_labels), len(intent_labels))

# Wire the model, hyper-parameters, data splits and custom batching together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)


In [23]:
# Fine-tune the multi-task model on the training split; the returned
# TrainOutput is displayed as the cell result.
trainer.train()




Step,Training Loss
50,1.2911
100,0.1028


TrainOutput(global_step=128, training_loss=0.5521716941148043, metrics={'train_runtime': 739.503, 'train_samples_per_second': 2.759, 'train_steps_per_second': 0.173, 'total_flos': 0.0, 'train_loss': 0.5521716941148043, 'epoch': 4.0})

In [26]:
# Inference only from here on: put the model in evaluation mode.
model.eval()

# Invert the label -> id maps so predicted ids can be turned back into names.
id2sentiment = dict((idx, name) for name, idx in sentiment_labels.items())
id2intent = dict((idx, name) for name, idx in intent_labels.items())

def predict(text):
    """Classify a single utterance for sentiment and intent.

    Args:
        text: The raw input string to classify.

    Returns:
        Dict with the original ``"text"`` plus the predicted ``"sentiment"``
        and ``"intent"`` label names.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )

    # The Trainer may have moved the model to an accelerator; the tokenizer
    # always returns CPU tensors, so move them to wherever the weights live
    # to avoid a device-mismatch error.
    device = next(model.parameters()).device

    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device)
        )

    # A single text gives a batch of one, so .item() yields the class id.
    sentiment_pred = torch.argmax(outputs["sentiment_logits"], dim=1).item()
    intent_pred = torch.argmax(outputs["intent_logits"], dim=1).item()

    return {
        "text": text,
        "sentiment": id2sentiment[sentiment_pred],
        "intent": id2intent[intent_pred]
    }


In [28]:
# Smoke-test the trained model on one hand-written utterance.
sample_text = "I have severe headache and i am scared"
result = predict(sample_text)
print(result)


{'text': 'I have severe headache and i am scared', 'sentiment': 'anxious', 'intent': 'reporting_symptoms'}


In [29]:
import numpy as np
from sklearn.metrics import f1_score


In [30]:
# Collect predictions and ground-truth ids for both tasks over the eval split.
model.eval()

sentiment_preds = []
sentiment_true  = []

intent_preds = []
intent_true  = []

# Run on whichever device the model's weights live on; batches are moved there
# explicitly so the loop does not depend on where the dataloader places them.
device = next(model.parameters()).device

with torch.no_grad():
    for batch in trainer.get_eval_dataloader():
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device)
        )

        # predictions: highest-scoring class per example, as plain Python ints
        sentiment_preds.extend(outputs["sentiment_logits"].argmax(dim=1).tolist())
        intent_preds.extend(outputs["intent_logits"].argmax(dim=1).tolist())

        # matching ground-truth ids from the same batch
        sentiment_true.extend(batch["sentiment_id"].tolist())
        intent_true.extend(batch["intent_id"].tolist())




In [31]:
# Support-weighted F1 for each task on the held-out split.
f1_options = {"average": "weighted"}
sentiment_f1 = f1_score(sentiment_true, sentiment_preds, **f1_options)
intent_f1 = f1_score(intent_true, intent_preds, **f1_options)

# NOTE(review): if both scores come out at 1.0000, treat that with suspicion --
# check whether identical or templated texts appear in both the train and the
# test split before trusting the number.
print(f"Sentiment F1 Score : {sentiment_f1:.4f}")
print(f"Intent F1 Score    : {intent_f1:.4f}")


Sentiment F1 Score : 1.0000
Intent F1 Score    : 1.0000


In [None]:
SAVE_DIR = "../Models/multitask_distilbert"

# save model weights
# NOTE(review): `model` is a plain nn.Module rather than a Hugging Face
# PreTrainedModel, so this is expected to store the weights without a model
# config -- confirm against the installed transformers version.
trainer.save_model(SAVE_DIR)

# save the label vocabularies next to the weights: without them the head
# sizes and the meaning of each predicted class id cannot be recovered
# when the checkpoint is reloaded.
with open(f"{SAVE_DIR}/label_maps.json", "w", encoding="utf-8") as label_file:
    json.dump(
        {"sentiment_labels": sentiment_labels, "intent_labels": intent_labels},
        label_file,
        indent=2,
    )

# save tokenizer (kept last so the list of written files stays the cell output)
tokenizer.save_pretrained(SAVE_DIR)


('./multitask_distilbert\\tokenizer_config.json',
 './multitask_distilbert\\special_tokens_map.json',
 './multitask_distilbert\\vocab.txt',
 './multitask_distilbert\\added_tokens.json',
 './multitask_distilbert\\tokenizer.json')