# Goal Is to reduce the parameters needed to detect depression so that it could be used on edge devices (Apples autocorrect can detect and help users find mental health sources) rather than needing expensive API requests to a remote LLM.

# Prep work

## Loading and Processing the Data

In [1]:
DATA_PATH = './depression-detection-lt-edi-2022/data/original_dataset/'
MODEL_PATH = './mha_model'

In [2]:
import os

if not os.path.exists(DATA_PATH):
    !git clone https://github.com/rafalposwiata/depression-detection-lt-edi-2022.git
    print("Dataset has been cloned.")
else:
    print("Dataset already exists.")

Dataset already exists.


In [3]:
import pandas as pd

# Storing the data in a panda table
df_train = pd.read_table(DATA_PATH + "train.tsv")

# Making the column names standardized
df_train.rename(columns={"PID":"pid", "Text_data":"text_data", "Label":"label"}, inplace=True)

In [4]:
# Converting from strings to ints for the labels
label_mapping = {
    'not depression': 0,
    'moderate': 1,
    'severe': 2
}
df_train['label'] = df_train['label'].map(label_mapping)

## Defining Code for Getting Dataloaders

In [5]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch

class DepressionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=1024):
        self.texts = dataframe['text_data'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def out_of_vocabulary_check(tokenizer, data_loader):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = tokenizer.vocab_size
    
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].type(torch.long).to(device)
    
        # Check for out-of-vocabulary tokens before moving to GPU
        out_of_vocab_mask = input_ids >= tokenizer.vocab_size
        if out_of_vocab_mask.any():
            print("Found input IDs out of vocabulary bounds!")
            # Print the offending tokens (if needed)
            print("Offending tokens:", tokenizer.convert_ids_to_tokens(input_ids[out_of_vocab_mask].tolist()))
            # Replace out-of-vocabulary tokens with a special token (e.g., [UNK])
            input_ids[out_of_vocab_mask] = tokenizer.unk_token_id  # Replace with tokenizer.unk_token_id
    
        input_ids = batch['input_ids'].type(torch.long).to(device)
        if (input_ids >= vocab_size).any():
            print("Found input IDs out of vocabulary bounds!")
            print(input_ids[input_ids >= vocab_size])
            break

In [7]:
def get_dataloader(df_data, tokenizer, dataset_type="Train"):
    print(f"{dataset_type} Dataset:")
    dataset = DepressionDataset(df_data, tokenizer)
    data_loader = DataLoader(dataset, batch_size=4, shuffle=True)
    out_of_vocabulary_check(tokenizer, data_loader)
    
    return data_loader

## Defining the Model

In [8]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MHA(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads=8, dropout=0.4):
        super(MHA, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc_q = nn.Linear(embed_dim, embed_dim)
        self.fc_k = nn.Linear(embed_dim, embed_dim)
        self.fc_v = nn.Linear(embed_dim, embed_dim)
        self.fc_o = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.classifier = nn.Linear(embed_dim, 3)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        batch_size = x.size(0)
        q = self.fc_q(x)
        k = self.fc_k(x)
        v = self.fc_v(x)
        q = q.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        v = v.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = torch.softmax(attn_weights, dim=-1)
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.fc_o(attn_output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.layer_norm(output + x)
        output = self.classifier(output)
        output = output[:, 0, :]
        return output

# Running the Code

## Getting Dataloaders

In [9]:
from tqdm import tqdm

In [10]:
my_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_data_loader = get_dataloader(df_train, my_tokenizer, "Train")

Train Dataset:


100%|█████████████████████████████████████████████████████████████████████████████| 2223/2223 [00:20<00:00, 107.81it/s]


## Training the Model

In [11]:
import torch.optim as optim
import torch.nn as nn
from transformers import BertTokenizer
from tqdm import tqdm

mha_model = MHA(vocab_size=my_tokenizer.vocab_size, embed_dim=768).to(device)

optimizer = optim.Adam(mha_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss().to(device)

num_epochs = 10

for epoch in range(num_epochs):
    loop = tqdm(train_data_loader, leave=True, desc=f"Epoch [{epoch + 1}/{num_epochs}]")

    running_loss = 0.0
    for i, batch in enumerate(loop):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].type(torch.long).to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = mha_model(input_ids)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        loop.set_postfix(loss=running_loss / (i + 1))

Epoch [1/10]: 100%|████████████████████████████████████████████████████| 2223/2223 [52:34<00:00,  1.42s/it, loss=0.781]
Epoch [2/10]:   5%|██▌                                                  | 110/2223 [00:58<18:52,  1.87it/s, loss=0.708]


KeyboardInterrupt: 

## Saving the Model

In [None]:
torch.save(mha_model.state_dict(), MODEL_PATH)