# Goal: reduce the number of parameters needed to detect depression so that the model can run on edge devices (e.g., Apple's autocorrect could detect signs of depression and help users find mental health resources) rather than requiring expensive API requests to a remote LLM.

In [None]:
import os

directory = './depression-detection-lt-edi-2022/data/original_dataset/'
if not os.path.exists(directory):
    !git clone https://github.com/rafalposwiata/depression-detection-lt-edi-2022.git
    print("Dataset has been cloned.")
else:
    print("Dataset already exists.")

Dataset already exists.


In [None]:
import pandas as pd
import numpy as np
import re

# # Getting the data files
# files = []
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)

#     if os.path.isfile(f):
#         print(f.split('.')[1][1:])
#         files.append(f.split('.')[1][1:])
# print(os.listdir(directory))

In [None]:
# Storing the data in a panda table
# Read the three tab-separated splits from the dataset folder into DataFrames.
# `directory` already ends with a path separator, so the file name is appended as-is.
df_train = pd.read_table(f"{directory}train.tsv")
df_dev = pd.read_table(f"{directory}dev.tsv")
df_test = pd.read_table(f"{directory}test.tsv")

In [None]:
# Making the column names consistent
# Each split ships with differently spelled headers; rename them in place so all
# three frames expose the same `pid` / `text_data` / `label` columns.
df_train.rename(
    columns={"PID": "pid", "Text_data": "text_data", "Label": "label"},
    inplace=True,
)
df_dev.rename(
    columns={"PID": "pid", "Text data": "text_data", "Label": "label"},
    inplace=True,
)
df_test.rename(
    columns={"Pid": "pid", "text data": "text_data", "Class labels": "label"},
    inplace=True,
)

In [None]:
# Converting from strings to ints for the labels
# String label -> integer class index used by the model and the loss.
label_mapping = {
    'moderate': 0,
    'not depression': 1,
    'severe': 2
}

# Encode the labels of every split, not only the training one: DepressionDataset
# builds a torch.long tensor from each label, which fails on raw strings.
for split_name, frame in (("train", df_train), ("dev", df_dev), ("test", df_test)):
    if 'label' not in frame.columns:
        # Split without labels: nothing to encode.
        continue
    if pd.api.types.is_numeric_dtype(frame['label']):
        # Already encoded (cell re-run); mapping again would turn every value into NaN.
        continue

    encoded = frame['label'].map(label_mapping)
    unknown = frame.loc[encoded.isna(), 'label'].unique()
    if len(unknown) > 0:
        # Fail loudly instead of letting NaN labels reach the loss function.
        raise ValueError(
            f"Unexpected label values in the {split_name} split: {list(unknown)}"
        )
    frame['label'] = encoded.astype('int64')

In [None]:
# Longest training text, measured in characters (not tokens); 0 when there are no rows.
max_length = max((len(text) for text in df_train['text_data']), default=0)

print(f"The length of the longest 'text_data' string is: {max_length}")

The length of the longest 'text_data' string is: 15996


In [None]:
# Plain Python list holding every training text, in DataFrame order.
my_data = list(df_train['text_data'])

In [None]:
%env CUDA_LAUNCH_BLOCKING=1
# For linux
#!export CUDA_LAUNCH_BLOCKING=1
# For windows
#!set CUDA_LAUNCH_BLOCKING=1

import os
# Set CUDA_LAUNCH_BLOCKING to 1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch
# Tokenizer for the 'bert-base-uncased' checkpoint. It is used below to turn raw
# text into token ids, and its `vocab_size` sizes the embedding table of the
# custom model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class DepressionDataset(Dataset):
    """Torch dataset that tokenizes one text per item on demand.

    Args:
        dataframe: Frame with a `text_data` column (raw text) and a `label`
            column. Labels must already be integer-encoded, because each one is
            converted to a `torch.long` tensor.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in `__getitem__` and returns a
            mapping with `input_ids` and `attention_mask` tensors.
        max_length: Every example is padded or truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=1024):
        self.texts = dataframe['text_data'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example `idx`.

        Returns:
            dict with `input_ids` and `attention_mask` (the tokenizer's tensors
            with the leading batch dimension removed) and `label` (scalar
            `torch.long` tensor).
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a leading batch dimension of 1; drop it
            # so the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Training split wrapped as a torch Dataset and served in shuffled mini-batches of 4.
dataset = DepressionDataset(dataframe=df_train, tokenizer=tokenizer)
data_loader = DataLoader(dataset, shuffle=True, batch_size=4)

# for batch in data_loader:
#     print("Input IDs:", batch['input_ids'].to(device), "Input IDs type:", type(batch['input_ids']))
#     print("Attention Mask:", batch['attention_mask'].to(device))
#     print("Labels:", batch['label'].to(device))

env: CUDA_LAUNCH_BLOCKING=1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"
vocab_size = tokenizer.vocab_size

# Sanity check: every token id must be a valid row of an embedding table with
# `vocab_size` entries. The check runs on the CPU tensors produced by the loader,
# so an out-of-range id is reported here rather than surfacing later as an
# indexing error on the device.
for batch in data_loader:
    input_ids = batch['input_ids'].type(torch.long)
    out_of_vocab_mask = (input_ids < 0) | (input_ids >= vocab_size)
    if out_of_vocab_mask.any():
        print("Found input IDs out of vocabulary bounds!")
        print("Offending IDs:", input_ids[out_of_vocab_mask].tolist())
        break
else:
    # Loop finished without hitting `break`: nothing was out of range.
    print("All input IDs are within vocabulary bounds.")

In [None]:
import torch.nn as nn

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MHA(nn.Module):
    """Single multi-head self-attention layer followed by a 3-class linear head.

    Token ids are embedded, passed through one self-attention block with a
    residual connection and layer normalisation, and the representation at
    sequence position 0 is classified.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding / model width; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention block output.

    Raises:
        ValueError: If `embed_dim` is not divisible by `num_heads`.
    """

    def __init__(self, vocab_size, embed_dim, num_heads=8, dropout=0.4):
        super().__init__()
        if embed_dim % num_heads != 0:
            # Otherwise the per-head reshape in forward() fails with an opaque error.
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc_q = nn.Linear(embed_dim, embed_dim)
        self.fc_k = nn.Linear(embed_dim, embed_dim)
        self.fc_v = nn.Linear(embed_dim, embed_dim)
        self.fc_o = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.classifier = nn.Linear(embed_dim, 3)

    def _split_heads(self, tensor, batch_size):
        """Reshape (batch, seq, embed_dim) into (batch, heads, seq, head_dim)."""
        return tensor.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of shape (batch, seq).
            attention_mask: Optional (batch, seq) tensor where 0 marks positions
                (e.g. padding) that must not be attended to. When omitted, every
                position is attended to. Each row needs at least one non-zero
                entry, otherwise its softmax is undefined.

        Returns:
            Tensor of shape (batch, 3) with unnormalised class scores taken from
            sequence position 0.
        """
        x = self.embedding(input_ids)
        batch_size = x.size(0)

        q = self._split_heads(self.fc_q(x), batch_size)
        k = self._split_heads(self.fc_k(x), batch_size)
        v = self._split_heads(self.fc_v(x), batch_size)

        # Scaled dot-product scores: (batch, heads, seq_q, seq_k).
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if attention_mask is not None:
            # Broadcast the key mask over heads and query positions; masked keys
            # get -inf so they receive zero weight after the softmax.
            blocked = attention_mask[:, None, None, :].to(scores.device) == 0
            scores = scores.masked_fill(blocked, float('-inf'))
        attn_weights = self.softmax(scores)

        attn_output = torch.matmul(attn_weights, v)
        # Merge the heads back into (batch, seq, embed_dim).
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)

        output = self.fc_o(attn_output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.layer_norm(output + x)  # residual connection around the attention block

        # Only position 0 is classified, so slice first instead of running the
        # classifier over every position and discarding the rest.
        return self.classifier(output[:, 0, :])

In [None]:
import torch.optim as optim
import torch.nn as nn
from transformers import BertTokenizer
from tqdm import tqdm

# Model sized to the tokenizer's vocabulary, with a 768-wide embedding.
mha_model = MHA(vocab_size=tokenizer.vocab_size, embed_dim=768).to(device)

optimizer = optim.Adam(mha_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss().to(device)

num_epochs = 10

for epoch in range(num_epochs):
    # Make sure dropout is active even if the model was put in eval mode elsewhere.
    mha_model.train()
    loop = tqdm(data_loader, leave=True, desc=f"Epoch [{epoch + 1}/{num_epochs}]")

    running_loss = 0.0
    for i, batch in enumerate(loop):
        # The batch also carries 'attention_mask'; the model is called with
        # input_ids only, so the mask is not copied to the device.
        input_ids = batch['input_ids'].type(torch.long).to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = mha_model(input_ids)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Progress bar shows the mean loss over the batches seen so far this epoch.
        loop.set_postfix(loss=running_loss / (i + 1))

Epoch [1/10]: 100%|██████████| 2223/2223 [02:41<00:00, 13.80it/s, loss=0.796]
Epoch [2/10]: 100%|██████████| 2223/2223 [02:21<00:00, 15.76it/s, loss=0.767]
Epoch [3/10]: 100%|██████████| 2223/2223 [02:20<00:00, 15.80it/s, loss=0.758]
Epoch [4/10]: 100%|██████████| 2223/2223 [02:21<00:00, 15.76it/s, loss=0.742]
Epoch [5/10]: 100%|██████████| 2223/2223 [02:22<00:00, 15.64it/s, loss=0.695]
Epoch [6/10]: 100%|██████████| 2223/2223 [02:21<00:00, 15.76it/s, loss=0.648]
Epoch [7/10]: 100%|██████████| 2223/2223 [02:21<00:00, 15.74it/s, loss=0.605]
Epoch [8/10]: 100%|██████████| 2223/2223 [02:20<00:00, 15.77it/s, loss=0.568]
Epoch [9/10]: 100%|██████████| 2223/2223 [02:20<00:00, 15.77it/s, loss=0.536]
Epoch [10/10]: 100%|██████████| 2223/2223 [02:21<00:00, 15.68it/s, loss=0.506]


In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report how many trainable parameters the attention classifier has.
total_params = count_parameters(model=mha_model)
print(f"Total number of trainable parameters: {total_params}")

Total number of trainable parameters: 25807107


## LIME - Making the model explainable.

Currently a work in progress. --Zach

In [None]:
!pip install lime



In [None]:
from lime.lime_text import LimeTextExplainer
# LIME indexes `class_names` by the classifier's output column, so the names must
# be ordered by class index. `df_train['label'].unique()` returns the encoded
# values in order of first appearance, which need not match that order; derive
# the readable names from `label_mapping` instead.
class_names = [name for name, _ in sorted(label_mapping.items(), key=lambda item: item[1])]
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Development split served one example at a time, in file order.
# DepressionDataset converts each label with torch.tensor(..., dtype=torch.long),
# so df_dev['label'] must already be integer-encoded at this point.
dev_set = DepressionDataset(dataframe=df_dev, tokenizer=tokenizer)
dev_loader = DataLoader(dev_set, shuffle=False, batch_size=1)

# Materialise every single-example batch up front.
dev_data = list(dev_loader)

TypeError: new(): invalid data type 'str'

In [None]:
def predict_proba(texts, batch_size=16):
    """Classifier function in the form LIME expects.

    Args:
        texts: Sequence of raw strings.
        batch_size: Number of texts pushed through the model at once. LIME calls
            this function with many perturbed copies of the text, so inference
            is chunked to keep memory bounded.

    Returns:
        numpy array of shape (len(texts), 3) with class probabilities.
    """
    was_training = mha_model.training
    mha_model.eval()  # disable dropout so predictions are deterministic
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoding = tokenizer(
                list(texts[start:start + batch_size]),
                add_special_tokens=True,
                max_length=dataset.max_length,  # same length the model was trained with
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            logits = mha_model(encoding['input_ids'].to(device))
            chunks.append(torch.softmax(logits, dim=-1).cpu().numpy())
    mha_model.train(was_training)  # restore whichever mode the model was in
    return np.concatenate(chunks, axis=0)


pos_to_predict = 0
datapoint = df_dev.iloc[pos_to_predict]

# Class index with the highest predicted probability for the selected text.
predicted_class = int(predict_proba([datapoint['text_data']]).argmax(axis=1)[0])

# explain_instance needs the classifier *function* (list of strings -> probabilities),
# not a single prediction.
exp = explainer.explain_instance(datapoint['text_data'],
                                 predict_proba, num_features=6,
                                 labels=[0, 1, 2])

print("Position of element in dataset:", pos_to_predict)
print("Predicted class:", predicted_class)
print("Actual class:", datapoint['label'])

TypeError: 'DataLoader' object is not subscriptable

In [None]:
# Print the weighted features for each of the three explained labels instead of
# repeating the label-0 explanation twice.
for label_id in (0, 1, 2):
    print("Explanation for class %s" % class_names[label_id])
    print("\n".join(map(str, exp.as_list(label=label_id))))
    print()