# Goal: reduce the number of parameters needed to detect depression so that the model can run on edge devices (e.g. Apple's autocorrect could detect signs of depression and help users find mental health resources) rather than requiring expensive API requests to a remote LLM.

In [None]:
# Clone to a fixed location, and only when it is not there yet, so that
# re-running this cell from inside the repository does not create nested copies.
![ -d /content/depression-detection-lt-edi-2022 ] || git clone https://github.com/rafalposwiata/depression-detection-lt-edi-2022.git /content/depression-detection-lt-edi-2022

Cloning into 'depression-detection-lt-edi-2022'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 67 (delta 23), reused 57 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (67/67), 8.30 MiB | 10.74 MiB/s, done.
Resolving deltas: 100% (23/23), done.


In [None]:
!pwd
!ls

/content/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset
dev.tsv  test.tsv  train.tsv


In [None]:
# Absolute path: the relative form fails with "No such file or directory"
# whenever the cell is re-run from inside the dataset folder.
%cd /content/depression-detection-lt-edi-2022/data/original_dataset/

[Errno 2] No such file or directory: 'depression-detection-lt-edi-2022/data/original_dataset/'
/content/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset


In [None]:
!ls

dev.tsv  test.tsv  train.tsv


In [None]:
import pandas as pd
import numpy as np
import re

import os
directory = '.'

# Collect the extension-less name of every regular file in the dataset folder.
files = []
for filename in os.listdir(directory):
    if os.path.isfile(os.path.join(directory, filename)):
        # splitext works for any directory prefix and for names containing
        # extra dots, unlike splitting the joined path on '.'.
        split_name = os.path.splitext(filename)[0]
        print(split_name)
        files.append(split_name)
print(files)


def _canonical_column(name):
    """Map a header spelling to the column name the later cells index by."""
    key = str(name).lower()
    if 'text' in key:
        return 'text data'
    if 'label' in key:
        return 'Class labels'
    if 'pid' in key:
        return 'Pid'
    return name


# os.listdir returns entries in arbitrary order, so files[0] is not guaranteed
# to be the training split (it was 'test' in the recorded run, which loaded the
# test_pid_* rows into df_train). Load the training split by name instead.
df_train = pd.read_table(os.path.join(directory, "train.tsv"))

# NOTE(review): the header spelling used by train.tsv is not visible here (a
# later cell's message mentions 'Text_data'); normalise the headers so the
# 'text data' / 'Class labels' lookups in the following cells keep working.
# Confirm against the actual file.
df_train = df_train.rename(columns=_canonical_column)

df_train.head()

test
train
dev
['test', 'train', 'dev']


Unnamed: 0,Pid,text data,Class labels
0,test_pid_1,Im scared : This is it. I lie to myself every ...,moderate
1,test_pid_2,New to this but just wanted to vent : I just f...,moderate
2,test_pid_3,I’m sad : It’s kinda always been an issue. I w...,moderate
3,test_pid_4,Lonely but not alone. : All of my immediately ...,moderate
4,test_pid_5,This year has been trash. : I dont know why I’...,moderate


In [None]:
# Show the distinct values found in the label column.
distinct_labels = df_train['Class labels'].unique()
display(distinct_labels)

array(['moderate', 'not depression', 'severe'], dtype=object)

In [None]:
# Integer class ids used as training targets.
label_mapping = {
    'moderate': 0,
    'not depression': 1,
    'severe': 2,
}

raw_labels = df_train['Class labels']
# Series.map turns every value missing from the mapping into NaN, so
# re-running this cell on an already-encoded column would wipe all labels.
# Encode only while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(raw_labels):
    encoded_labels = raw_labels.map(label_mapping)
    unknown_labels = raw_labels[encoded_labels.isna()].unique()
    if len(unknown_labels) > 0:
        # Fail loudly instead of carrying NaN targets into training.
        raise ValueError(f"Unexpected class labels: {list(unknown_labels)}")
    df_train['Class labels'] = encoded_labels


In [None]:
# Number of samples per class id.
class_counts = df_train['Class labels'].value_counts()
class_counts

Unnamed: 0_level_0,count
Class labels,Unnamed: 1_level_1
0,2169
1,848
2,228


In [None]:
# Longest post in the frame, measured in characters (0 when there are no rows).
max_length = max(map(len, df_train['text data']), default=0)

print(f"The length of the longest 'text data' string is: {max_length}")

The length of the longest 'text data' string is: 17342


In [None]:
# Plain list of the post texts (also used by the following cell).
my_data = df_train['text data'].tolist()

# Print each previewed post with its own label; the tag used to be hard-coded
# to "not depression" for every sample. Integer class ids are translated back
# to their names, and labels that are not encoded are printed unchanged.
label_names = {class_id: name for name, class_id in label_mapping.items()}
preview = zip(my_data[:5], df_train['Class labels'].iloc[:5])
for i, (datum, label) in enumerate(preview):
    print("Datum #%d:\n %s | %s\n\n" % (i, datum, label_names.get(label, label)))

Datum #0:
 Im scared : This is it. I lie to myself every day and say ill make it out but i think this might just be it. 
I thought i wanted a girlfriend but i realize that what i really want is to feel loved. Ive spent 20 years on this earth but ive felt completely alone forever. My heart hurts. My subconscious taunts me every night only for me to wake up to the reality that im a loser who is not worth anyones time. 
Im scared i will live like this until the day i die.
Im being torn from the inside but im too much of coward to end it all.
Im sorry | not depression


Datum #1:
 New to this but just wanted to vent : I just finally realized that I’m kind of in a bad mental state. There’s this feeling of dread in me. I don’t want to work so I made an excuse not to go to work. I kind of feel like all of this stem from being financially unstable where I’m living pay check to pay check, hating my job because of the toxic environment I just had enough of it where I’m feeling like I have to wal

In [None]:
# Show the second post in full.
second_post = my_data[1]
print(second_post)

New to this but just wanted to vent : I just finally realized that I’m kind of in a bad mental state. There’s this feeling of dread in me. I don’t want to work so I made an excuse not to go to work. I kind of feel like all of this stem from being financially unstable where I’m living pay check to pay check, hating my job because of the toxic environment I just had enough of it where I’m feeling like I have to walk on eggshells around people and having people talk about me because I am not at their standard, school and trying to keep up with grades and I don’t know. Just thoughts of not being here console me. I just don’t want to do anything at all I’m kind of just sad. I don’t know how to explain it. I hope someone can relate.


In [None]:
%ls

dev.tsv  test.tsv  train.tsv


In [None]:
!pwd

/content/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset/depression-detection-lt-edi-2022/data/original_dataset


In [None]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Character count of the longest post; this value becomes the default
# max_length of the dataset class defined below.
max_length = max((len(post) for post in df_train['text data']), default=0)

print(f"The length of the longest 'Text_data' string is: {max_length}")

class DepressionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.
            The default is the module-level ``max_length`` as it was when
            this class was defined (a character count, not a token count).
    """

    def __init__(self, texts, labels, tokenizer, max_length=max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at ``idx`` together with its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` flattened to 1-D
            tensors, and ``label`` as a ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: encode_plus() is deprecated in favour
        # of __call__, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' presumably yields a leading batch axis of
            # size 1; flatten() drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Two toy (text, label) pairs to smoke-test the dataset/loader wiring.
toy_examples = (("sample text here", 0), ("another sample text", 1))
texts = [sentence for sentence, _ in toy_examples]
labels = [target for _, target in toy_examples]
dataset = DepressionDataset(texts=texts, labels=labels, tokenizer=tokenizer)
data_loader = DataLoader(dataset=dataset, batch_size=2)


The length of the longest 'Text_data' string is: 17342


In [None]:

# Inspect only the first batch produced by the loader (nothing is printed
# when the loader is empty).
batch = next(iter(data_loader), None)
if batch is not None:
    print("Input IDs:", batch['input_ids'])
    print("Attention Mask:", batch['attention_mask'])
    print("Labels:", batch['label'])

Input IDs: tensor([[ 101, 7099, 3793,  ...,    0,    0,    0],
        [ 101, 2178, 7099,  ...,    0,    0,    0]])
Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Labels: tensor([0, 1])


In [None]:
class DepressionDataset(Dataset):
    """Dataset built from a dataframe of posts and their class ids.

    Args:
        dataframe: Frame with a ``'text data'`` column (input strings) and a
            ``'Class labels'`` column (integer class ids).
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.
            NOTE(review): the origin of the 6390 default is not visible
            here - confirm it matches the longest tokenized post.
    """

    def __init__(self, dataframe, tokenizer, max_length=6390):
        # Materialise both columns as plain lists so items are looked up by
        # position, independent of the dataframe index.
        self.texts = dataframe['text data'].tolist()
        self.labels = dataframe['Class labels'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows taken from the dataframe."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at position ``idx`` with its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` flattened to 1-D
            tensors, and ``label`` as a ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: encode_plus() is deprecated in favour
        # of __call__, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' presumably yields a leading batch axis of
            # size 1; flatten() drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Wrap the labelled dataframe and serve it in shuffled mini-batches of four.
dataset = DepressionDataset(dataframe=df_train, tokenizer=tokenizer)
data_loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

In [None]:
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MHA(nn.Module):
    """Single-layer multi-head self-attention text classifier.

    Token ids are embedded, one multi-head self-attention layer is applied
    (output projection, ReLU, dropout, residual connection and layer norm),
    and the representation at sequence position 0 is projected to class
    logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Embedding width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied after the ReLU.
        num_classes: Number of output logits. Defaults to 3 because the
            labels in this notebook are encoded as 0, 1 and 2; a two-logit
            head makes ``CrossEntropyLoss`` reject target 2.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, vocab_size, embed_dim, num_heads=8, dropout=0.4, num_classes=3):
        super().__init__()
        if embed_dim % num_heads != 0:
            # Otherwise the per-head view() in forward() fails with an
            # opaque shape error.
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc_q = nn.Linear(embed_dim, embed_dim)
        self.fc_k = nn.Linear(embed_dim, embed_dim)
        self.fc_v = nn.Linear(embed_dim, embed_dim)
        self.fc_o = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            attention_mask: Optional (batch, seq_len) tensor; positions equal
                to 0 are treated as padding and receive no attention weight.
                When omitted, every position is attended to.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised logits.
        """
        x = self.embedding(input_ids)
        batch_size = x.size(0)

        # Only the output at position 0 is returned, and every step after the
        # attention is position-wise, so it is enough to compute the query for
        # that one position. This avoids the (seq_len x seq_len) score matrix.
        first_token = x[:, :1, :]
        q = self.fc_q(first_token)
        k = self.fc_k(x)
        v = self.fc_v(x)

        # Split the embedding axis into heads: (batch, heads, positions, head_dim).
        q = q.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled dot-product scores of the first token against every key.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if attention_mask is not None:
            padding = attention_mask[:, None, None, :] == 0
            # A large finite negative (rather than -inf) keeps the softmax
            # free of NaNs even if a row were fully masked.
            attn_scores = attn_scores.masked_fill(
                padding, torch.finfo(attn_scores.dtype).min
            )
        attn_weights = self.softmax(attn_scores)

        attn_output = torch.matmul(attn_weights, v)
        # Merge the heads back into a single embedding axis.
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)

        output = self.fc_o(attn_output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.layer_norm(output + first_token)
        return self.classifier(output[:, 0, :])

import torch.optim as optim
import torch.nn as nn
from transformers import BertTokenizer

# The tokenizer's vocabulary size determines the embedding table size.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mha_model = MHA(vocab_size=tokenizer.vocab_size, embed_dim=768)
mha_model = mha_model.to(device)

optimizer = optim.Adam(mha_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

num_epochs = 10

for epoch in range(num_epochs):
    for batch in data_loader:
        # Move the batch onto the training device.
        input_ids = batch['input_ids'].long().to(device)
        # NOTE: the mask is moved to the device but is not passed to the
        # model call below.
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = mha_model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Loss is reported after every batch.
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")