# Voting System

The fine-tuned classification model can be found [here](https://drive.google.com/file/d/107vHAbHNqG05WlDmNSzV7m9vET72AVe_/view?usp=sharing).

In [None]:
!pip install transformers datasets evaluate accelerate sentencepiece

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from ast import literal_eval
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Resolve where the data files and the classifier checkpoint live.
# On Google Colab the project folder sits on a mounted Google Drive; anywhere
# else (or if Drive cannot be mounted) paths are relative to the working
# directory.
try:
    from google.colab import drive
    drive.mount('/content/gdrive')
except Exception:
    # `Exception` instead of a bare `except:` so that Ctrl-C / SystemExit are
    # not silently converted into "fall back to local paths".
    project_root = ''
else:
    project_root = '/content/gdrive/MyDrive/advanced-ml-project/'

train_path = project_root + 'data/train.tsv'
test_path = project_root + 'data/test.tsv'
dev_path = project_root + 'data/dev.tsv'
emotion_train_path = project_root + 'data/train_emotion.csv'
emotion_test_path = project_root + 'data/test_emotion.csv'
emotion_dev_path = project_root + 'data/dev_emotion.csv'

# Previously defined only on the local branch, which left the name undefined
# on Colab.
# NOTE(review): the Drive location is assumed to mirror the local layout -- confirm.
masked_model_path = project_root + 'masked_model/'
classification_model_path = project_root + 'bert-depression-detection.pth'

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

## Load Data

In [None]:
# Class name -> integer id used for the labels.
label2idx = dict(zip(('moderate', 'not depression', 'severe'), range(3)))
# Inverse lookup: integer id -> class name.
idx2label = {idx: name for name, idx in label2idx.items()}

In [None]:
# Read the tab-separated test split, then decode the two columns that need it:
# `emotion_scores` is stored as the text of a Python list, and `label` holds
# class names that are mapped to their integer ids.
test = pd.read_csv(emotion_test_path, sep='\t')
for column, convert in (
    ('emotion_scores', literal_eval),
    ('label', label2idx.__getitem__),
):
    test[column] = test[column].apply(convert)

# Quick sanity check: row count and class balance, then a preview.
print('Length of Data:', len(test))
print(test['label'].value_counts())
test.head(10)

Length of Data: 3245
label
0    2169
1     848
2     228
Name: count, dtype: int64


Unnamed: 0,PID,text,label,emotion_scores
0,test_pid_1,Im scared : This is it. I lie to myself every ...,0,"[5.375104904174805, -0.8492379784584045, -1.99..."
1,test_pid_2,New to this but just wanted to vent : I just f...,0,"[3.0481295585632324, 2.1657772064208984, -2.13..."
2,test_pid_3,I’m sad : It’s kinda always been an issue. I w...,0,"[-0.5275790691375732, -0.36453449726104736, -1..."
3,test_pid_4,Lonely but not alone. : All of my immediately ...,0,"[4.492433071136475, 0.433444082736969, -2.2141..."
4,test_pid_5,This year has been trash. : I dont know why I’...,0,"[0.8373550772666931, 1.134714961051941, -1.501..."
5,test_pid_6,Needed to yell into the void : I'm a pos. I'm ...,0,"[5.4630208015441895, -1.3103712797164917, -1.7..."
6,test_pid_7,I don’t know why I’m here anymore : I feel lik...,0,"[-1.5854066610336304, -1.2980746030807495, -1...."
7,test_pid_8,i dont want to be here anymore : i dont wanna ...,0,"[2.5347487926483154, 0.692678689956665, -2.158..."
8,test_pid_9,Antidepressants : Do antidepressants help if y...,0,"[-0.4418915808200836, -1.912025809288025, -2.0..."
9,test_pid_10,TMS : My doctor wants me to do TMS for my depr...,0,"[-3.599724531173706, 1.7751812934875488, 1.598..."


In [None]:
# Model inputs: one row per sample holding (text, emotion_scores).
input_columns = ['text', 'emotion_scores']
X = test[input_columns].values
# Targets: the integer class id of each sample.
y = test['label'].values
for array in (X, y):
    print(array.shape)

(3245, 2)
(3245,)


In [None]:
class MyDataSet(Dataset):
    """Dataset yielding, per sample, the inputs of both models.

    ``X[i][0]`` is the raw text and ``X[i][1]`` its extra feature vector;
    ``y[i]`` is the label. Each item carries the masked prompt (as a plain
    string), the `bert-base-uncased` encoding of the raw text, the feature
    tensor and the label tensor.
    """

    def __init__(self, X, y, max_len=512):
        self.X, self.y = X, y
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        text = self.X[index][0]
        features = self.X[index][1]

        # Only the raw text is tokenized; the prompt is returned as a string.
        tokens = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=self.max_len,
        )

        item = {
            'text': "The level of depression in this tweet is {}.\n{}".format('<mask>', text),
        }
        # The tokenizer returns tensors with a leading batch axis; squeeze it away.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            item[key] = tokens[key].squeeze()
        item['features'] = torch.tensor(features)
        item['label'] = torch.tensor(self.y[index])
        return item

# Wrap the test arrays in the dataset and serve them in shuffled batches of 5.
# NOTE(review): shuffling is not required for evaluation; labels are collected
# alongside the predictions, so the order does not affect the final score.
test_data = MyDataSet(X=X, y=y)
test_loader = DataLoader(dataset=test_data, shuffle=True, batch_size=5)

## Build Model

### MaskedLM model

In [None]:
# Fill-mask pipeline around the 'kwang123/MaskedLM-roberta-large' checkpoint,
# placed on the same device as the rest of the notebook.
mask_filler = pipeline(task="fill-mask", model='kwang123/MaskedLM-roberta-large', device=device)

### Classification model

In [None]:
class DepressionClassifier(nn.Module):
    """Pretrained encoder plus a linear head over its first-token embedding
    concatenated with extra per-sample features.

    Args:
        num_extra_features: Width of the ``features`` vector appended to the
            encoder embedding (default 5).
        num_labels: Number of output classes (default 3).
        dropout: Dropout probability applied to the embedding (default 0.1).
        pretrained_name: Name or path passed to ``AutoModel.from_pretrained``
            (default ``'bert-base-uncased'``).

    With the defaults the module is identical to the original hard-coded
    version, so existing checkpoints still load.
    """

    def __init__(self, *, num_extra_features=5, num_labels=3, dropout=0.1,
                 pretrained_name='bert-base-uncased'):
        super().__init__()
        # Attribute names determine the state-dict keys; do not rename them.
        self.classifier = AutoModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(
            self.classifier.config.hidden_size + num_extra_features,
            num_labels,
        )
        # Initialize the weights of the linear layer.
        nn.init.xavier_normal_(self.linear1.weight)

    def forward(self, input_ids, token_type_ids, attention_mask, features):
        """Return unnormalized class scores of shape [batch size, num labels]."""
        encoded = self.classifier(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        embedding = encoded.last_hidden_state[:, 0, :]  # [batch size, hidden size]
        embedding = self.dropout(embedding)
        # Align dtypes so e.g. float64 features do not break the linear layer.
        features = features.to(dtype=embedding.dtype)
        combined = torch.cat((embedding, features), dim=-1)  # [batch size, hidden size + num extra features]
        return self.linear1(combined)  # [batch size, num labels]

# Build the classifier on the selected device and restore the fine-tuned weights.
model = DepressionClassifier().to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads on the CPU fallback.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load(classification_model_path, map_location=device))

<All keys matched successfully>

## Inference

Weight for MaskedLM model: $0.6160 \div (0.6160 + 0.7686) = 0.4449$

Weight for Classification model: $0.7686 \div (0.6160 + 0.7686) = 0.5551$

In [None]:
def get_logits(preds):
    """Convert fill-mask predictions into per-class probabilities.

    Args:
        preds: One list of candidate dicts per input text, each dict holding
            ``'token_str'`` and ``'score'``. A flat list of candidate dicts
            (the shape the fill-mask pipeline may return for a single input)
            is treated as the predictions for one text.

    Returns:
        Float tensor of shape [num texts, 3] with columns ordered
        (moderate, healthy, severe). Scores are multiplied by 1000 before the
        softmax, which makes each row close to one-hot on the top candidate.
    """
    targets = ('moderate', 'healthy', 'severe')

    # Normalize the single-input shape to the batched one.
    if preds and isinstance(preds[0], dict):
        preds = [preds]

    rows = []
    for candidates in preds:
        # A target the pipeline did not return scores 0.0; the original used
        # the class indices 0/1/2 as fallbacks, which would win the softmax.
        scores = dict.fromkeys(targets, 0.0)
        for candidate in candidates:
            scores[candidate['token_str'].strip()] = candidate['score']
        rows.append([scores[name] for name in targets])

    # reshape keeps the [N, 3] layout even when there are no rows at all.
    score_tensor = torch.tensor(rows, dtype=torch.float32).reshape(-1, len(targets))
    return F.softmax(score_tensor * 1000, dim=1)

In [None]:
# Evaluation mode for the classifier.
model.eval()

# Voting weights: each constant divided by the sum of the two.
weight_total = 0.6160 + 0.7686
masked_weight = 0.6160 / weight_total
classification_weight = 0.7686 / weight_total

preds, y_true = [], []
for inputs in tqdm(test_loader, leave=False, desc="Evaluating"):
    # Classifier branch: class probabilities from the fine-tuned model.
    with torch.no_grad():
        classification_logits = model(
            inputs['input_ids'].to(device),
            inputs['token_type_ids'].to(device),
            inputs['attention_mask'].to(device),
            inputs['features'].to(device),
        ).cpu()
    classification_probs = F.softmax(classification_logits, dim=1)

    # Masked-LM branch: score the three target words at the prompt's mask.
    predictions = mask_filler(
        inputs['text'],
        top_k=3,
        targets=['moderate', 'healthy', 'severe'],
        tokenizer_kwargs={'padding': 'max_length', 'truncation': True, 'max_length': 512}
    )
    masked_probs = get_logits(predictions)

    # Weighted vote of the two probability tables, then pick the best class.
    logits = classification_probs * classification_weight + masked_probs * masked_weight
    y_pred = torch.argmax(logits, 1)

    preds.extend(y_pred.tolist())
    y_true.extend(inputs['label'].tolist())

f1 = f1_score(np.array(y_true), np.array(preds), average='weighted')
print(f"F1 score: {f1}")

Evaluating:   0%|          | 0/649 [00:00<?, ?it/s]



F1 score: 0.8487601324211029
