In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
!pip install transformers evaluate accelerate sentencepiece

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
import numpy as np
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import torch.nn as nn

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    train_path = '/content/gdrive/MyDrive/advanced-ml-project/data/train.tsv'
    test_path = '/content/gdrive/MyDrive/advanced-ml-project/data/test.tsv'
    dev_path = '/content/gdrive/MyDrive/advanced-ml-project/data/dev.tsv'

    eval_model_path = '/content/gdrive/MyDrive/advanced-ml-project/masked_lm_model.pth'
except:
    train_path = 'data/train.tsv'
    test_path = 'data/test.tsv'
    dev_path = 'data/dev.tsv'

    eval_model_path = 'masked_lm_model.pth'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Load Data

In [4]:
train = pd.read_csv(train_path, sep='\t', header=0)
train['label'] = train['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
train = train.sample(frac=1).reset_index(drop=True)
print('Length of train:', len(train))

test = pd.read_csv(test_path, sep='\t', header=0)
test['label'] = test['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
test = test.sample(frac=1).reset_index(drop=True)
print('Length of test:', len(test))

dev = pd.read_csv(dev_path, sep='\t', header=0)
dev['label'] = dev['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
dev = dev.sample(frac=1).reset_index(drop=True)
print('Length of dev:', len(dev))

print(train.label.value_counts())
train.head(10)

Length of train: 8891
Length of test: 3245
Length of dev: 4496
label
moderate    6019
healthy     1971
severe       901
Name: count, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_2550,"Now that 2019 is behind us, and we greet 2020....",moderate
1,train_pid_8552,"Is anyone awake? : Please help, I need a voice...",severe
2,train_pid_744,being alive is fucking exhausting : [removed],moderate
3,train_pid_2902,I came to the conclusion that I'm dead inside....,moderate
4,train_pid_640,Happy new year : Fuck 2019...2020 will be bett...,moderate
5,train_pid_8543,I don’t want to live my life sick and tired : ...,severe
6,train_pid_228,Had probably the worst year in my life.. : 201...,moderate
7,train_pid_6633,"Insecurities, fuck em. : I constantly feel lik...",healthy
8,train_pid_1805,Fuck Holidays : I feel the loneliest around th...,moderate
9,train_pid_6101,Goodbye 2019. May you rot in hell. :,healthy


We add the prompt to the begining of the sentence, so that when the sequence is longer than the max sequence length of the model, we can always keep our prompt and mask.

In [5]:
label2idx = {'healthy': 0, 'severe': 1, 'moderate': 2}

class MLMDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.tokenizer = tokenizer
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        prompt = "The level of depression in the following tweet is {}.\n{}"
        text = prompt.format(self.tokenizer.mask_token, self.X[idx])
        inputs = self.tokenizer(text, padding='max_length', max_length=512, return_tensors='pt', truncation=True)

        mask_index = torch.where(inputs['input_ids'].squeeze() == tokenizer.mask_token_id)[0]
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': torch.tensor(label2idx[self.y[idx]]),
            'mask_index': mask_index
        }

tokenizer = AutoTokenizer.from_pretrained("roberta-large/")

train_data = MLMDataset(train.text, train.label, tokenizer)
test_data = MLMDataset(test.text, test.label, tokenizer)
dev_data = MLMDataset(dev.text, dev.label, tokenizer)

train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=8, shuffle=True)

## Training

In [6]:
class MaskedLMModel(nn.Module):
    def __init__(self, model_name):
        super(MaskedLMModel, self).__init__()
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        
        # classification head
        self.dense = nn.Linear(self.model.config.vocab_size, 1024)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(1024, 3)

    def forward(self, input_ids, attention_mask, mask_index):
        outputs = self.model(input_ids, attention_mask)
        
        # classification head
        outputs = outputs[0][[batch for batch in range(len(mask_index))], mask_index, :] # [:, 0, :]
        outputs = self.dropout(outputs)
        outputs = self.dense(outputs)
        outputs = torch.tanh(outputs)
        outputs = self.dropout(outputs)
        outputs = self.out_proj(outputs)
        return outputs

We use weighted loss here in order to deal with imbalanced data

In [7]:
def compute_class_weight(train_y):
    """
    Compute class weight given imbalanced training data
    Usually used in the neural network model to augment the loss function (weighted loss function)
    Favouring/giving more weights to the rare classes.
    """
    import sklearn.utils.class_weight as scikit_class_weight

    train_y = [label2idx[each] for each in train_y]
    class_list = list(set(train_y))
    class_weight_value = scikit_class_weight.compute_class_weight(class_weight='balanced', classes=class_list, y=train_y)

    return torch.tensor(class_weight_value, dtype=torch.float).to(device)

In [8]:
# model = AutoModelForMaskedLM.from_pretrained("roberta-large").to(device)
model = MaskedLMModel("roberta-large/").to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_func = nn.CrossEntropyLoss(compute_class_weight(train.label.to_list()))

In [9]:
num_epoch = 5
best_testing_loss = float("inf")
best_model_state = None

for epoch in tqdm(range(num_epoch), desc='Training Process'):
    training_losses, training_f1 = [], []
    testing_losses, testing_f1 = [], []

    model.train()
    for _, inputs in enumerate(tqdm(train_loader, leave=False, desc=f"Epoch {epoch + 1}/{num_epoch}")):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = inputs['labels'].to(device)
        mask_index = inputs['mask_index'].to(device).squeeze()

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, mask_index)
        # print(outputs.shape)
#         selected_output = outputs[[batch for batch in range(labels.shape[0])], mask_index, :]

        loss = loss_func(outputs, labels)

        loss.backward()
        optimizer.step()
        
        y_preds = torch.argmax(outputs, 1)
        f1 = f1_score(labels.cpu(), y_preds.cpu(), average='weighted')

        training_losses.append(loss.item())
        training_f1.append(f1)

    model.eval()
    for _, inputs in enumerate(tqdm(test_loader, leave=False, desc="Evaluating")):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = inputs['labels'].to(device)
        mask_index = inputs['mask_index'].to(device).squeeze()

        with torch.no_grad():
            outputs = model(input_ids, attention_mask, mask_index)
        # selected_output = outputs[[batch for batch in range(labels.shape[0])], mask_index, :]
        loss = loss_func(outputs, labels)
        
        y_preds = torch.argmax(outputs, 1)
        f1 = f1_score(labels.cpu(), y_preds.cpu(), average='weighted')
        
        testing_losses.append(loss.item())
        testing_f1.append(f1)
    
    if np.mean(testing_losses) < best_testing_loss:
        best_testing_loss = np.mean(testing_losses)
        best_model_state = model.state_dict()
        
    print('Epoch %d: Training Loss: %.4f, Training f1: %.4f, Testing Loss: %.4f, Testing f1: %.4f' % 
          (epoch + 1, np.mean(training_losses), np.mean(training_f1), np.mean(testing_losses), np.mean(testing_f1)))

print('Best testing loss is', best_testing_loss)

Training Process:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/1112 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/406 [00:00<?, ?it/s]

Epoch 1: Training Loss: 1.1727, Training f1: 0.5009, Testing Loss: 1.0742, Testing f1: 0.5484


Epoch 2/5:   0%|          | 0/1112 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/406 [00:00<?, ?it/s]

Epoch 2: Training Loss: 1.1066, Training f1: 0.4904, Testing Loss: 1.0946, Testing f1: 0.5479


Epoch 3/5:   0%|          | 0/1112 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
if best_model_state is not None:
    torch.save(best_model_state, eval_model_path)
# torch.save(model.state_dict(), eval_model_path)

## Evaluating

In [None]:
model = MaskedLMModel('roberta-large/').to(device)
model.load_state_dict(torch.load(eval_model_path))
model.eval()

In [None]:
testing_losses = []
testing_f1 = []
model.eval()
for _, inputs in enumerate(tqdm(dev_loader, leave=False, desc="Evaluating")):
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = inputs['labels'].to(device)
    mask_index = inputs['mask_index'].to(device).squeeze()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    selected_output = outputs[[batch for batch in range(labels.shape[0])], mask_index, :]
    loss = loss_func(selected_output, labels)
    
    y_preds = torch.argmax(selected_output, 1)
    f1 = f1_score(labels.cpu(), y_preds.cpu(), average='weighted')
    
    testing_losses.append(loss.item())
    testing_f1.append(f1)

print('Evaluation loss: %.4f, Evaluation f1-score: %.4f' % (np.mean(testing_losses), np.mean(testing_f1)))