# Masked language modeling

This notebook fine-tunes `roberta-large` with a prompt-based masked-language-modeling method for depression detection.

In [None]:
!pip install transformers datasets evaluate accelerate

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, pipeline
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

# Resolve data and model locations. When the Colab Drive helper can be imported
# and mounted, the project folder on Google Drive is used; otherwise paths
# relative to the current working directory are used.
try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    train_path = '/content/gdrive/MyDrive/advanced-ml-project/data/train.tsv'
    test_path = '/content/gdrive/MyDrive/advanced-ml-project/data/test.tsv'
    dev_path = '/content/gdrive/MyDrive/advanced-ml-project/data/dev.tsv'

    eval_model_path = '/content/gdrive/MyDrive/advanced-ml-project/masked_model'
except Exception as exc:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and say why the
    # fallback was taken instead of hiding the reason.
    print(f'Google Drive not available ({exc!r}); using local paths.')

    train_path = 'data/train.tsv'
    test_path = 'data/test.tsv'
    dev_path = 'data/dev.tsv'

    eval_model_path = 'masked_model'

# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Load Data

In [None]:
SEED = 42  # fixed so the row shuffle below gives the same order on every run


def load_split(path, seed=SEED):
    """Read one TSV split, rename the negative class, and shuffle its rows.

    Args:
        path: Location of a tab-separated file whose first row is the header
            and which contains a ``label`` column.
        seed: Random state handed to the shuffle so it is reproducible.

    Returns:
        A DataFrame in which ``'not depression'`` labels read ``'healthy'``,
        with rows in shuffled order and a fresh 0..n-1 index.
    """
    frame = pd.read_csv(path, sep='\t', header=0)
    # Presumably renamed so every label is a single word that can stand in
    # for one mask slot in the prompt -- confirm with the author.
    frame['label'] = frame['label'].replace('not depression', 'healthy')
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)


train = load_split(train_path)
test = load_split(test_path)
dev = load_split(dev_path)
print(train.label.value_counts())
train.head(10)

label
moderate    6019
healthy     1971
severe       901
Name: count, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_2553,Give me a reason : 27 years old and honestly r...,moderate
1,train_pid_1041,meh : i wish i didnt see reality as it is. i w...,moderate
2,train_pid_8472,I feel like a ghost... : I feel like nobody wo...,severe
3,train_pid_4283,Alone on Nye (another holiday) again : I'm so ...,moderate
4,train_pid_7849,How do you recover from self destruction? : [r...,healthy
5,train_pid_6270,Have a happy near year.... : I'm spending this...,healthy
6,train_pid_2741,Cringing over my own socializing attempts : A ...,moderate
7,train_pid_325,"If depressions doesn’t kill me, loneliness wil...",moderate
8,train_pid_8265,Going in and out of happy and depressed. : I f...,severe
9,train_pid_2392,meh : i wish i didnt see reality as it is. i w...,moderate


In [None]:
# Wrap the three pandas frames as Hugging Face datasets, keyed by split name.
data = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (('train', train), ('test', test), ('dev', dev))
})
data

DatasetDict({
    train: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 8891
    })
    test: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 3245
    })
    dev: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 4496
    })
})

## Preprocessing

In [None]:
# Load the `roberta-large` tokenizer and make its end-of-sequence token double
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    """Render each (text, label) pair into the prompt and tokenize the batch.

    Args:
        examples: A batch with parallel ``"text"`` and ``"label"`` sequences.

    Returns:
        The tokenizer output for the rendered prompts, padded and truncated
        to 512 tokens.
    """
    template = "The level of depression in the following tweet is {}.\n{}"
    # The label fills the first slot and the post text the second.
    rendered = [
        template.format(label, text)
        for text, label in zip(examples["text"], examples["label"])
    ]
    return tokenizer(rendered, padding='max_length', truncation=True, max_length=512)


# Tokenize every split in batches and drop the original columns, leaving only
# the tokenizer outputs.
tokenized_data = data.map(preprocess_function, batched=True, remove_columns=data["train"].column_names)

tokenized_data

Map:   0%|          | 0/8891 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/4496 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8891
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3245
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4496
    })
})

In [None]:
# Language-modeling collator built on the tokenizer above, configured with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

## Training

In [None]:
# Start from the pretrained `roberta-large` masked-language-modeling checkpoint.
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when
# training finishes (`load_best_model_at_end=True`).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # Select the best checkpoint on the held-out *dev* split. Using the test
    # split here would leak it into model selection and make the final test
    # score optimistic.
    eval_dataset=tokenized_data["dev"],
    data_collator=data_collator,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.3787,0.454573
2,0.2936,0.441656
3,0.2455,0.432219


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


TrainOutput(global_step=3336, training_loss=0.3550902876636679, metrics={'train_runtime': 2536.0242, 'train_samples_per_second': 10.518, 'train_steps_per_second': 1.315, 'total_flos': 2.486152377692467e+16, 'train_loss': 0.3550902876636679, 'epoch': 3.0})

In [None]:
# Persist the trained model and the tokenizer to the same directory
# (`eval_model_path`), which the evaluation pipeline later loads from.
trainer.save_model(eval_model_path)
tokenizer.save_pretrained(eval_model_path)

## Evaluation

In [None]:
from transformers import pipeline

In [None]:
# Fill-mask pipeline over the fine-tuned checkpoint stored at `eval_model_path`.
mask_filler = pipeline(
    "fill-mask",
    eval_model_path,
    device=device,
    batch_size=16,
)

# Must match the template used in `preprocess_function` during fine-tuning
# word for word; a differently worded prompt at inference time is a
# train/test mismatch.
prompt = "The level of depression in the following tweet is {}.\n{}"

# In the training text the label follows "is " and is therefore encoded by
# RoBERTa's BPE as a space-prefixed token. The targets carry that leading
# space so scoring is restricted to those tokens; the pipeline resolves each
# target to a vocabulary id and may log a notice while doing so.
# `token_str` is stripped downstream, so the label lookup is unaffected.
label_targets = [' moderate', ' healthy', ' severe']

# One prompt per test post, with the tokenizer's mask token in the label slot.
predictions = mask_filler(
    [prompt.format(mask_filler.tokenizer.mask_token, x) for x in test['text'].tolist()],
    top_k=1,
    targets=label_targets,
    tokenizer_kwargs={'padding': 'max_length', 'truncation': True, 'max_length': 512}
)

In [None]:
# Fixed label -> class-index mapping shared by predictions and gold labels.
label2idx = {'moderate': 0, 'healthy': 1, 'severe': 2}

# Each pipeline result is indexed at 0 for its top candidate; the token string
# is stripped of surrounding whitespace before the lookup.
top_tokens = (result[0]['token_str'].strip() for result in predictions)
y_preds = [label2idx[token] for token in top_tokens]
y_true = [label2idx[gold] for gold in test['label'].tolist()]

# Weighted-average F1 over the three classes.
f1 = f1_score(y_true, y_preds, average='weighted')
print('Weighted f1: %.4f' % (f1))

Weighted f1: 0.6160
