# Masked language modeling

This file fine-tunes `roberta-large` using prompted-based method for depression detection.

In [1]:
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, pipeline
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, accuracy_score

try:
    from google.colab import drive
    drive.mount('/content/gdrive')

    train_path = '/content/gdrive/MyDrive/advanced-ml-project/data/train.tsv'
    test_path = '/content/gdrive/MyDrive/advanced-ml-project/data/test.tsv'
    dev_path = '/content/gdrive/MyDrive/advanced-ml-project/data/dev.tsv'

    eval_model_path = 'kwang123/MaskedLM-roberta-large'
except:
    train_path = 'data/train.tsv'
    test_path = 'data/test.tsv'
    dev_path = 'data/dev.tsv'

    eval_model_path = 'masked_model'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Mounted at /content/gdrive


device(type='cuda', index=0)

## Load Data

In [4]:
train = pd.read_csv(train_path, sep='\t', header=0)
train['label'] = train['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
train = train.sample(frac=1).reset_index(drop=True)

test = pd.read_csv(test_path, sep='\t', header=0)
test['label'] = test['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
test = test.sample(frac=1).reset_index(drop=True)

dev = pd.read_csv(dev_path, sep='\t', header=0)
dev['label'] = dev['label'].apply(lambda x: 'healthy' if x == 'not depression' else x)
dev = dev.sample(frac=1).reset_index(drop=True)
print(train.label.value_counts())
train.head(10)

moderate    6019
healthy     1971
severe       901
Name: label, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_6767,Affirmations For Wealth &amp; Happiness / 24/7...,healthy
1,train_pid_1318,"I hate my life, I wish I had never been born, ...",moderate
2,train_pid_5842,Life is boring as fuck : I hate being around p...,moderate
3,train_pid_4000,Absolutely. Zero. Motivation : With the new ye...,moderate
4,train_pid_2736,"if i fail 2020, it might be my year : already ...",moderate
5,train_pid_5127,"If you don’t want to read this, you don’t have...",moderate
6,train_pid_2886,I’m in so much pain it feels like I’m chocking...,moderate
7,train_pid_7907,Massacre : my name is Michelle Rebal. I am ver...,healthy
8,train_pid_3686,This time of year is weighing on me. : I wish ...,moderate
9,train_pid_5785,I think that's me done. : On March 6th of last...,moderate


In [5]:
data = DatasetDict({
    'train': Dataset.from_pandas(train),
    'test': Dataset.from_pandas(test),
    'dev': Dataset.from_pandas(dev),
})
data

DatasetDict({
    train: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 8891
    })
    test: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 3245
    })
    dev: Dataset({
        features: ['PID', 'text', 'label'],
        num_rows: 4496
    })
})

## Preprocessing

In [6]:
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    prompt = "The level of depression in the following tweet is {}.\n{}"
    texts = []
    for x, y in zip(examples["text"], examples["label"]):
        texts.append(prompt.format(y, x))
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

tokenized_data = data.map(
    preprocess_function,
    batched=True,
    remove_columns=data["train"].column_names,
)

tokenized_data

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/8891 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/4496 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8891
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3245
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4496
    })
})

In [7]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

## Training

In [None]:
model = AutoModelForMaskedLM.from_pretrained("roberta-large")

training_args = TrainingArguments(
    output_dir="model_output",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.3787,0.454573
2,0.2936,0.441656
3,0.2455,0.432219


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


TrainOutput(global_step=3336, training_loss=0.3550902876636679, metrics={'train_runtime': 2536.0242, 'train_samples_per_second': 10.518, 'train_steps_per_second': 1.315, 'total_flos': 2.486152377692467e+16, 'train_loss': 0.3550902876636679, 'epoch': 3.0})

In [None]:
trainer.save_model(eval_model_path)
tokenizer.save_pretrained(eval_model_path)

## Evaluation

In [8]:
from transformers import pipeline

In [9]:
mask_filler = pipeline(
    "fill-mask",
    eval_model_path,
    device=device,
    batch_size=16,
)

prompt = "The level of depression in this tweet is {}.\n{}"

predictions = mask_filler(
    [prompt.format(mask_filler.tokenizer.mask_token, x) for x in test['text'].tolist()],
    top_k=1,
    targets=['moderate', 'healthy', 'severe'],
    tokenizer_kwargs={'padding': 'max_length', 'truncation': True, 'max_length': 512}
)

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [10]:
label2idx = {'moderate': 0, 'healthy': 1, 'severe':2}

y_preds = [label2idx[each[0]['token_str'].strip()] for each in predictions]
y_true = [label2idx[each] for each in test['label'].tolist()]

weighted = f1_score(y_true, y_preds, average='weighted')
macro = f1_score(y_true, y_preds, average='macro')
accuracy = accuracy_score(y_true, y_preds)
print("Weighted F1 score: %.4f, Macro F1 score: %.4f, Accuracy: %.4f" % (weighted, macro, accuracy))

Weighted F1 score: 0.6160, Macro F1 score: 0.3786, Accuracy: 0.6743
