# RoBERTa
This notebook aims to use transfer learning on a RoBERTa model to perform text classification and detect suicidal text.

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -qqq transformers datasets wandb
!pip install transformers[torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
# Import packages
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

In [None]:
# Specify GPU
device = torch.device("cuda")

In [None]:
# Change to own directory
try:
    os.chdir("/content/drive/MyDrive/MyProject_SIDetection")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


## Define constants

In [None]:
# Define constants
EPOCHS = 1
BATCH_SIZE = 6
LEARNING_RATE = 1e-5
SEED = 4222

MODEL_SAVE_PATH = "Models/roberta"
MODEL_CHECKPOINT_PATH = "Models/roberta_checkpoint"
MODEL_LOGGING_PATH = "Models/roberta_checkpoint/logs"

WANDB_ENTITY = "s2120973"
WANDB_PROJECT = "SI-P2-MM"
WANDB_RUN = "roBERTa"

## Load dataset

In [None]:
# Load dataset
df = pd.read_csv('Data_Cleaned_Transformer.csv', header=0)
df = df[df[['text']].notnull().all(1)]
df

Unnamed: 0,text,label
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over. So I can never he...,0
3,i need helpjust help me im crying so hard,1
4,"I’m so lostHello, my name is Adam 16 and I’ve ...",1
...,...,...
232069,If you don't like rock then your not going to ...,0
232070,You how you can tell i have so many friends an...,0
232071,pee probably tastes like salty tea smirking fa...,0
232072,The usual stuff you find hereI'm not posting t...,1


In [None]:
# Split dataset into train, validation and test sets
train, temp = train_test_split(df,
                               random_state=SEED,
                               test_size=0.2,
                               stratify=df['label'])

val, test = train_test_split(temp,
                             random_state=SEED,
                             test_size=0.5,
                             stratify=temp['label'])

## Load RoBERTa Model

In [None]:
# Load RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def dataset_conversion(train, test, val):
  """Converts pandas dataframe to Dataset."""

  train.reset_index(drop=True, inplace=True)
  test.reset_index(drop=True, inplace=True)
  val.reset_index(drop=True, inplace=True)

  train_dataset = Dataset.from_pandas(train)
  test_dataset = Dataset.from_pandas(test)
  val_dataset = Dataset.from_pandas(val)

  return DatasetDict({"train": train_dataset,
                      "test": test_dataset,
                      "val": val_dataset})

raw_datasets = dataset_conversion(train, test, val)

In [None]:
def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/185626 [00:00<?, ? examples/s]

Map:   0%|          | 0/23204 [00:00<?, ? examples/s]

Map:   0%|          | 0/23203 [00:00<?, ? examples/s]

In [None]:
# Tokenise datasets
SAMPLE_SIZE = 20
small_train_dataset = tokenized_datasets["train"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_val_dataset = tokenized_datasets["val"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))

full_train_dataset = tokenized_datasets["train"]
full_test_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]

In [None]:
# Import roBERTa-base pretrained model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Login wandb
wandb.login()

#d83e0f9a4cf3e57b6ce8eeb9743b6c51b6606e2f

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Initialise wandb
wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=WANDB_RUN)

[34m[1mwandb[0m: Currently logged in as: [33ms2120973[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from scipy.special import softmax

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Calculate probabilities using softmax
    probabilities = softmax(logits, axis=1)[:, 1]  # Probabilities for the positive class

    # Load metrics
    metric_acc = load_metric("accuracy")
    metric_rec = load_metric("recall")
    metric_pre = load_metric("precision")
    metric_f1 = load_metric("f1")
    metric_auc = load_metric("roc_auc")

    # Convert logits to discrete predictions
    predictions = np.argmax(logits, axis=-1)

    # Calculate standard metrics
    accuracy = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    recall = metric_rec.compute(predictions=predictions, references=labels)["recall"]
    precision = metric_pre.compute(predictions=predictions, references=labels)["precision"]
    f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]

    # AUC calculation with probabilities, adjusting argument names
    auc = metric_auc.compute(prediction_scores=probabilities, references=labels)["roc_auc"]

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1, "auc": auc}

In [None]:
# Define model and training parameters
training_args = TrainingArguments(
    output_dir=MODEL_CHECKPOINT_PATH,
    overwrite_output_dir = True,
    report_to = 'wandb',
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    # evaluation_strategy="epoch",
    run_name=WANDB_RUN,
    logging_dir=MODEL_LOGGING_PATH,
    save_strategy="steps",
    save_steps=1500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## Pre-trained RoBERTa

In [None]:
# Predict before fine-tuning
trainer.predict(full_test_dataset).metrics

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  metric_acc = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.7020664215087891,
 'test_accuracy': 0.4999138079641441,
 'test_recall': 0.0,
 'test_precision': 0.0,
 'test_f1': 0.0,
 'test_auc': 0.24929801257592507,
 'test_runtime': 240.85,
 'test_samples_per_second': 96.342,
 'test_steps_per_second': 16.06}

## Fine-tuned RoBERTa

In [None]:
# To observe training progress live
%wandb

In [None]:
# Fine-tune model
trainer.train()
# Resume fine-tuning from checkpoint
# trainer.train(MODEL_CHECKPOINT_PATH + "/" + "checkpoint-10500")

Step,Training Loss
500,0.3285
1000,0.2244
1500,0.2112
2000,0.1503
2500,0.1559
3000,0.1655
3500,0.1545
4000,0.1575
4500,0.1214
5000,0.1265


TrainOutput(global_step=30938, training_loss=0.11751769424232317, metrics={'train_runtime': 6548.4121, 'train_samples_per_second': 28.347, 'train_steps_per_second': 4.725, 'total_flos': 4.884025276225536e+16, 'train_loss': 0.11751769424232317, 'epoch': 1.0})

In [None]:
# Save fine-tuned model
trainer.save_model(MODEL_SAVE_PATH)

In [None]:
# Evaluate fine-tuned model
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

{'eval_loss': 0.09456352144479752,
 'eval_accuracy': 0.9805628582510882,
 'eval_recall': 0.9798328018615875,
 'eval_precision': 0.9812704988779561,
 'eval_f1': 0.9805511233774635,
 'eval_auc': 0.9979496681892613,
 'eval_runtime': 243.7824,
 'eval_samples_per_second': 95.179,
 'eval_steps_per_second': 15.867,
 'epoch': 1.0}

In [None]:
# Predict after fine-tuning
trainer.predict(full_test_dataset).metrics

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

{'test_loss': 0.09272676706314087,
 'test_accuracy': 0.9808222720220652,
 'test_recall': 0.9819027921406411,
 'test_precision': 0.9797918995614412,
 'test_f1': 0.9808462101321396,
 'test_auc': 0.998069675736072,
 'test_runtime': 249.8088,
 'test_samples_per_second': 92.887,
 'test_steps_per_second': 15.484}

In [None]:
# Terminate wandb run
wandb.finish()

VBox(children=(Label(value='127.680 MB of 127.680 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁
eval/auc,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.98056
eval/auc,0.99795
eval/f1,0.98055
eval/loss,0.09456
eval/precision,0.98127
eval/recall,0.97983
eval/runtime,243.7824
eval/samples_per_second,95.179
eval/steps_per_second,15.867
train/epoch,1.0


In [None]:
def get_training_history(wandb_run):
  """Extract key metrics from training and eval from wandb run data."""

  # Get training history from wandb
  api = wandb.Api()
  run = api.run(wandb_run)
  history = run.history()

  # Rename columns
  train_column_dict = {'train/epoch': 'epoch', 'train/loss': 'training_loss'}
  val_column_dict = {'train/epoch': 'epoch', 'eval/loss': 'validation_loss', 'eval/accuracy': 'accuracy',
                'eval/precision': 'precision', 'eval/recall': 'recall', 'eval/f1': 'f1','eval/auc':'auc'}

  # Train data
  train_history = history[list(train_column_dict.keys())]
  train_history.columns = [train_column_dict.get(x, x) for x in train_history.columns]
  train_history = train_history.dropna()

  # Val data
  val_history = history[list(val_column_dict.keys())]
  val_history.columns = [val_column_dict.get(x, x) for x in val_history.columns]
  val_history = val_history.dropna()

  return pd.merge(train_history, val_history, how="right", on="epoch")

# Get dataframe for training history
WANDB_RUN_ID = "om0kbhnu" # Replace with your wandb run details, found in the training cell

training_history = get_training_history(WANDB_ENTITY + "/" + WANDB_PROJECT + "/" + WANDB_RUN_ID)
training_history

Unnamed: 0,epoch,training_loss,validation_loss,accuracy,precision,recall,f1,auc
0,1.0,,0.094564,0.980563,0.98127,0.979833,0.980551,0.99795


In [None]:
# Load fine-tuned model
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Load trainer after fine-tune
saved_trainer = Trainer(
    model=saved_model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Predict after fine-tuning
saved_trainer.predict(full_test_dataset).metrics

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datase

{'test_loss': 0.09272676706314087,
 'test_accuracy': 0.9808222720220652,
 'test_recall': 0.9819027921406411,
 'test_precision': 0.9797918995614412,
 'test_f1': 0.9808462101321396,
 'test_auc': 0.998069675736072,
 'test_runtime': 243.1352,
 'test_samples_per_second': 95.437,
 'test_steps_per_second': 15.909}

## GPU Memory Utilities

In [None]:
# Delete variables and empty cache
del trainer
del model
torch.cuda.empty_cache()

In [None]:
# Python garbage collection
import gc
gc.collect()

38868

In [None]:
# Check memory allocation
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

517586432
2145386496


In [None]:
# Check memory summary
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 505455 KiB |   5302 MiB | 608692 GiB | 608691 GiB |
|       from large pool | 504960 KiB |   5300 MiB | 605246 GiB | 605246 GiB |
|       from small pool |    495 KiB |      3 MiB |   3445 GiB |   3445 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 505455 KiB |   5302 MiB | 608692 GiB | 608691 GiB |
|       from large pool | 504960 KiB |   5300 MiB | 605246 GiB | 605246 GiB |
|       from small pool |    495 KiB |      3 MiB |   3445 GiB |   3445 GiB |
|---------------------------------------------------------------

In [None]:
# Check GPU allocation and acprocesses
!nvidia-smi

Mon Jan  1 14:17:58 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0              41W / 300W |   2430MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    