# Install and import the required packages

In [None]:
!pip install numpy transformers==4.23.1 pandas scikit-learn torch matplotlib datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
import json
import numpy as np
import matplotlib
import pandas as pd
import torch 
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load pre-trained model, define data folder path and dataset file name

In [None]:
model_shortcut = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_shortcut)
model = AutoModelForSequenceClassification.from_pretrained(model_shortcut, num_labels=3)

DATA_PATH = '/content/drive/MyDrive/NLU Bert Spam Classification/data/'
DATASET_FILENAME = 'combinedmorespam-dataset-labelled'

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/4dc145c5bd4fdb672dcded7fdc1efd6c2bc55992/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/4dc145c5bd4fdb672dcded7fdc1efd6c2bc55992/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/4dc145c5bd4fdb672dcded7fdc1efd6c2bc

In [None]:
def get_topics_from_csv(file_path):
  """Retrieves list of topic keywords from a topic csv file
  Args:
    file_path (str): file path to topic csv file
  Returns:
    topic_dict (dict): dictionary mapping from video_id to a list of topic keywords
  """
  # Dataframe of csv file
  df = pd.read_csv(file_path)

  # Create a dictionary mapping from video_id to their topics
  topic_dict = dict()
  video_ids = list(df['video_id'].unique())
  for video_id in video_ids:
    query = f"video_id=='{video_id}'"
    topic_keywords = df.query(query)['topic_keywords'].iloc[0]
    topic_dict[video_id] = topic_keywords

  return topic_dict

def text_to_label(cls):
  """Converts class to an integer label
  Args:
    cls (str): class
  Returns:
    label (int): integer label in set {0,1,2}, returns -1 if there is an unidentifiable class
  """
  if cls == "spam":
    return 0
  elif cls == "neutral":
    return 1
  elif cls == "ham":
    return 2
  else:
    return -1

def parse_comment(comment):
  """Parses a comment in a format suitable for the custom tokenizer
  Args:
    comment (str): raw comment string in the comment retrieval format
  Returns:
    parsed_comment (str): parsed string
  """  
  if '[REPLY]' in comment:
    return comment
  else:
    return f"[MAIN] {comment}"
    
def get_comments_from_csv(file_path):
  """Retrieve comment data from a specified .csv file
  Args:
      file_name (str): .csv file name
  Returns:
      comments_by_videoid (dict): dictionary mapping from video ID to its list of comments
      video_name_dict (dict): dictionary mapping from video ID its video name
  """
  if file_path[-4:] != '.csv':
      file_path += '.csv'

  df = pd.read_csv(file_path)
  info_dict = dict()
  video_ids = list(df['video_id'].unique())
  for video_id in video_ids:
      query = f"video_id=='{video_id}'"
      comments = list(df.query(query)['comment'])
      usernames = list(df.query(query)['username'])
      labels = list(df.query(query)['class'])

      info = []
      for i in range(0,len(comments)):
        label = text_to_label(labels[i])
        if label == -1:
          continue
        info.append([f"[USER] {usernames[i]} {parse_comment(comments[i])} ",label])

      info_dict[video_id] = info
      
  return info_dict

def create_dataset(data_path,dataset_filename):
  """Create a dataset from a .csv file
  Args:
      data_path (str): path to the data folder
      dataset_filename (str): name of the dataset file
  Returns:
      dataset (list): dataset where each data item contains a sentence pair (comment, topic keywords)
      labels (list): list of labels corresponding to the indices in the dataset
  """
  topic_dict = get_topics_from_csv(f"{data_path}/topics/{dataset_filename}-topics.csv")
  info_dict = get_comments_from_csv(f"{data_path}/labelled/{dataset_filename}")

  video_ids = topic_dict.keys()
  dataset = []
  labels = []
  for video_id in video_ids:
    for comment_data in info_dict[video_id]:
      data_sample = (comment_data[0],topic_dict[video_id])
      dataset.append(data_sample)
      labels.append(comment_data[1])

  return dataset, labels

# Add custom tokens to tokenizer
num_added_toks = tokenizer.add_tokens(['[USER]','[MAIN]','[REPLY]'], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

def tokenize_data(data):
  comments, topic_keywords = zip(*data)
  return tokenizer(comments,topic_keywords, padding="max_length", truncation='only_first')

# Import dataset from file path

In [None]:
# Define dataset and labels
dataset, labels = create_dataset(DATA_PATH,DATASET_FILENAME)

# Split into train and test data splits
train_data, test_data, train_labels, test_labels = train_test_split(dataset, labels, test_size=0.2, random_state=50)

# Encode train and test data
train_encodings = tokenize_data(train_data)
test_encodings = tokenize_data(test_data)


In [None]:
# Show examples of spam
idx = 0
while train_labels[idx] != 0 or test_labels[idx] != 0:
  idx += 1
print(train_labels[idx])
print(tokenizer.decode(train_encodings[idx].ids))
print(test_labels[idx])
print(tokenizer.decode(test_encodings[idx].ids))

# Show other examples
idx = 12
print(train_labels[idx])
print(tokenizer.decode(train_encodings[idx].ids))
print(test_labels[idx])
print(tokenizer.decode(test_encodings[idx].ids))


0
[CLS] [USER] [UNK] [UNK] = [UNK] [UNK] [UNK] [UNK] [MAIN] * Let ’ s just take a minute to appreciate how much time and work he put into these videos? * * It's unbelievable, and I think they deserve a lot more than that * [UNK] [UNK] : 08 [SEP] pie, kind, kreekcraft, anti, lol, dedication, baller, ever, video, content, honest, roblox, unbelievable, appreciate, dad, bruh, lot, kid, biggest, reply, vantherz, work, hilarious, much, touch, take, hell, update, people, cheat, grass, let, least, respect, webcam, love, cutie, problem, bot, bro, sssniperwolf, time, day, main, fake, minute, recent, moment [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

# Prepare dataset and Trainer arguments for training

In [None]:
# Define PyTorch Dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [None]:
training_args = TrainingArguments(
    optim="adamw_torch",
    output_dir='/content/drive/MyDrive/NLU Bert Spam Classification/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/NLU Bert Spam Classification/logs',            # directory for storing logs
    logging_steps=100,
    save_steps=100,
    learning_rate=2e-5
    eval_steps=100
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

# Fine-tune the model to the custom dataset

In [None]:
trainer.train('/content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-400')

Loading model from /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-400.
***** Running training *****
  Num examples = 3538
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 666
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 400
  Will skip the first 1 epochs then the first 178 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/178 [00:00<?, ?it/s]

Step,Training Loss
500,0.325
600,0.2993


Saving model checkpoint to /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-500
Configuration saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-600
Configuration saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-600/config.json
Model weights saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/checkpoint-600/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=666, training_loss=0.12345781698599234, metrics={'train_runtime': 9169.686, 'train_samples_per_second': 1.158, 'train_steps_per_second': 0.073, 'total_flos': 1406034043508736.0, 'train_loss': 0.12345781698599234, 'epoch': 3.0})

# Save the model

In [None]:
trainer.save_model('/content/drive/MyDrive/NLU Bert Spam Classification/results/')

Saving model checkpoint to /content/drive/MyDrive/NLU Bert Spam Classification/results/
Configuration saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/config.json
Model weights saved in /content/drive/MyDrive/NLU Bert Spam Classification/results/pytorch_model.bin


# Evaluate the model on the validation set

In [21]:
# Load saved model and training arguments
saved_model = DistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/NLU Bert Spam Classification/results/model_0")
training_args = torch.load("/content/drive/MyDrive/NLU Bert Spam Classification/results/model_0/training_args.bin")

# Define evaluation metric
metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average=None)

# Retrieve validation set
eval_data, eval_labels = create_dataset(DATA_PATH,'combined-test-labelled')
eval_encodings = tokenize_data(eval_data)
eval_dataset = CustomDataset(eval_encodings, eval_labels)

# Evaluate model on the validation set
trainer = Trainer(
    model=saved_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

loading configuration file /content/drive/MyDrive/NLU Bert Spam Classification/results/model_0/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 28999
}

loading weights file /content/drive/MyDrive/NLU Bert Spam Classification

Trainer is attempting to log a value of "[0.44444444 0.50340136 0.89235127]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.6107633709907532,
 'eval_f1': array([0.44444444, 0.50340136, 0.89235127]),
 'eval_runtime': 581.6001,
 'eval_samples_per_second': 0.772,
 'eval_steps_per_second': 0.014}