In [None]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.0-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.0
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Install the BertTweet Repo, needed for the tweet normalizer
from os import path
from os import path
if not path.exists('./BERTweet/'):
  !git clone https://github.com/VinAIResearch/BERTweet/
import sys
sys.path.append('/content/BERTweet')

In [None]:
# Use BERTweet to normalize all tweets, so that they match the BERTweet style
from TweetNormalizer import normalizeTweet

def normalize_tweet(tweet_text):
  """Normalize a tweet so it matches the BERTweet input style.

  The text is passed through BERTweet's ``normalizeTweet`` and afterwards the
  dataset's ``URL`` placeholder is rewritten to ``HTTPURL``.

  Args:
    tweet_text: raw tweet text (str).

  Returns:
    The normalized tweet text (str).
  """
  import re  # local import so the cell does not depend on another cell's imports

  normalized = normalizeTweet(tweet_text)
  # Only rewrite the stand-alone placeholder token. A plain str.replace would
  # also hit the 'URL' inside an existing 'HTTPURL' (giving 'HTTPHTTPURL') and
  # inside ordinary words that merely contain those letters.
  return re.sub(r'\bURL\b', 'HTTPURL', normalized)

In [None]:
# Create the tokenizer
from transformers import AutoTokenizer

# Pretrained tokenizer for the 'vinai/bertweet-base' checkpoint
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')

def tokenize(tweet):
  """Normalize ``tweet['text']`` and encode it with the module-level tokenizer.

  The encoding is padded to the tokenizer's maximum length and truncated when
  it is longer, so every example ends up with the same length.

  Args:
    tweet: mapping with a ``'text'`` entry holding the tweet text.

  Returns:
    Whatever the tokenizer call returns for the normalized text.
  """
  cleaned_text = normalize_tweet(tweet['text'])
  return tokenizer(cleaned_text, padding='max_length', truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Create function to run after every epoch to calculate the metrics
def compute_metrics(p):
  """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation.

  Args:
    p: a ``(predictions, labels)`` pair. ``predictions`` holds one row of
      per-class scores per example; ``labels`` holds the true class ids.

  Returns:
    dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
  """
  scores, labels = p
  # Predicted class = index of the highest score in each row
  pred = np.argmax(scores, axis=1)

  # zero_division=0 keeps the numeric result scikit-learn already falls back to
  # when a class is never predicted (or never present), but states it
  # explicitly instead of raising an UndefinedMetricWarning on every
  # evaluation pass.
  macro = {"average": "macro", "zero_division": 0}

  accuracy = accuracy_score(y_true=labels, y_pred=pred)
  recall = recall_score(y_true=labels, y_pred=pred, **macro)
  precision = precision_score(y_true=labels, y_pred=pred, **macro)
  f1 = f1_score(y_true=labels, y_pred=pred, **macro)

  return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Create function to transform the labels to numbers
def transform_labels(tweet, encoder):
  """Attach the encoded class id of a tweet under the ``'label'`` key.

  Args:
    tweet: mapping with an ``'abusive_offensive_not'`` entry; it is modified
      in place.
    encoder: object whose ``transform`` method maps a list of raw labels to a
      sequence of encoded labels.

  Returns:
    The same ``tweet`` object, now carrying a ``'label'`` entry.
  """
  raw_label = tweet['abusive_offensive_not']
  # transform works on a batch, so wrap the single label and unwrap the result
  tweet['label'] = encoder.transform([raw_label])[0]
  return tweet

In [None]:
# Prepare the dataset
from datasets import load_dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the splits (and later shuffles) are reproducible
SEED = 42

# Read the CSV; everything is loaded into a single 'train' split
dataset = load_dataset('csv', data_files='train_data_offensive_abusive_taskC.csv')

# Normalize and tokenize every tweet
dataset = dataset.map(tokenize)

# Fit the encoder on the raw label column, then add a numeric 'label' column
le = LabelEncoder().fit(dataset['train']['abusive_offensive_not'])
dataset = dataset.map(lambda x: transform_labels(x, le))

# Drop the columns the model does not consume
dataset = dataset.remove_columns(['id', 'text', 'abusive_offensive_not'])

# Inspect the remaining features
print(dataset)

# First hold out 20% as the test set, then hold out 20% of the remainder
# as the validation set
train_test_ds = dataset['train'].train_test_split(test_size=0.2, seed=SEED)
train_valid_ds = train_test_ds['train'].train_test_split(test_size=0.2, seed=SEED)

# Gather the three splits under explicit names
ds = DatasetDict({
  'train': train_valid_ds['train'],
  'validation': train_valid_ds['test'],
  'test': train_test_ds['test'],
})

# Split sizes
print(ds)

# First five training examples
print(ds['train'][:5])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 6817
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 4362
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1091
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1364
    })
})
{'input_ids': [[0, 5, 6906, 980, 686, 1061, 54609, 2237, 22333, 818, 23585, 18, 1851, 450, 22536, 1178, 1131, 4580, 1035, 37926, 5406, 12663, 1384, 5284, 7271, 11506, 818, 2375, 806, 9880, 35430, 581, 2911, 520, 986, 14425, 14984, 7, 1384, 4934, 1059, 818, 2231, 1143, 15, 4203, 2115, 1604, 1851, 450, 22536, 1178, 3047, 2177, 6196, 17, 6754, 29549, 510, 49138, 7, 1930, 701, 650, 8772, 1851, 30318, 818, 69, 1849, 31830, 5559, 69, 4, 2, 1, 1, 1,

In [None]:
# Shuffle each split with the shared seed and bind it to its own name
train_data, validation_data, test_data = (
  ds[split_name].shuffle(seed=SEED)
  for split_name in ('train', 'validation', 'test')
)

In [None]:
# Model
from transformers import AutoModelForSequenceClassification, TrainingArguments

# BERTweet encoder with a freshly initialised 3-way classification head
model = AutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=3)

training_args = TrainingArguments(
    output_dir='test_trainer_1',
    logging_dir='logs',
    evaluation_strategy="epoch",
    # Checkpoint every epoch (must match the evaluation strategy) so the best
    # epoch can be restored: validation loss starts rising after a few epochs
    # while training continues, so the last epoch is not the best model.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Keep disk usage bounded; the best checkpoint is always retained
    save_total_limit=2,
    # Report the training loss once per epoch instead of only every 500 steps
    logging_strategy="epoch",
    learning_rate=0.00005,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0,
    num_train_epochs=10,
    # Same seed as the data splits, stated explicitly for replicability
    seed=SEED,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer

# Wire the model, arguments, data splits and metric callback together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)

# Fine-tune; the returned training summary is shown as the cell output
train_output = trainer.train()
train_output

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.789392,0.670027,0.396659,0.464313,0.427697
2,No log,0.776812,0.696609,0.436766,0.46088,0.437901
3,No log,0.814954,0.657195,0.506501,0.545077,0.489899
4,No log,0.757134,0.732356,0.607254,0.548638,0.551262
5,No log,0.890994,0.692942,0.530039,0.564589,0.515057
6,No log,0.829886,0.712191,0.604107,0.559851,0.572714
7,No log,0.894314,0.708524,0.619593,0.570512,0.578465
8,0.546100,0.933894,0.705775,0.603475,0.597947,0.597059
9,0.546100,0.976636,0.708524,0.589907,0.571803,0.578508
10,0.546100,1.006864,0.712191,0.603709,0.577539,0.585387


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=690, training_loss=0.4548017916472062, metrics={'train_runtime': 304.5354, 'train_samples_per_second': 143.235, 'train_steps_per_second': 2.266, 'total_flos': 2869251820323840.0, 'train_loss': 0.4548017916472062, 'epoch': 10.0})

In [None]:
# Evaluate on the held-out test split.
# Trainer.predict puts the model it wraps into eval mode itself, so this cell
# no longer touches the separate `model` global (which raised a NameError when
# that variable was not in the kernel).
test_predictions = trainer.predict(test_data)
x = test_predictions  # original variable name, kept for any code that uses it

# Show only the summary metrics rather than the full prediction arrays
test_predictions.metrics

NameError: name 'model' is not defined

In [None]:
# Save the model

from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My\ Drive/Uni-23-24/Machine\ Learning\ Project/Final\ Assignment/models/

# trainer.save_pretrained("bert_tweet_finetuned_1")
# model.save_pretrained ("bert_tweet_finetuned_1_model")

trainer.save_model("gdrive/My Drive/Uni-23-24/Machine\ Learning\ Project/Final\ Assignment/models/")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Uni-23-24/Machine Learning Project/Final Assignment/models
