In [None]:
# Install required libraries
!pip install datasets transformers huggingface_hub -q

In [None]:
# Import key libraries and packages
import os

import numpy as np
import pandas as pd

from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Login to HF hub
# Authenticates this session with the Hugging Face Hub; the `push_to_hub`
# calls at the end of the notebook upload to a Hub repository.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Disable Weights & Biases
# NOTE: the Trainer log output in this notebook reports that the `WANDB_DISABLED`
# environment variable is deprecated (removal announced for transformers v5) and
# recommends controlling logging integrations through `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the datasets
# The data directory is named once so the location only has to be changed in one place.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook — confirm before a fresh run.
DATA_DIR = "/content/drive/MyDrive/waid/Azubi Africa BAP/LP5 - NLP/zindi_challenge/data"

train_df = pd.read_csv(os.path.join(DATA_DIR, "Train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "Test.csv"))

In [None]:
# View the training data info
# (column dtypes and non-null counts — a quick way to spot missing values)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [None]:
# Drop the rows with nulls from the training data
# A row is removed if ANY of its columns is null. The result is rebound to the
# name instead of using `inplace=True`, which pandas discourages and which
# offers no performance benefit.
train_df = train_df.dropna()

In [None]:
# Distribution of tweet sentiments
# Class balance of the sentiment labels (-1, 0, 1); the label helper further down
# annotates these as negative, neutral and positive respectively.
train_df["label"].value_counts()

 0.0    4908
 1.0    4053
-1.0    1038
Name: label, dtype: int64

## Fine-tuning the RoBERTa model

In [None]:
# Split the train data into train, eval
# 80/20 split, stratified on `label` so both subsets keep the same class
# proportions; `random_state` fixes the shuffle so the split is reproducible.
# NOTE(review): the name `eval` shadows Python's builtin `eval()` for the rest of
# the notebook; it is kept because a later cell refers to the frame by this name.
train, eval = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

In [None]:
# Instantiate the tokenizer
# Loaded from the same checkpoint as the model so vocabulary and special tokens match.
# `num_labels` is a model-config option, not a tokenizer option, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/08b4d993d880171203539b37794af581debb55ca/config.json
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  

In [None]:
# Save split data subsets
# Each subset is written without the pandas index so it can be re-read from CSV later.
subset_targets = (
    (train, "/content/drive/MyDrive/waid/Azubi Africa BAP/LP5 - NLP/zindi_challenge/data/training_subset.csv"),
    (eval, "/content/drive/MyDrive/waid/Azubi Africa BAP/LP5 - NLP/zindi_challenge/data/eval_subset.csv"),
)
for subset_frame, subset_path in subset_targets:
    subset_frame.to_csv(subset_path, index=False)

In [None]:
# Load the subsetted data
# The subset CSVs are produced by pandas `to_csv`, whose default encoding is UTF-8,
# so they are decoded as UTF-8 here. Reading them as ISO-8859-1 would turn every
# non-ASCII character in the tweets into mojibake before tokenization.
data = load_dataset("csv", 
                    data_files={"train": "/content/drive/MyDrive/waid/Azubi Africa BAP/LP5 - NLP/zindi_challenge/data/training_subset.csv",
                                "eval": "/content/drive/MyDrive/waid/Azubi Africa BAP/LP5 - NLP/zindi_challenge/data/eval_subset.csv"}, 
                    encoding="utf-8")



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-c3335e90ad7e503f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c3335e90ad7e503f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Define helper functions
## Function to transform labels
def transform_labels(label):
    """Map a raw sentiment label to the class id used for training.

    Mapping: -1 (negative) -> 0, 0 (neutral) -> 1, 1 (positive) -> 2.
    Floats that compare equal to these integers (e.g. ``-1.0``) are accepted.

    Parameters
    ----------
    label : mapping
        A single example; only its ``'label'`` entry is read.

    Returns
    -------
    dict
        ``{'labels': class_id}``.

    Raises
    ------
    ValueError
        If the raw label is not one of -1, 0 or 1. Previously such values
        (NaN, None, out-of-range numbers) were silently mapped to class 0,
        i.e. counted as negative tweets.
    """
    raw_label = label['label']
    class_ids = {-1: 0, 0: 1, 1: 2}  # negative, neutral, positive

    try:
        class_id = class_ids[raw_label]
    except (KeyError, TypeError):
        # KeyError: unknown value; TypeError: unhashable value such as a list.
        raise ValueError(
            f"Unexpected sentiment label {raw_label!r}; expected one of -1, 0, 1"
        ) from None

    return {'labels': class_id}

## Function to tokenize data
def tokenize_data(example):
    """Tokenize the ``'safe_text'`` entry of ``example`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens; the
    tokenizer's output is returned unchanged.
    """
    encoding_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example['safe_text'], **encoding_options)

In [None]:
# Columns that are dropped once the text is tokenized and the label is remapped
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']

# Tokenize the tweets in batches, then transform the labels example by example
# and limit the columns to what the model consumes
dataset = (
    data
    .map(tokenize_data, batched=True)
    .map(transform_labels, remove_columns=remove_columns)
)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7999 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",  # output directory for checkpoints
    num_train_epochs=5,
    # Evaluate and checkpoint once per epoch; both strategies have to match for
    # `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the accuracy reported by `compute_metrics`.
    # Without this the Trainer falls back to the lowest evaluation loss, which
    # reloads an early epoch even when a later epoch scores a higher accuracy.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Load the pretrained model
# (`AutoModelForSequenceClassification` is imported with the other libraries in the
# import cell at the top of the notebook.)
# The classification head has three outputs, one per sentiment class.
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/08b4d993d880171203539b37794af581debb55ca/config.json
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tra

In [None]:
# Define evaluation metrics
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of each
    example is the index of its largest logit along the last axis. Returns
    whatever ``metric.compute`` produces for those predictions and labels.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [None]:
# Instantiate the training and evaluation sets
# Both splits are shuffled with the same fixed seed so their order is reproducible.
train_dataset, eval_dataset = (
    dataset[split_name].shuffle(seed=24) for split_name in ("train", "eval")
)

In [None]:
# Collator that pads each batch to a common length using the tokenizer.
# (`DataCollatorWithPadding` is imported with the other libraries in the import
# cell at the top of the notebook.)
# The tweets are already padded to a fixed length at tokenization time, so the
# collator mainly assembles batches; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Instantiate the trainer
# Configured the same way as the Trainer used for the final evaluation: the
# tokenizer and data collator are passed explicitly, so the tokenizer files are
# saved next to the model weights in every checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

***** Running training *****
  Num examples = 7999
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 124647939


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6298,0.568754,0.7745
2,0.49,0.615174,0.79
3,0.3678,0.709918,0.796
4,0.2377,1.121533,0.7885
5,0.1272,1.277587,0.787


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to covid_tweets_sentiment_analysis_model/checkpoint-1000
Configuration saved in covid_tweets_sentiment_analysis_model/checkpoint-1000/config.json
Model weights saved in covid_tweets_sentiment_analysis_model/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to covid_tweets_sentiment_analysis_model/checkpoint-2000
Configuration saved in covid_tweets_sentiment_analysis_model/checkpoint-2000/config.json
Model weights saved in covid_tweets_sentiment_analysis_model/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to covid_tweets_sentiment_analysis_model/checkpoint-3000
Configuration saved in covid_tweets_sentiment_analysis_model/checkpoint-3000/config.json
Model weights saved in covid_tweets_sentiment_analysis_model/checkpoint-3000/pytorch_model

TrainOutput(global_step=5000, training_loss=0.38404733276367187, metrics={'train_runtime': 2001.7078, 'train_samples_per_second': 19.98, 'train_steps_per_second': 2.498, 'total_flos': 5261610571015680.0, 'train_loss': 0.38404733276367187, 'epoch': 5.0})

In [None]:
# Reinstantiate the trainer for evaluation
# `training_args` sets `load_best_model_at_end=True`, so after `trainer.train()`
# `model` is expected to hold the weights of the checkpoint the Trainer selected
# as best — that is the model evaluated here, not necessarily the last epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the final evaluation on `eval_dataset`; the returned metrics dict is the cell output
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.5687537789344788,
 'eval_accuracy': 0.7745,
 'eval_runtime': 28.134,
 'eval_samples_per_second': 71.088,
 'eval_steps_per_second': 8.886}

In [None]:
# Push model and tokenizer to HF Hub
# Both artefacts go to the same repository so it can be loaded as a single unit.
hub_repo_id = "KwameOO/covid-tweet-sentiment-analyzer-roberta"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

Configuration saved in /tmp/tmpop2ut3h5/config.json
Model weights saved in /tmp/tmpop2ut3h5/pytorch_model.bin
Uploading the following files to KwameOO/covid-tweet-sentiment-analyzer-roberta: config.json,pytorch_model.bin
tokenizer config file saved in /tmp/tmpffk34wn0/tokenizer_config.json
Special tokens file saved in /tmp/tmpffk34wn0/special_tokens_map.json
Uploading the following files to KwameOO/covid-tweet-sentiment-analyzer-roberta: vocab.json,special_tokens_map.json,merges.txt,tokenizer_config.json,tokenizer.json


CommitInfo(commit_url='https://huggingface.co/KwameOO/covid-tweet-sentiment-analyzer-roberta/commit/27d17019f0bb76be9f85867bc193bfc33cf6efac', commit_message='Upload tokenizer', commit_description='', oid='27d17019f0bb76be9f85867bc193bfc33cf6efac', pr_url=None, pr_revision=None, pr_num=None)