<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/bert_sentiment_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets --upgrade
!pip install transformers[torch]
!pip install evaluate
! pip install accelerate -U

In [None]:
# Mount Google Drive so the files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Drive folder that holds the Covid tweets data and list its CSV files
%cd /content/drive/MyDrive/TextMining/DataSets/Covid_Tweets/
%ls *.csv

/content/drive/MyDrive/TextMining/DataSets/Covid_Tweets
Corona_NLP_test.csv  Corona_NLP_train.csv


In [None]:
from datasets import load_dataset
# Data source: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification
# Read the train and test CSV files from the current directory into one DatasetDict
# with 'train' and 'test' splits; the files are decoded as ISO-8859-1.
dataset = load_dataset('csv', data_files={'train': 'Corona_NLP_train.csv', 'test': 'Corona_NLP_test.csv'}, encoding = "ISO-8859-1")

In [None]:
# documentation datasets https://huggingface.co/docs/datasets/index
dataset


DatasetDict({
    train: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment'],
        num_rows: 41157
    })
    test: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment'],
        num_rows: 3798
    })
})

In [None]:
dataset['train'][:5] # ['OriginalTweet']

In [None]:
# Load the tokenizer that belongs to the checkpoint we are going to fine-tune
from transformers import AutoTokenizer
import torch
# Use the first GPU when one is visible, otherwise fall back to the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Optional CUDA allocator settings, kept for reference (not applied by this cell):
# PYTORCH_NO_CUDA_MEMORY_CACHING=1
# PYTORCH_CUDA_ALLOC_CONF = max_split_size_mb:128
base_model = 'distilbert-base-cased' # alternative: "bert-base-cased"; distilbert-base-cased is a smaller model
tokenizer = AutoTokenizer.from_pretrained(base_model) # "cased" = the model was trained to recognize capitalization (vs. bert-base-uncased)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Example of what tokenization does, using one training tweet
print(dataset['train'][10]["OriginalTweet"])
print(len(dataset['train'][10]["OriginalTweet"].split(' ')))  # number of space-separated words
test_token = tokenizer(dataset['train'][10]["OriginalTweet"])
print(test_token.keys())
print(len(test_token['input_ids']))  # number of tokens, which is larger than the number of words
print(test_token['input_ids'])

# input_ids = ids of the tokens (pieces of words) that make up the text
# token_type_ids = all 0's if the input is one text, 0's followed by 1's if the input is a two-sentence text;
#                  this tokenizer does not return them (see the keys printed above)
# attention_mask = 1 for real tokens, 0 for padding tokens the model should ignore

All month there hasn't been crowding in the supermarkets or restaurants, however reducing all the hours and closing the malls means everyone is now using the same entrance and dependent on a single supermarket. #manila #lockdown #covid2019 #Philippines https://t.co/HxWs9LAnF9
39
dict_keys(['input_ids', 'attention_mask'])
73
[101, 1398, 2370, 1175, 8186, 112, 189, 1151, 3515, 1158, 1107, 1103, 20247, 1116, 1137, 7724, 117, 1649, 7914, 1155, 1103, 2005, 1105, 5134, 1103, 8796, 1116, 2086, 2490, 1110, 1208, 1606, 1103, 1269, 3448, 1105, 7449, 1113, 170, 1423, 20247, 119, 108, 1299, 8009, 108, 5842, 5455, 108, 1884, 18312, 10973, 16382, 108, 4336, 18630, 131, 120, 120, 189, 119, 1884, 120, 145, 1775, 2924, 1116, 1580, 10783, 1179, 2271, 1580, 102]


In [None]:
# Tokenize the whole dataset. Padding is deliberately NOT done here: the
# DataCollatorWithPadding handed to the Trainer pads every batch to its own longest
# sequence, which is much cheaper than storing and training on rows padded at map time.
def tokenize_data(example):
    """Tokenize a batch of tweets for `Dataset.map(..., batched=True)`.

    Args:
        example: a batch from the dataset; `example['OriginalTweet']` holds the
            tweet texts.

    Returns:
        The tokenizer output (`input_ids` and `attention_mask`) as plain Python
        lists. Texts longer than the tokenizer's maximum length are truncated.

    No tensors are created and nothing is moved to the GPU here: `datasets`
    stores the result in its own on-disk format anyway, and the Trainer places
    each collated batch on the right device during training.
    """
    return tokenizer(example['OriginalTweet'], truncation = True)

dataset = dataset.map(tokenize_data, batched = True)  # tokenizes all the data

In [None]:
print(type(dataset['train'][1]['input_ids'])) # attention_mask is 1 for non padding tokens only, 0 for padding tokens
print(len(dataset['train'][1]['input_ids']))

<class 'list'>
512


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment', 'input_ids', 'attention_mask'],
        num_rows: 41157
    })
    test: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment', 'input_ids', 'attention_mask'],
        num_rows: 3798
    })
})

In [None]:
# transform the labels from words to numbers
def labels2num(label):
    """Map the textual sentiment of one example to its integer class id.

    Args:
        label: one dataset example (a mapping) with a 'Sentiment' entry.

    Returns:
        A dict with a single key, 'labels', holding the class id (0-4).

    Raises:
        KeyError: if the sentiment is not one of the five known classes.
    """
    sentiment_names = ('Positive', 'Negative', 'Neutral',
                       'Extremely Positive', 'Extremely Negative')
    # The position of a name in the tuple is its class id
    sentiment_to_id = {name: class_id for class_id, name in enumerate(sentiment_names)}
    return {'labels': sentiment_to_id[label['Sentiment']]}

print(labels2num(dataset['train'][1]))

{'labels': 0}


In [None]:
# Add the integer 'labels' column and drop the original CSV columns,
# so only the tokenizer outputs and the labels remain
remove_columns = ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
dataset = dataset.map(labels2num, remove_columns=remove_columns)

In [None]:
dataset # see it added 'labels' feature and removed the rest

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 41157
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3798
    })
})

In [None]:
len(dataset['train']['input_ids'][1:3])

2

In [None]:
# set format of the dataset to torch
dataset.set_format(type="torch", columns=["input_ids",  "attention_mask", "labels"]) # "token_type_ids",
dataset['train'].format['type']

'torch'

In [None]:
# Set the training and validation datasets.
# Shuffle once and take both splits from the same permutation. This selects exactly
# the same rows as shuffling twice with the same seed, without doing the work twice,
# and makes it obvious that the two index ranges cannot overlap.
shuffled_train = dataset['train'].shuffle(seed=10)
train_dataset = shuffled_train.select(range(40000))
eval_dataset  = shuffled_train.select(range(40000, 41000)) # validation dataset

In [None]:
from transformers import DefaultDataCollator, DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer) # only if you did not pad the data with the tokenizer

In [None]:
# load training arguments, trainer, AutoModel
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

#data_collator = DataCollatorWithPadding(tokenizer) # use only if you did not pad the data with the tokenizer
# base_model = "bert-base-cased" # already defined above
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels = 5).to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the model architecture. (`model.parameters` without a call is a bound method,
# so printing it only displayed the architecture as part of the method's repr.)
print(model)

<bound method Module.parameters of DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropo

In [None]:


# Hyper-parameters for fine-tuning with the Hugging Face Trainer
training_args = TrainingArguments(
    output_dir  = 'OutputTrain', # output directory (checkpoints are written here)
    num_train_epochs = 3,
    eval_strategy = "epoch",     # evaluate on the validation set after every epoch
    learning_rate = 2e-5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    weight_decay = 0.01,
    push_to_hub = False,
    # The string "none" switches off every logging integration (wandb, tensorboard, ...).
    # Python `None` is not the same thing: transformers 4.x treats it as "not set" and
    # falls back to reporting to all installed integrations.
    report_to = "none"
)
# With report_to="none" the Trainer never starts a wandb run, so the former
# `import wandb; wandb.init(mode='disabled')` workaround is no longer needed.

In [None]:
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object

In [None]:
train_dataset[1:3]['labels'].shape # 'input_ids'

torch.Size([2])

In [None]:
# evaluate the model
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: a (logits, labels) pair as handed over by the Trainer.

    Returns:
        One dict that merges the results of the accuracy, precision, recall
        and f1 metrics loaded above.
    """
    logits, labels = eval_pred
    # The predicted class is the one with the highest logit
    predictions = np.argmax(logits, axis=-1)
    scores = dict(accuracy.compute(predictions=predictions, references=labels))
    # The three per-class metrics are averaged over classes, weighted by class frequency
    for averaged_metric in (precision, recall, f1):
        scores.update(
            averaged_metric.compute(predictions=predictions, references=labels, average='weighted')
        )
    return scores


In [None]:
# Bundle the model, hyper-parameters, data splits, collator and metric function into a Trainer
trainer = Trainer(
    model= model,
    args = training_args,
    train_dataset= train_dataset,
    eval_dataset = eval_dataset,
    data_collator= data_collator, # DataCollatorWithPadding: pads the inputs of each batch
    compute_metrics = compute_metrics, # called on the (logits, labels) of every evaluation
    # tokenizer = tokenizer - no longer used
    )



In [None]:
# test compute_metrics
trainer.evaluate()

In [None]:
!pip install pynvml
from pynvml import *

def print_gpu_utilization():
    """Print how much memory is currently in use on GPU 0, in MB.

    NVML is initialised for the duration of the query and shut down again
    afterwards, so repeated calls do not leave NVML sessions open.
    """
    # Explicit local import: the function does not depend on the star import above
    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                        nvmlInit, nvmlShutdown)

    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above even if the query fails
        nvmlShutdown()


def print_summary(result):
    """Print the runtime and throughput of a finished training run, then the GPU memory in use.

    Args:
        result: an object whose `metrics` mapping contains 'train_runtime' and
            'train_samples_per_second' (presumably the value returned by
            `trainer.train()` - confirm at the call site).
    """
    run_stats = result.metrics
    runtime = run_stats['train_runtime']
    throughput = run_stats['train_samples_per_second']
    print(f"Time: {runtime:.2f}")
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

print_gpu_utilization()

GPU memory occupied: 5985 MB.


In [None]:
# Fine-tune the model. Step arithmetic: 1875 steps * 64 samples per batch = 120,000 samples
# seen over 3 epochs, i.e. 40,000 training samples per epoch.
trainer.train()

In [None]:
# check results of the trained model
trainer.evaluate(dataset['test'])

In [None]:
# Store human-readable class names in the model config so they are saved with the model.
# The position of each name in the list is its class id.
sentiment_names = ['Positive', 'Negative', 'Neutral', 'Extremely Positive', 'Extremely Negative']
model.config.id2label = dict(enumerate(sentiment_names))
model.config.label2id = {name: class_id for class_id, name in enumerate(sentiment_names)}

In [None]:
# save model after training
model.save_pretrained("OutputTrain/model_11_21_24")
tokenizer.save_pretrained("OutputTrain/tokenizer_11_21_24")

('OutputTrain/tokenizer_11_21_24/tokenizer_config.json',
 'OutputTrain/tokenizer_11_21_24/special_tokens_map.json',
 'OutputTrain/tokenizer_11_21_24/vocab.txt',
 'OutputTrain/tokenizer_11_21_24/added_tokens.json',
 'OutputTrain/tokenizer_11_21_24/tokenizer.json')

In [None]:
# load best_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
best_model     = AutoModelForSequenceClassification.from_pretrained("OutputTrain/model_11_21_24").to(device)
best_tokenizer = AutoTokenizer.from_pretrained("OutputTrain/tokenizer_11_21_24")

In [None]:
from datasets import load_dataset
# download the data from": https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification
dataset_test = load_dataset('csv', data_files={'test': 'Corona_NLP_test.csv'}, encoding = "ISO-8859-1")
print(dataset_test)


In [None]:
def predict(text):
    """Predict the sentiment label of one text or of a list of texts.

    Args:
        text: a single string, or an iterable of strings.

    Returns:
        The predicted label name for a single string, or a list of label names
        (one per input, in order) for an iterable of strings. An empty iterable
        gives an empty list.
    """
    is_single = isinstance(text, str)
    batch = [text] if is_single else list(text)
    if not batch:
        return []

    inputs = best_tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = best_model(**inputs).logits

    # argmax along the class dimension gives one class id per input row. A bare
    # `.argmax()` would index into the flattened (batch x classes) tensor and is
    # only correct when the batch holds exactly one text.
    class_ids = logits.argmax(dim=-1).tolist()
    labels = [best_model.config.id2label[class_id] for class_id in class_ids]
    return labels[0] if is_single else labels

In [None]:
best_model.config.id2label

{0: 'Positive',
 1: 'Negative',
 2: 'Neutral',
 3: 'Extremely Positive',
 4: 'Extremely Negative'}

In [None]:
# Compare the predicted and the annotated sentiment of one test tweet
ind_tweet = 10
test_row = dataset_test['test'][ind_tweet]  # fetch the row once, then read its fields
tweet = test_row['OriginalTweet']
print(tweet)
print('Predicted sentiment', predict(tweet))
print('Original sentiment', test_row['Sentiment'])

Best quality couches at unbelievably low prices available to order.

We are in Boksburg GP 

For more info WhatsApp:
084 764 8086

#SuperTuesdsy #PowerTalk 
#Covid_19 #SayEntrepreneur 
#DJSBU https://t.co/HhDJhyQ2Dc
Predicted sentiment Extremely Positive
Original sentiment Positive


==== UP TO HERE 11/20/24 ============

In [None]:
# Train more: continue from the checkpoint saved at step 1875.
# The keyword is `resume_from_checkpoint`; a misspelled keyword makes train() raise a TypeError.
trainer.train(resume_from_checkpoint = "OutputTrain/checkpoint-1875")

In [None]:
trainer.train()