# Loading Libraries


In [None]:
!pip install datasets -q
!pip install transformers --upgrade
!pip install accelerate>=0.20.1 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.2


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount refreshes an
# already-mounted drive instead of reusing it.
drive.mount("/content/drive", force_remount=True)

In [None]:
import datasets
import nltk
import transformers
import warnings
import torch
import torch.nn.functional as F
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Loading dataset


In [None]:
from datasets import load_dataset

# Download (or load from cache) the Twitter sentiment dataset; the result is a
# DatasetDict holding the train / validation / test splits.
data = load_dataset("carblacac/twitter-sentiment-analysis")

Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/149985 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Generating train split:   0%|          | 0/119988 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29997 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [None]:
def label2str(label):
    """Convert an integer class id into a human-readable label name.

    The raw dataset stores its target in a column named 'feeling' (it is only
    called 'label' after an explicit rename), so looking up
    ``features['label']`` on ``data`` raises ``KeyError``. This version uses
    whichever of the two column names is present.

    Args:
        label: Integer class id.

    Returns:
        The label name as a string.
    """
    features = data['train'].features
    column = 'label' if 'label' in features else 'feeling'
    feature = features[column]
    # ClassLabel features carry their own id -> name mapping.
    if hasattr(feature, 'int2str'):
        return feature.int2str(label)
    # Plain integer column: no names are stored with the dataset.
    # NOTE(review): assumes 0 = negative and 1 = positive — confirm against
    # the dataset card.
    return {0: 'negative', 1: 'positive'}[int(label)]

# Tokenization


In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the classification model.
model_ckpt = "bert-base-uncased"

# Fetch the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Display the names of the inputs the tokenizer produces for this checkpoint.
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [None]:
def tokenize(batch):
    """Tokenize the 'text' field of a batch of examples.

    Padding is intentionally NOT applied here. Under ``Dataset.map(...,
    batched=True)`` a ``padding=True`` call pads every example to the longest
    sequence of its map chunk, which inflates sequence lengths and slows
    training without making lengths consistent across chunks. Because the
    Trainer is constructed with the tokenizer, its default collator pads each
    training batch dynamically. ``return_tensors='pt'`` is also dropped: the
    mapped columns are stored by the dataset, not kept as tensors.

    Args:
        batch: Mapping with a 'text' entry holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    return tokenizer(batch['text'], truncation=True)

In [None]:
# Apply the tokenizer to every split in batches, then display the resulting
# DatasetDict.
tokenized_data = data.map(
    tokenize,
    batched=True,
)
tokenized_data

Map:   0%|          | 0/119988 [00:00<?, ? examples/s]

Map:   0%|          | 0/29997 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'feeling', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 119988
    })
    validation: Dataset({
        features: ['text', 'feeling', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 29997
    })
    test: Dataset({
        features: ['text', 'feeling', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 61998
    })
})

In [None]:
# The later cells refer to the target column as 'label', so rename 'feeling'.
# The guard keeps this cell idempotent: re-running it after the rename would
# otherwise fail because 'feeling' no longer exists.
if 'feeling' in tokenized_data['train'].column_names:
    tokenized_data = tokenized_data.rename_column('feeling', 'label')

# Loading BERT model for fine-tuning


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Setting Training Args


In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 16
model_name = f"{model_ckpt}-finetuned-sentiment"

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=model_name,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    # Log every (train examples // batch size) steps.
    logging_steps=len(tokenized_data["train"]) // batch_size,
    log_level="error",
    disable_tqdm=False,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Return the listed columns as torch tensors when examples are read.
tokenized_data.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# Training Model


In [None]:
# Bundle the model, training configuration, data splits, tokenizer and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import gc

# Run a garbage-collection pass, then release cached CUDA memory before
# training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Start fine-tuning with the Trainer built in the previous section.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3785,0.337136,0.855652,0.855652


KeyboardInterrupt: 

- Training was stopped manually after one epoch because it was taking too long.


# Evaluation


In [None]:
# Score the validation split and display the resulting metrics.
validation_split = tokenized_data['validation']
predictions = trainer.predict(validation_split)
predictions.metrics

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3785,0.337136,0.855652,0.855652


{'test_loss': 0.37952345609664917,
 'test_accuracy': 0.853952061872854,
 'test_f1': 0.8539513876321886,
 'test_runtime': 160.805,
 'test_samples_per_second': 186.543,
 'test_steps_per_second': 11.66}

# Saving Model


In [None]:
# Write the trained model to disk.
# NOTE(review): "/save_bert" is an absolute path at the filesystem root, not
# under the Drive mount at /content/drive — confirm this is the intended
# location.
trainer.save_model("/save_bert")