In [41]:
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, f1_score

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer




In [42]:
## Dataset loading & process

In [43]:
dataset = load_dataset("emotion")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [44]:
train_set = dataset["train"]

In [45]:
dataset.set_format(type="pandas")
dataframe = dataset["train"][:]

In [46]:
def label_converter(row):
  return dataset["train"].features["label"].int2str(row)

In [47]:
# creating a new row label_name for easier identification
dataframe["label_name"]=dataframe["label"].apply(label_converter)

In [48]:
dataset.reset_format()

In [49]:
## Data preprocessing

In [50]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True)

In [51]:
tokenized_datasets = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [52]:
## Create a model & train

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 6)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [54]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [55]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [56]:
training_args = TrainingArguments(
    output_dir = "Finetuned-sentiment-model",
    num_train_epochs = 4,
    learning_rate = 2e-5,
    per_device_train_batch_size = 128,
    per_device_eval_batch_size = 128,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    disable_tqdm = False,
    push_to_hub = True

)

In [57]:
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    tokenizer = tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [58]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.531137,0.831,0.808091
2,No log,0.239038,0.9215,0.921437
3,No log,0.189505,0.932,0.931917
4,0.455900,0.179242,0.9315,0.931599


TrainOutput(global_step=500, training_loss=0.45586984252929685, metrics={'train_runtime': 263.955, 'train_samples_per_second': 242.466, 'train_steps_per_second': 1.894, 'total_flos': 1440685723392000.0, 'train_loss': 0.45586984252929685, 'epoch': 4.0})

In [59]:
# uploading the model to huggingface
trainer.push_to_hub(commit_message = "Training completed")

CommitInfo(commit_url='https://huggingface.co/KoontzP/Finetuned-sentiment-model/commit/94815e05bb16a75b9dbb4bc640151554129b433c', commit_message='Training completed', commit_description='', oid='94815e05bb16a75b9dbb4bc640151554129b433c', pr_url=None, pr_revision=None, pr_num=None)