# First load the dataset

In [1]:
!pip install -q datasets

In [2]:
from datasets import load_dataset
emotions = load_dataset("GerindT/mini_amazon_sentimental",)
emotions = emotions['test'].train_test_split(test_size=0.2)

### Clean up the dataset - leave only the necessary information - map sentiment to labels

In [3]:
emotions = emotions.remove_columns(["title", "content","label","score"])
emotions = emotions.rename_column("sentiment", "label")

emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 320000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 80000
    })
})

In [4]:
# Map sentiment values to numerical labels
sentiment_mapping = {
    "Perfect": 3,
    "Neutral": 1,
    "Negative": 0,
    "Satisfied": 2
}

# Apply the mapping to the 'label' column
emotions = emotions.map(lambda example: {'label': sentiment_mapping[example['label']]})


Map:   0%|          | 0/320000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

In [5]:
emotions["test"][0]

{'text': "Fine Overview This is a lucid and concise overview of the philosophy of Karl Popper. Magee covers Popper's philosophy of science, philosophy of history, epistemology, and political philosophy in a series of well written chapters that also provide a reasonable amount of background information. The themes are Popper's emphasis on the provisional nature of knowledge, the important of vigorous criticism and falsification, constant questioning, and the importance of intellectual diversity. Magee is particularly concerned with demonstrating the underlying unity of Popper's thought and does a good job of connecting Popper's epistemology with his political philosophy. Magee's enthusiasm for Popper is obvious, perhaps to the extent of being a little uncritical. There are certainly precedents for Popper's fallibalist epistemology which Magee doesn't mention. Based on Magee's account, I don't think that Popper has really overcome the induction problem or that his evolutionism really reb

In [6]:
train_ds = emotions["train"]
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 320000
})

In [7]:
train_ds[1]

{'text': 'Waste of money. This is a haphazard collection of miscellaneous thoughts that is not worth the time or the money. Mr. Murray was out to collect a paycheck on this one.',
 'label': 0}

## Tokenizing the entire dataset - Distilbert Tokenizer

In [8]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [10]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)


In [11]:
emotions_encoded = emotions.map(tokenize, batched=True,
                               batch_size=None)

Map:   0%|          | 0/320000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

In [12]:
emotions_encoded["test"][:2]


{'text': ["Fine Overview This is a lucid and concise overview of the philosophy of Karl Popper. Magee covers Popper's philosophy of science, philosophy of history, epistemology, and political philosophy in a series of well written chapters that also provide a reasonable amount of background information. The themes are Popper's emphasis on the provisional nature of knowledge, the important of vigorous criticism and falsification, constant questioning, and the importance of intellectual diversity. Magee is particularly concerned with demonstrating the underlying unity of Popper's thought and does a good job of connecting Popper's epistemology with his political philosophy. Magee's enthusiasm for Popper is obvious, perhaps to the extent of being a little uncritical. There are certainly precedents for Popper's fallibalist epistemology which Magee doesn't mention. Based on Magee's account, I don't think that Popper has really overcome the induction problem or that his evolutionism really re

## Model Training 

In [14]:
from transformers import AutoModelForSequenceClassification
import torch
num_labels = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model=AutoModelForSequenceClassification.from_pretrained(model_ckpt,
         num_labels = num_labels).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluate

In [15]:
!pip install -q evaluate

In [16]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, 
                           references = labels)

## Logging to HuggingFace

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Setting Training Arguments

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    
    output_dir="distilbert-emotion-mini-amazon",
    num_train_epochs = 2, 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    push_to_hub = True,
    report_to = "none"    
)

In [19]:
from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = emotions_encoded["train"],
    eval_dataset = emotions_encoded["test"],
    tokenizer = tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [20]:
trainer.train()

  0%|          | 0/40000 [00:00<?, ?it/s]

{'loss': 0.5642, 'learning_rate': 4.937500000000001e-05, 'epoch': 0.03}
{'loss': 0.4401, 'learning_rate': 4.875e-05, 'epoch': 0.05}
{'loss': 0.4008, 'learning_rate': 4.8125000000000004e-05, 'epoch': 0.07}
{'loss': 0.3689, 'learning_rate': 4.75e-05, 'epoch': 0.1}
{'loss': 0.364, 'learning_rate': 4.6875e-05, 'epoch': 0.12}
{'loss': 0.3292, 'learning_rate': 4.6250000000000006e-05, 'epoch': 0.15}
{'loss': 0.344, 'learning_rate': 4.5625e-05, 'epoch': 0.17}
{'loss': 0.3305, 'learning_rate': 4.5e-05, 'epoch': 0.2}
{'loss': 0.3099, 'learning_rate': 4.4375e-05, 'epoch': 0.23}
{'loss': 0.3205, 'learning_rate': 4.375e-05, 'epoch': 0.25}
{'loss': 0.3112, 'learning_rate': 4.3125000000000005e-05, 'epoch': 0.28}
{'loss': 0.288, 'learning_rate': 4.25e-05, 'epoch': 0.3}
{'loss': 0.3013, 'learning_rate': 4.1875e-05, 'epoch': 0.33}
{'loss': 0.2995, 'learning_rate': 4.125e-05, 'epoch': 0.35}
{'loss': 0.2952, 'learning_rate': 4.0625000000000005e-05, 'epoch': 0.38}
{'loss': 0.2881, 'learning_rate': 4e-05, '

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.23700617253780365, 'eval_accuracy': 0.9065375, 'eval_runtime': 229.0823, 'eval_samples_per_second': 349.219, 'eval_steps_per_second': 21.826, 'epoch': 1.0}
{'loss': 0.1577, 'learning_rate': 2.4375e-05, 'epoch': 1.02}
{'loss': 0.1774, 'learning_rate': 2.375e-05, 'epoch': 1.05}
{'loss': 0.1606, 'learning_rate': 2.3125000000000003e-05, 'epoch': 1.07}
{'loss': 0.1643, 'learning_rate': 2.25e-05, 'epoch': 1.1}
{'loss': 0.1595, 'learning_rate': 2.1875e-05, 'epoch': 1.12}
{'loss': 0.1677, 'learning_rate': 2.125e-05, 'epoch': 1.15}
{'loss': 0.1616, 'learning_rate': 2.0625e-05, 'epoch': 1.18}
{'loss': 0.1587, 'learning_rate': 2e-05, 'epoch': 1.2}
{'loss': 0.1669, 'learning_rate': 1.9375e-05, 'epoch': 1.23}
{'loss': 0.1644, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.25}
{'loss': 0.1503, 'learning_rate': 1.8125e-05, 'epoch': 1.27}
{'loss': 0.1639, 'learning_rate': 1.75e-05, 'epoch': 1.3}
{'loss': 0.1636, 'learning_rate': 1.6875000000000004e-05, 'epoch': 1.32}
{'loss': 0.15

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.25046175718307495, 'eval_accuracy': 0.9231875, 'eval_runtime': 229.1381, 'eval_samples_per_second': 349.134, 'eval_steps_per_second': 21.821, 'epoch': 2.0}
{'train_runtime': 6166.9618, 'train_samples_per_second': 103.779, 'train_steps_per_second': 6.486, 'train_loss': 0.22324391679763794, 'epoch': 2.0}


TrainOutput(global_step=40000, training_loss=0.22324391679763794, metrics={'train_runtime': 6166.9618, 'train_samples_per_second': 103.779, 'train_steps_per_second': 6.486, 'train_loss': 0.22324391679763794, 'epoch': 2.0})

## Predicting the validation dataset - Model evaluation

In [27]:
preds_output = trainer.predict(emotions_encoded["test"])

  0%|          | 0/5000 [00:00<?, ?it/s]

In [28]:
preds_output.metrics

{'test_loss': 0.23700617253780365,
 'test_accuracy': 0.9065375,
 'test_runtime': 233.6503,
 'test_samples_per_second': 342.392,
 'test_steps_per_second': 21.4}

Pushing the model to hugging face

In [22]:
trainer.push_to_hub(commit_message="Training completed!")

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GerindT/distilbert-emotion-mini-amazon/commit/77607d4dc0aaabe8c1bda7f1074edb4c561e850e', commit_message='Training completed!', commit_description='', oid='77607d4dc0aaabe8c1bda7f1074edb4c561e850e', pr_url=None, pr_revision=None, pr_num=None)

## Model predictions - Calling the model from hugging face

In [23]:
from transformers import pipeline

model_id = "GerindT/distilbert-emotion-mini-amazon"
classifier = pipeline("text-classification", model= model_id)

config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [32]:
custom_text=["I bought this item 2 months ago but it said it would arrive in 2 weeks. I'm very disappointed with the service although the product is good. The quality is good but the delivery is very slow.",
             "the product was awesome",
             "it was ok the quality was average"]

In [30]:
preds=classifier(custom_text)

In [31]:
preds

[{'label': 'LABEL_0', 'score': 0.9648558497428894},
 {'label': 'LABEL_3', 'score': 0.999749481678009},
 {'label': 'LABEL_1', 'score': 0.9961615800857544}]

In [None]:
preds=classifier(custom_text)
preds_df = pd.DataFrame(preds)
print(preds_df)
