In [1]:
!pip install datasets transformers evaluate accelerate cleantext -qqqqq

In [2]:
from datasets import load_dataset
import os

# Put Weights & Biases in "offline" mode through its environment variable before
# any trainer is created (presumably so runs are recorded locally instead of
# being uploaded — confirm against the wandb documentation).
os.environ["WANDB_MODE"]="offline"

In [3]:
# Read the Trustpilot reviews CSV into a Hugging Face dataset object that
# exposes the rows under a single 'train' split.
train_csv = r"C:\Users\dell\Desktop\MyDocs\Docs\MK\trustpilot_reviews_2005.csv"
ds = load_dataset('csv', data_files={'train': train_csv})

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
def create_labels(examples):
    """Derive a three-class sentiment label from a row's star rating.

    Mapping: 4 or 5 stars -> 2 (Positive), 3 stars -> 1 (Neutral),
    any other value -> 0 (Negative).

    Args:
        examples: A single row (mapping) that contains a 'stars' entry.

    Returns:
        The same row object, with a 'label' entry added or overwritten.
    """
    stars = examples['stars']
    if stars in (4, 5):
        sentiment = 2  # Positive
    elif stars == 3:
        sentiment = 1  # Neutral
    else:
        sentiment = 0  # Negative (default)

    examples['label'] = sentiment
    return examples

In [5]:
from cleantext import clean

def clean_text(examples):
    """Store a cleaned copy of the review text under 'clean_review'.

    The raw 'review' value is passed through cleantext's `clean` with only the
    `lowercase` and `extra_spaces` options enabled (`clean_all=False`).

    Args:
        examples: A single row (mapping) that contains a 'review' entry.

    Returns:
        The same row object, with 'clean_review' added; 'review' is untouched.
    """
    raw_review = examples['review']
    examples['clean_review'] = clean(
        raw_review,
        clean_all=False,
        lowercase=True,
        extra_spaces=True,
    )
    return examples

In [6]:
# Run create_labels over every row (one example per call, batched=False) so each
# row gains the integer `label` value derived from its star rating.
labeled_ds = ds.map(create_labels, batched=False)

Map:   0%|          | 0/123181 [00:00<?, ? examples/s]

In [7]:
# Run clean_text over every row (one example per call, batched=False) so each
# row gains the `clean_review` value used later for tokenization.
labeled_ds = labeled_ds.map(clean_text, batched=False)

Map:   0%|          | 0/123181 [00:00<?, ? examples/s]

In [8]:
# Hold out 25% of the rows as the test split. The fixed seed makes the split
# reproducible, so metrics can be compared across kernel restarts.
# NOTE: this rebinds `labeled_ds`; re-running the cell would split the already
# reduced 'train' split again, so re-run the mapping cells above first.
labeled_ds = labeled_ds['train'].train_test_split(test_size=0.25, seed=42)

In [9]:
# Display the resulting splits with their column names and row counts.
labeled_ds

DatasetDict({
    train: Dataset({
        features: ['category', 'company', 'description', 'title', 'review', 'stars', 'label', 'clean_review'],
        num_rows: 92385
    })
    test: Dataset({
        features: ['category', 'company', 'description', 'title', 'review', 'stars', 'label', 'clean_review'],
        num_rows: 30796
    })
})

In [10]:
from pprint import pprint

# Pretty-print the first three rows of each split (test first, then train)
# as a quick sanity check of the new `label` and `clean_review` columns.
for split_name in ('test', 'train'):
    pprint(labeled_ds[split_name][:3])

{'category': ['Electronics & Technology', 'Sports', 'Restaurants & Bars'],
 'clean_review': ['i wrote a review on here earlier and explained several '
                  "times that i'm deaf. they then replied and asked me to "
                  'phone them .............i think shark employees need a '
                  'refresher on how to deal with disabled customers. lesson 1 '
                  "would go as follows: don't ask customers who have explained "
                  'that they are deaf, and can only email you to phone '
                  "customer services. i can only assume that they don't "
                  'actually read the reviews and instead use the same copy and '
                  'paste reply for all negative reviews. no customer should '
                  'have to resort to begging a company via a negative review '
                  "to reply to their email. i didn't think my experience with "
                  'shark could get any more frustrating, but here we ar

In [11]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the classification model is loaded from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the cleaned reviews of a batch.

    The texts under 'clean_review' are handed to the notebook-level `tokenizer`
    with `padding="max_length"` and `truncation=True`.

    Args:
        examples: A batch (mapping) that contains a 'clean_review' entry.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    review_texts = examples["clean_review"]
    return tokenizer(review_texts, padding="max_length", truncation=True)

# Apply tokenize_function to every split of labeled_ds, feeding it batches of
# rows (batched=True) rather than one row at a time.
tokenized_datasets = labeled_ds.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/92385 [00:00<?, ? examples/s]

Map:   0%|          | 0/30796 [00:00<?, ? examples/s]

In [12]:
# Work on small, reproducibly shuffled subsets to keep the run short.
# Raise these sizes (or drop the `.select(...)` call) to use more data for better accuracy.
train_subset_size, eval_subset_size = 3000, 750
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(train_subset_size))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(eval_subset_size))

In [13]:
from transformers import AutoModelForSequenceClassification

# Load the "distilbert-base-uncased" checkpoint for sequence classification with
# three output labels, matching the classes produced by create_labels
# (0 = Negative, 1 = Neutral, 2 = Positive).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics calls its `compute` method.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
def compute_metrics(eval_pred):
    """Score a set of predictions with the notebook-level `metric`.

    Args:
        eval_pred: A pair `(logits, labels)`; the predicted class of each row
            is the index of the largest value along the last axis of `logits`.

    Returns:
        The result of `metric.compute` for the predicted classes versus `labels`.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [17]:
from transformers import TrainingArguments, Trainer

# Checkpoints and logs are both written beneath this run directory.
run_dir = r"C:\Users\dell\Desktop\MyDocs\Docs\MK\distilbert_base_tp_2005"

# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=run_dir,
    evaluation_strategy="epoch",  # evaluate once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_dir=run_dir + "/logs",
    run_name="distil_bert_uncased_tp_2025",
    report_to="wandb",
    logging_steps=5,  # log every 5 steps
)

In [19]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.19.3-py3-none-win_amd64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.20.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.4-cp312-cp312-win_amd64.whl.metadata (10 kB)
Downloading wandb-0.19.3-py3-none-win_amd64.whl (19.7 MB)
   ---------------------------------------- 0.0/19.7 MB ? eta -:--:--
   ------ --------------------------------- 3.1/19.7 MB 16.9 MB/s eta 0:00:01
   -------------- ------------------------- 7.1/19.7 MB 18.2 MB/s eta 0:00:01
   ---------------------- ----------------- 11.0/19.7 MB 18.6 MB/s eta 0:00:01
   ------------------------------ --------- 15.2/19.7 MB 18.8 MB/s eta 0:00:01
   -------------------------------------- - 19.1/19.7 MB 18.9 MB/s eta 0:00:01
   ---------------------------------------- 19.7/1

In [20]:
# Assemble the Trainer from the classification model, the training arguments,
# the 3000-row training subset, the 750-row evaluation subset and the
# accuracy-based compute_metrics function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
# Start fine-tuning with the settings from training_args
# (3 epochs, evaluation_strategy="epoch").
trainer.train()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the model to a local directory so the inference cell can load it by path.
# Only `model` is saved by this call; the inference cell passes the in-memory
# tokenizer explicitly instead of loading one from this directory.
model.save_pretrained(r'C:\Users\dell\Desktop\MyDocs\Docs\MK\distilbert-tp-123k')

In [None]:
from transformers import pipeline

# Build a CPU inference pipeline from the saved model directory, reusing the
# in-memory tokenizer.
# The Windows path must be a raw string: in a normal string literal the "\U" in
# "C:\Users" starts a \UXXXXXXXX escape and the cell fails with a SyntaxError.
pipe = pipeline(
    task="sentiment-analysis",
    model=r"C:\Users\dell\Desktop\MyDocs\Docs\MK\distilbert-tp-123k",
    tokenizer=tokenizer,
    device="cpu",
)

In [None]:
# Translate the pipeline's label names into readable sentiment names. The ids
# follow create_labels: 0 = Negative, 1 = Neutral, 2 = Positive.
# NOTE(review): assumes the saved model reports labels as 'LABEL_<id>' — confirm.
label2text = {'LABEL_0': 'Negative', 'LABEL_1': 'Neutral', 'LABEL_2': 'Positive'}
# Three sample reviews used to smoke-test the pipeline.
reviews = ["I’ve purchased first of those coats in May2020. Still goes strong although my girl is not with us anymore her little sister is still using it. I’ve bought a limited edition this year as a Christmas gift for my current dog so she has also one of her own.My dog is notorious at rolling so needs to be washed pretty much every day. Those coats are a life saver. Dry dog and no shivering within 20 min. So glad I’ve found the brand. Highly recommend it for any dog parent.",
           "Took my son and friends for a birthday party 6 children under 12. A week later get a £200 fine for parking in a free car park!!Apparently you have to type your reg number into something to validate your stay. They don’t care as a third party operate the car park so it’s not there problem if you get a fine!! Everywhere I’ve ever been with one of these systems the person at reception will tell you when booking to enter your details into the machine.  But not Hollywood bowl Oxford!!I see they have replied to me saying to get in touch I already did they said it’s not their problem due to them not owning the car park!",
           "Beyond brilliant! Rachel's energy is something else and she entertains them children thoroughly throughout the entire time of booking, plenty of dancing and music and she matches her routine to the ages of the children.I've used for a couple of years now and have no hesitation in booking her for every kids event we have, She makes it a proper party and the adults love it too!"]

# Classify all reviews in a single pipeline call, then print each review next to
# its readable category and the model's confidence as a percentage.
sentiments = pipe(reviews)
# zip pairs every review with its prediction, replacing a manually maintained index.
for review, sentiment in zip(reviews, sentiments):
    print(f"Review:\n{review}")
    print(f"Category: {label2text[sentiment['label']]}")
    print(f"Confidence: {round(sentiment['score'] * 100, 2)}%")
    print("=" * 40)