In [1]:
!pip install datasets



In [2]:
# Important Libraries
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

In [3]:
# Load the SST-2 dataset
dataset = load_dataset('glue','sst2')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Load the model
model = DistilBertForSequenceClassification.from_pretrained(model_name)

In [5]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)

# Apply tokenization
dataset = dataset.map(preprocess_function, batched=True)

print(dataset['train'][0])

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0, 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mmohamed-hishamk90[0m ([33mmohamed-hishamk90-aiet[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.0336,0.44251
2,0.0639,0.535353
3,0.0267,0.569718


TrainOutput(global_step=12630, training_loss=0.05719917626934965, metrics={'train_runtime': 834.2145, 'train_samples_per_second': 242.2, 'train_steps_per_second': 15.14, 'total_flos': 3375846534095508.0, 'train_loss': 0.05719917626934965, 'epoch': 3.0})

In [7]:
# Evaluate the model
results = trainer.evaluate()

# Print evaluation results
print(results)

{'eval_loss': 0.5697184801101685, 'eval_runtime': 0.6024, 'eval_samples_per_second': 1447.494, 'eval_steps_per_second': 91.298, 'epoch': 3.0}


In [8]:
# Example text
text = "This movie was absolutely wonderful!"

# Tokenize the input and move tensors to the same device as the model
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to('cuda')

# Make predictions
with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits to probabilities
probs = torch.softmax(logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probs, dim=-1).item()

# Map the predicted class to a label
labels = ["NEGATIVE", "POSITIVE"]
predicted_label = labels[predicted_class]

print(f"Predicted label: {predicted_label}")
print(f"Confidence: {probs[0][predicted_class].item():.4f}")

Predicted label: POSITIVE
Confidence: 1.0000


In [9]:
model.save_pretrained("./sentiment-analysis-model")
tokenizer.save_pretrained("./sentiment-analysis-tokenizer")

('./sentiment-analysis-tokenizer/tokenizer_config.json',
 './sentiment-analysis-tokenizer/special_tokens_map.json',
 './sentiment-analysis-tokenizer/vocab.txt',
 './sentiment-analysis-tokenizer/added_tokens.json')