<a href="https://colab.research.google.com/github/Nebil1/UNDP-FTL-AI/blob/main/Task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 3: Sentiment Analysis with BERT

Load tweets

In [1]:
import nltk
from nltk.corpus import twitter_samples

# Fetch the labelled tweet corpus into the local NLTK data directory.
nltk.download("twitter_samples")

# One file per polarity; each call yields an indexable collection of tweet texts.
pos_tweets = twitter_samples.strings("positive_tweets.json")
neg_tweets = twitter_samples.strings("negative_tweets.json")

# Print the first tweet of each class as a quick sanity check.
print("Sample positive tweet:", pos_tweets[0])
print("Sample negative tweet:", neg_tweets[0])

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


Sample positive tweet: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Sample negative tweet: hopeless for tmr :(


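Run a pretrained BERT sentiment classifier on the tweets (this model predicts a rating from "1 star" to "5 stars" rather than a positive/negative label)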

In [2]:
from transformers import pipeline

# Build a ready-made sentiment pipeline around a pretrained multilingual BERT
# checkpoint. Its labels are star ratings (e.g. '1 star', '5 stars').
classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Smoke-test the classifier on one positive and one negative sample tweet.
for sample_tweet in (pos_tweets[0], neg_tweets[0]):
    print(classifier(sample_tweet))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu
  return forward_call(*args, **kwargs)


[{'label': '5 stars', 'score': 0.6683592200279236}]
[{'label': '1 star', 'score': 0.5024276375770569}]


Test the classifier on custom sustainability-related sentences

In [3]:
# Hand-written, sustainability-themed sentences used to probe the classifier.
custom_texts = [
    "This degradable plastic broke down in a week!",
    "The compostable packaging smelled awful",
    "I love these reusable containers",
    "This eco-plastic is a scam"
]

# Classify each sentence on its own and show it next to the prediction.
for text in custom_texts:
    prediction = classifier(text)
    print(f"{text} → {prediction}")

This degradable plastic broke down in a week! → [{'label': '1 star', 'score': 0.7523602247238159}]
The compostable packaging smelled awful → [{'label': '1 star', 'score': 0.5603858232498169}]
I love these reusable containers → [{'label': '5 stars', 'score': 0.8426023125648499}]
This eco-plastic is a scam → [{'label': '1 star', 'score': 0.7374317049980164}]


Generate a synthetic dataset (500 positive and 500 negative examples)

In [4]:
import random

# Fixed seed so the shuffled order is identical on every "Restart & Run All".
SEED = 42
# Number of synthetic examples generated for each class.
N_PER_CLASS = 500

# Simple synthetic generator: one template sentence per class, made unique by
# appending a running index.
positive_synthetic = [f"I love this sustainable product! #{i}" for i in range(N_PER_CLASS)]
negative_synthetic = [f"This eco-friendly thing is terrible... #{i}" for i in range(N_PER_CLASS)]

# Label convention: 1 = positive, 0 = negative.
texts = positive_synthetic + negative_synthetic
labels = [1] * len(positive_synthetic) + [0] * len(negative_synthetic)

# Shuffle text/label pairs together so each text keeps its label. A dedicated
# Random instance keeps the shuffle reproducible without touching the global
# random state.
combined = list(zip(texts, labels))
random.Random(SEED).shuffle(combined)
texts, labels = zip(*combined)

### Fine-tune BERT

In [5]:
!pip install transformers datasets torch scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

1. Create the dataset

In [6]:
from datasets import Dataset
import random

# Fixed seed so the shuffled order (and everything trained on it) is reproducible.
SEED = 42
# Number of synthetic examples generated for each class.
N_PER_CLASS = 500

# Create synthetic texts: one template sentence per class, made unique by index.
positive = [f"I love this sustainable product #{i}" for i in range(N_PER_CLASS)]
negative = [f"I hate this eco product #{i}" for i in range(N_PER_CLASS)]

# Label convention: 1 = positive, 0 = negative.
texts = positive + negative
labels = [1] * len(positive) + [0] * len(negative)

# Shuffle text/label pairs together so each text keeps its label. A dedicated
# Random instance avoids depending on (or altering) the global random state.
combined = list(zip(texts, labels))
random.Random(SEED).shuffle(combined)
texts, labels = zip(*combined)

# Build the Hugging Face Dataset with a "text" and a "label" column.
data = Dataset.from_dict({"text": list(texts), "label": list(labels)})

2. Tokenize the texts

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Upper bound on tokens per example. Without an explicit `max_length`,
# `padding="max_length"` pads every example to the model's maximum input
# length, which is wasteful for the short synthetic sentences used here.
MAX_LENGTH = 64


def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch mapping passed by `Dataset.map(batched=True)`; its
            "text" entry holds the sentences to encode.

    Returns:
        The tokenizer output for the batch, padded/truncated to MAX_LENGTH.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Add the tokenizer outputs as new columns next to "text" and "label".
tokenized_data = data.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

3. Split into training and validation sets

In [8]:
# Hold out 20% of the examples for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE: this rebinds `tokenized_data` to the split result (indexed by "train" /
# "test" further down), so re-running this cell on its own would operate on the
# already-split object -- re-run the tokenization cell first.
tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=42)

4. Load the BERT model for classification

In [9]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder with a two-way classification head, matching the
# 0/1 labels of the synthetic dataset. The head's weights are newly initialized
# (see the warning in this cell's output), so the model must be fine-tuned
# before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

# Diagnostic: show which module actually provides TrainingArguments in this
# environment.
training_args_module = TrainingArguments.__module__
print(training_args_module)

transformers.training_args


5. Define the Trainer

In [11]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

In [12]:
import transformers

# Diagnostic: print where the imported `transformers` package lives on disk.
package_path = transformers.__file__
print(package_path)

/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [13]:
from transformers import Trainer, TrainingArguments

# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",          # where results/checkpoints are written
    eval_strategy="epoch",           # evaluate on the held-out split after every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,                # log every 10 steps
    report_to="none",                # disable external trackers such as Weights & Biases
)

# Wire the model, data splits and tokenizer together.
# `processing_class` replaces the deprecated `tokenizer` argument of
# `Trainer.__init__`, which triggers a deprecation warning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer,
)

  trainer = Trainer(


Train the model

In [None]:
# Run the fine-tuning loop; the returned training summary is displayed as the
# cell's output.
train_output = trainer.train()
train_output

In [None]:
# Reinstalls `transformers` pinned to version 4.30.0.
# NOTE(review): the TrainingArguments cell earlier in this notebook passes
# `eval_strategy=`; confirm that the pinned version accepts that keyword before
# relying on this downgrade. A kernel restart is presumably needed for the new
# version to be picked up -- TODO confirm.
!pip install transformers==4.30.0

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset