<a href="https://colab.research.google.com/github/Mohammadhsiavash/DeepL-Training/blob/main/Unsupervised%2BSemi-Supervised/Sentiment_Analysis_with_Fine_Tuned_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fine-tune or use a pre-trained BERT model for sentiment classificaon on text data
like reviews or tweets.

In [1]:
!pip install transformers datasets torch scikit-learn pandas

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Load Text Data

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sentiment140


In [3]:
import pandas as pd

# The dataset has no header, so we provide column names
column_names = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(f"{path}/training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None, names=column_names)

display(df.head())

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Tokenize Text Using BERT Tokenizer

In [6]:
from transformers import BertTokenizer
from datasets import Dataset
import pandas as pd

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Select a smaller chunk of the DataFrame
chunk_size = 10000  # Define the size of the chunk you want to use
df_subset = df.head(chunk_size) # Using head() to get the first 'chunk_size' rows

# Convert the smaller pandas DataFrame to Hugging Face Dataset
dataset_subset = Dataset.from_pandas(df_subset)

# Tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenizer to the dataset subset
tokenized_datasets = dataset_subset.map(tokenize_function, batched=True)

display(tokenized_datasets)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['target', 'ids', 'date', 'flag', 'user', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

 Load Pretrained BERT for Sequence Classificaon

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare the data for training


In [9]:
# Rename the target column to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("target", "labels")

# Split the dataset into training and validation sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Set the format to torch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

display(train_dataset)
display(eval_dataset)

Dataset({
    features: ['labels', 'ids', 'date', 'flag', 'user', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8000
})

Dataset({
    features: ['labels', 'ids', 'date', 'flag', 'user', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

## Define training arguments



In [14]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # Reduced epochs
    per_device_train_batch_size=32, # Increased batch size
    per_device_eval_batch_size=64,
    warmup_steps=100,  # Reduced warmup steps
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable reporting to services like Weights & Biases
)

display(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,


## Train the model


In [15]:
from transformers import Trainer

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()

Step,Training Loss
10,0.0023
20,0.0016
30,0.0009
40,0.0005
50,0.0003
60,0.0002
70,0.0001
80,0.0001
90,0.0
100,0.0


TrainOutput(global_step=250, training_loss=0.00025176614310476, metrics={'train_runtime': 760.9366, 'train_samples_per_second': 10.513, 'train_steps_per_second': 0.329, 'total_flos': 2104888442880000.0, 'train_loss': 0.00025176614310476, 'epoch': 1.0})

In [16]:
# Evaluate the model
evaluation_results = trainer.evaluate()

display(evaluation_results)

{'eval_loss': 7.972508683451451e-06,
 'eval_runtime': 56.8101,
 'eval_samples_per_second': 35.205,
 'eval_steps_per_second': 0.563,
 'epoch': 1.0}