In [1]:
!pip install transformers
!pip install datasets
!pip install nltk

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [3]:
# Download the NLTK resources this notebook asks for: the stopword list, the
# Punkt tokenizer data (both "punkt" and "punkt_tab") and WordNet.
# raise_on_error=True turns a failed download into an exception right here
# instead of only printing a message, and looping (rather than ending the cell
# on a bare call) avoids echoing a stray `True` as the cell result.
for resource in ("stopwords", "punkt", "wordnet", "punkt_tab"):
    nltk.download(resource, raise_on_error=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# NLTK helpers shared by every clean_text call. They are built lazily on first
# use so that defining this cell does not itself touch the NLTK corpora.
_CLEAN_TEXT_RESOURCES = {}


def _english_stopwords():
    """Return the English stopword list as a frozenset, loading it only once."""
    if "stopwords" not in _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stopwords"] = frozenset(stopwords.words("english"))
    return _CLEAN_TEXT_RESOURCES["stopwords"]


def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    if "lemmatizer" not in _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalize one raw text value into a space-joined string of lemmas.

    Steps: lowercase, strip URLs / @mentions / #hashtags, strip punctuation,
    collapse whitespace, tokenize, drop English stopwords, lemmatize.

    Args:
        text: The raw value. Missing values (None or a float NaN) yield "";
            any other non-string value is converted with str() first.

    Returns:
        The cleaned text as a single string ("" when nothing is left).
    """
    # Missing values: return an empty document rather than the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)  # Remove URLs, hashtags and mentions
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    if not text:
        # Nothing left to tokenize; same result as joining an empty token list.
        return ""

    # Look the stopword set up once per call (set membership), not once per token.
    stop_words = _english_stopwords()
    lemmatizer = _shared_lemmatizer()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(tokens)

In [6]:
# Read the train and test splits from CSV files in the working directory
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [7]:
# Add a "clean_text" column (clean_text applied to "text") to both splits
for split_df in (train_df, test_df):
    split_df["clean_text"] = split_df["text"].apply(clean_text)

In [8]:
# Encode the string sentiment labels as integer class ids
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

for split_name, split_df in (("train", train_df), ("test", test_df)):
    # Re-running this cell on already-encoded ids would map every value to NaN,
    # so only map while the column still holds the raw (non-numeric) labels.
    if not pd.api.types.is_numeric_dtype(split_df["sentiment"]):
        split_df["sentiment"] = split_df["sentiment"].map(label_mapping)

    # Series.map yields NaN for missing/unknown labels; fail here with a clear
    # message instead of feeding NaN labels to the model later.
    unmapped = int((~split_df["sentiment"].isin(list(label_mapping.values()))).sum())
    if unmapped:
        raise ValueError(
            f"{unmapped} row(s) in the {split_name} split have a sentiment "
            f"outside {sorted(label_mapping)}"
        )

In [9]:
# Load the tokenizer that belongs to the pretrained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:

# Wrap each pandas split in a Hugging Face Dataset (train first, then test)
dataset_train, dataset_test = map(Dataset.from_pandas, (train_df, test_df))

In [11]:
def tokenize_and_format(examples, max_length=128):
    """Tokenize a batch of cleaned texts and attach their labels.

    Args:
        examples: Batch mapping with a "clean_text" entry (the texts to
            tokenize) and a "sentiment" entry (labels, already integer-encoded).
        max_length: Length every sequence is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer's output with an extra "label" entry holding
        examples["sentiment"] unchanged.
    """
    # Uses the notebook-level `tokenizer`; every sequence is padded to
    # max_length and longer ones are truncated.
    tokenized = tokenizer(
        examples["clean_text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors=None,
    )

    # Labels are passed through as-is under the "label" key.
    tokenized["label"] = examples["sentiment"]

    return tokenized

In [12]:
# Tokenize both splits in batches, keeping only the columns that
# tokenize_and_format returns (all original columns are dropped).
def _tokenize_split(split):
    """Run tokenize_and_format over one Dataset split, dropping its original columns."""
    return split.map(
        tokenize_and_format,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_train = _tokenize_split(dataset_train)

tokenized_test = _tokenize_split(dataset_test)

Map:   0%|          | 0/27481 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [13]:
# Report how many examples each tokenized split contains
for caption, split in (
    ("Training set size:", tokenized_train),
    ("Test set size:", tokenized_test),
):
    print(caption, len(split))

Training set size: 27481
Test set size: 3534


In [14]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [15]:
# Have both splits return PyTorch tensors for just these three columns
TORCH_COLUMNS = ("input_ids", "attention_mask", "label")
for split in (tokenized_train, tokenized_test):
    split.set_format("torch", columns=list(TORCH_COLUMNS))

In [16]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores, or a tuple whose first element is those scores).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids

    # Accept a tuple of outputs: only the first element (the scores) is used.
    scores = pred.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)

    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [17]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments
from transformers import Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [18]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_sentiment",

    # Optimization
    num_train_epochs=10,              # Upper bound; early stopping can end training sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,    # Gradients are accumulated over 2 batches per optimizer step
    learning_rate=1e-5,               # Lower learning rate
    warmup_ratio=0.1,                 # Warm up over the first 10% of training
    weight_decay=0.1,                 # Increased weight decay for stronger regularization
    max_grad_norm=1.0,                # Gradient clipping
    # NOTE(review): label smoothing only applies if the Trainer's compute_loss
    # actually uses it — check any compute_loss override.
    label_smoothing_factor=0.1,
    # Enable mixed precision only when a CUDA device is present, so the cell
    # does not fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),

    # Logging / evaluation / checkpointing, all on the same 100-step cadence
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",            # Current name of the deprecated `evaluation_strategy`
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,

    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    remove_unused_columns=True,
    report_to="none",
)

# Load pretrained BERT with a 3-label classification head; the three dropout
# settings below are raised to 0.3 (the original comments call this "increased").
dropout_overrides = {
    "hidden_dropout_prob": 0.3,
    "attention_probs_dropout_prob": 0.3,
    "classifier_dropout": 0.3,
}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    **dropout_overrides,
)

# Custom Trainer that adds a parameter-norm penalty to the training loss
class RegularizedTrainer(Trainer):
    """Trainer whose training loss carries an extra parameter-norm penalty.

    The penalty is `l2_lambda` times the sum of the L2 norms (not squared) of
    every parameter tensor of the model.
    """

    # Weight of the penalty term; override on a subclass or instance to tune it.
    l2_lambda = 0.01

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the base Trainer loss, plus the norm penalty while training.

        Args:
            model: The model being trained/evaluated.
            inputs: Batch of model inputs, as handed over by the Trainer.
            return_outputs: When True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Forwarded to the base implementation when given.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        # Delegate to the base implementation instead of reading outputs.loss
        # directly, so loss options configured on the Trainer (such as label
        # smoothing) are not bypassed. num_items_in_batch is forwarded only when
        # supplied, so a base compute_loss without that argument still works.
        extra = {} if num_items_in_batch is None else {"num_items_in_batch": num_items_in_batch}
        loss, outputs = super().compute_loss(model, inputs, return_outputs=True, **extra)

        # The penalty is a training objective only: leaving it out in eval mode
        # keeps the reported validation loss a plain data loss.
        if model.training:
            penalty = sum(torch.norm(param, 2) for param in model.parameters())
            # Out-of-place add: do not mutate the loss tensor held by `outputs`.
            loss = loss + self.l2_lambda * penalty

        return (loss, outputs) if return_outputs else loss

# Build the trainer. The early-stopping callback is created with a patience of
# 3 evaluations.
# NOTE(review): the test split is also the evaluation set that drives early
# stopping — confirm this is intended, or hold out a separate validation split.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = RegularizedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)





model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning with the trainer configured above
print("Starting training with anti-overfitting measures...")
train_result = trainer.train()

# Show the metrics recorded for the training run
print("\nTraining metrics:", train_result.metrics, sep="\n")

# Evaluate the model the trainer holds after training
print("\nEvaluating final model...")
eval_results = trainer.evaluate()
print("\nEvaluation metrics:", eval_results, sep="\n")


Starting training with anti-overfitting measures...


Step,Training Loss,Validation Loss


In [None]:


# Mount Google Drive at /content/drive (requires the Colab runtime)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# Save the trained model and its tokenizer to Google Drive
model_save_path = "/content/drive/MyDrive/bert_sentiment_model"  # Change to the desired Drive folder

# Both go into the same directory: the model first, then the tokenizer
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(model_save_path)

print(f"Model saved to {model_save_path}")
