<a href="https://colab.research.google.com/github/KM-Adnan-Absar/Sentiment_Analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification,get_linear_schedule_with_warmup


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io
# % matplotlib inline

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Safely print GPU name only if GPU exists
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
else:
    print("No GPU found. Using CPU.")

SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


GPU available: Tesla T4


In [5]:
df_train = pd.read_csv("/content/Social Media Engagement Dataset.csv")

In [6]:
df_train.isnull().sum()

Unnamed: 0,0
post_id,0
timestamp,0
day_of_week,0
platform,0
user_id,0
location,0
language,0
text_content,0
hashtags,0
mentions,3941


In [7]:
df_train.head()

Unnamed: 0,post_id,timestamp,day_of_week,platform,user_id,location,language,text_content,hashtags,mentions,...,comments_count,impressions,engagement_rate,brand_name,product_name,campaign_name,campaign_phase,user_past_sentiment_avg,user_engagement_growth,buzz_change_rate
0,kcqbs6hxybia,2024-12-09 11:26:15,Monday,Instagram,user_52nwb0a6,"Melbourne, Australia",pt,Just tried the Chromebook from Google. Best pu...,#Food,,...,701,18991,0.19319,Google,Chromebook,BlackFriday,Launch,0.0953,-0.3672,19.1
1,vkmervg4ioos,2024-07-28 19:59:26,Sunday,Twitter,user_ucryct98,"Tokyo, Japan",ru,Just saw an ad for Microsoft Surface Laptop du...,"#MustHave, #Food","@CustomerService, @BrandCEO",...,359,52764,0.05086,Microsoft,Surface Laptop,PowerRelease,Post-Launch,0.1369,-0.451,-42.6
2,memhx4o1x6yu,2024-11-23 14:00:12,Saturday,Reddit,user_7rrev126,"Beijing, China",ru,What's your opinion about Nike's Epic React? ...,"#Promo, #Food, #Trending",,...,643,8887,0.45425,Nike,Epic React,BlackFriday,Post-Launch,0.2855,-0.4112,17.4
3,bhyo6piijqt9,2024-09-16 04:35:25,Monday,YouTube,user_4mxuq0ax,"Lagos, Nigeria",en,Bummed out with my new Diet Pepsi from Pepsi! ...,"#Reviews, #Sustainable","@StyleGuide, @BrandSupport",...,743,6696,0.42293,Pepsi,Diet Pepsi,LaunchWave,Launch,-0.2094,-0.0167,-5.5
4,c9dkiomowakt,2024-09-05 21:03:01,Thursday,Twitter,user_l1vpox2k,"Berlin, Germany",hi,Just tried the Corolla from Toyota. Absolutely...,"#Health, #Travel","@BrandSupport, @InfluencerName",...,703,47315,0.08773,Toyota,Corolla,LocalTouchpoints,Launch,0.6867,0.0807,38.8


In [8]:
df_train['sentiment_label'].unique()


array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [9]:
df_train['sentiment_label'].value_counts()

Unnamed: 0_level_0,count
sentiment_label,Unnamed: 1_level_1
Negative,4854
Positive,4839
Neutral,2307


In [10]:
df_train = df_train[~df_train['sentiment_label'].isnull()]

In [11]:
df_train = df_train[~df_train['text_content'].isnull()]


In [12]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

df_train['category_1'] = labelencoder.fit_transform(df_train['sentiment_label'])


In [13]:
df_train[['sentiment_label', 'category_1']].drop_duplicates(keep='first')


Unnamed: 0,sentiment_label,category_1
0,Positive,2
1,Negative,0
13,Neutral,1


In [14]:
df_train.rename(columns={'category_1':'label'},inplace=True)

In [15]:
## create label and sentence list
sentences = df_train['text_content'].values

# check distribution of data based on labels
print("Distribution of data based on labels: ", df_train['sentiment_label'].value_counts())

# Set maximum sequence length
MAX_LEN = 256

# Import BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


Distribution of data based on labels:  sentiment_label
Negative    4854
Positive    4839
Neutral     2307
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
inputs = tokenizer(
    sentences.tolist(),
    max_length=MAX_LEN,
    padding="max_length",
    truncation=True,
    return_tensors=None
)

input_ids = inputs["input_ids"]

In [17]:
# Labels
labels = df_train['sentiment_label'].values

# Print an example sentence
print("Actual sentence before tokenization: ", sentences[2])

# Tokenize sentences (if not done yet)
encoded_inputs = tokenizer(
    sentences.tolist(),            # convert numpy array to list
    add_special_tokens=True,       # add [CLS] and [SEP]
    max_length=MAX_LEN,
    padding='max_length',          # pad to MAX_LEN
    truncation=True,
    return_tensors='pt'            # return PyTorch tensors
)

# Input IDs
input_ids = encoded_inputs['input_ids']

print("Encoded Input from dataset: ", input_ids[2])

# Create attention masks: 1 for real tokens, 0 for padding
attention_masks = (input_ids != 0).long()

print(attention_masks[2])

Actual sentence before tokenization:  What's your opinion about Nike's Epic React?  #Promo, #Food, #Trending Really interested in hearing your thoughts!
Encoded Input from dataset:  tensor([  101,  2054,  1005,  1055,  2115,  5448,  2055, 18368,  1005,  1055,
         8680, 10509,  1029,  1001, 19430,  1010,  1001,  2833,  1010,  1001,
         9874,  2075,  2428,  4699,  1999,  4994,  2115,  4301,   999,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,   

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch

# -------------------------------
# 1. Encode sentiment labels into integers
# -------------------------------
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(df_train['sentiment_label'])  # integer encoded

# -------------------------------
# 2. Split into train and validation sets
# -------------------------------
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    input_ids, attention_masks, labels,
    test_size=0.1, random_state=42
)

# -------------------------------
# 3. Convert to torch tensors safely
# -------------------------------
train_inputs = train_inputs.clone() if isinstance(train_inputs, torch.Tensor) else torch.tensor(train_inputs)
validation_inputs = validation_inputs.clone() if isinstance(validation_inputs, torch.Tensor) else torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)
train_masks = train_masks.clone() if isinstance(train_masks, torch.Tensor) else torch.tensor(train_masks)
validation_masks = validation_masks.clone() if isinstance(validation_masks, torch.Tensor) else torch.tensor(validation_masks)

  train_labels = torch.tensor(train_labels, dtype=torch.long)
  validation_labels = torch.tensor(validation_labels, dtype=torch.long)


In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create TensorDataset for training data
train_data = TensorDataset(train_inputs, train_masks, train_labels)

# Create DataLoader for training data
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Create TensorDataset for validation data
# Note: Due to the initial incorrect unpacking in `train_test_split` (cell AIr5PJLFjhYi),
# validation_inputs contains input_ids, validation_masks contains actual labels,
# and validation_labels contains actual attention_masks. We use these variable names
# as they are currently assigned, matching the order used for train_data.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Create DataLoader for validation data
validation_sampler = SequentialSampler(validation_data) # SequentialSampler is typically used for validation/test sets
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

# Access the first example from train_data for verification
print(train_data[0])

(tensor([  101, 13599, 19102,  9088,  3422,  2000,  1996,  2971,  1012,  3811,
        16755,  1012,  1001,  2569,  7245,  2121,  8025,  2055,  2115,  3325,
         2205,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,

In [20]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification

# Initialize BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3  # set this to number of classes
).to(device)

# Parameters
lr = 2e-5
adam_epsilon = 1e-8
epochs = 3

# Total training steps
num_training_steps = len(train_dataloader) * epochs
num_warmup_steps = 0

# Optimizer (torch.optim.AdamW, no correct_bias argument)
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon)

# Learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
train_loss_set = []
learning_rate = []

for epoch in trange(1, epochs + 1, desc="Epoch"):
    print("<" + "="*22 + f" Epoch {epoch} " + "="*22 + ">")
    model.train()
    batch_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Move batch to device
        # Due to incorrect unpacking in a previous cell (AIr5PJLFjhYi),
        # the batch elements from train_dataloader are ordered as
        # (input_ids, actual_labels, actual_attention_masks).
        # We need to pass them to the model as (input_ids, attention_mask, labels).
        b_input_ids = batch[0].to(device)
        b_labels_correct = batch[1].to(device) # This is the actual labels tensor
        b_attention_mask_correct = batch[2].to(device) # This is the actual attention mask tensor

        optimizer.zero_grad()
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=None,
            attention_mask=b_attention_mask_correct, # Pass the correctly identified attention mask
            labels=b_labels_correct # Pass the correctly identified labels
        )

        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_loss += loss.item()

    avg_train_loss = batch_loss / len(train_dataloader)
    train_loss_set.append(avg_train_loss)

    for param_group in optimizer.param_groups:
        learning_rate.append(param_group['lr'])
        print("\n\tCurrent Learning rate:", param_group['lr'])

    print(f"\n\tAverage Training Loss: {avg_train_loss}")

    # -----------------------------
    # Validation
    # -----------------------------
    model.eval()
    eval_accuracy, eval_mcc_accuracy, nb_eval_steps = 0, 0, 0
    all_metrics = []

    # Assuming validation_dataloader is similarly structured due to the prior incorrect split.
    # Note: validation_dataloader was not defined in the current notebook state
    for batch in validation_dataloader:
        # Re-mapping for validation batch as well
        b_input_ids = batch[0].to(device)
        b_labels_correct = batch[1].to(device)
        b_attention_mask_correct = batch[2].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                token_type_ids=None,
                attention_mask=b_attention_mask_correct # Use the correctly assigned attention mask
            )
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        labels_flat = b_labels_correct.detach().cpu().numpy() # Use the correctly assigned labels

        pred_flat = np.argmax(logits, axis=1).flatten()

        all_metrics.append(pd.DataFrame({
            'Epoch': epoch,
            'Actual_class': labels_flat,
            'Predicted_class': pred_flat
        }))

        eval_accuracy += accuracy_score(labels_flat, pred_flat)
        eval_mcc_accuracy += matthews_corrcoef(labels_flat, pred_flat)
        nb_eval_steps += 1

    df_metrics = pd.concat(all_metrics, ignore_index=True)

    print(f"\n\tValidation Accuracy: {eval_accuracy / nb_eval_steps}")
    print(f"\n\tValidation MCC Accuracy: {eval_mcc_accuracy / nb_eval_steps}")

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]


	Current Learning rate: 1.3333333333333333e-05

	Average Training Loss: 0.24401098270363247


Epoch:  33%|███▎      | 1/3 [07:13<14:26, 433.17s/it]


	Validation Accuracy: 0.9416118421052632

	Validation MCC Accuracy: 0.9099459695647577

	Current Learning rate: 6.666666666666667e-06

	Average Training Loss: 0.10647762723483377


Epoch:  67%|██████▋   | 2/3 [14:28<07:14, 434.41s/it]


	Validation Accuracy: 0.9391447368421053

	Validation MCC Accuracy: 0.9047558425101586

	Current Learning rate: 0.0

	Average Training Loss: 0.1044054580287534


Epoch: 100%|██████████| 3/3 [21:43<00:00, 434.52s/it]


	Validation Accuracy: 0.9424342105263158

	Validation MCC Accuracy: 0.9101393005700732





In [22]:
from sklearn.metrics import confusion_matrix,classification_report
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


In [23]:
## emotion labels
label2int = {
  "Negative": 0,
  "Neutral": 1,
  "Positive": 2
}

In [24]:
print(classification_report(df_metrics['Actual_class'].values, df_metrics['Predicted_class'].values, target_names=label2int.keys(), digits=len(label2int)))

              precision    recall  f1-score   support

    Negative      0.961     0.931     0.945       475
     Neutral      1.000     0.863     0.926       211
    Positive      0.907     0.984     0.944       514

    accuracy                          0.942      1200
   macro avg      0.956     0.926     0.939      1200
weighted avg      0.945     0.942     0.941      1200

