## DATA PRE PROCESSING

In [1]:
!pip install tensorflow tensorflow-text transformers pandas



In [2]:
import re
import tensorflow as tf
import tensorflow_text as tf_text
import pandas as pd
from transformers import BertTokenizer

In [None]:
df = pd.read_csv("/content/complete_reviews_dataset.csv")

In [None]:
df["Username"] = df["Username"].fillna(lambda x: f"User_{x.name}")
df.dropna(subset=["Review"], inplace=True)

In [None]:
# Ensure no missing values

df["Review"] = df["Review"].fillna("")

In [None]:
# Define text cleaning function using TensorFlow

def clean_text_tf(text):
    text = tf.strings.lower(text)  # Convert to lowercase
    text = tf.strings.regex_replace(text, r"[^a-zA-Z\s]", "")  # Remove special characters, numbers

    # Tokenization (Whitespace-based)

    tokenizer = tf_text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(text)

    # Remove stopwords (Basic stopword list)

    stopwords = set(["the", "is", "in", "and", "to", "a", "of", "for", "on", "at", "with", "this", "it", "that", "as", "are", "was", "by"])  # Example list
    filtered_tokens = tf.ragged.boolean_mask(tokens, ~tf.reduce_any(tf.equal(tokens[:, None], list(stopwords)), axis=-1))

    # Join back into a sentence

    cleaned_text = tf.strings.reduce_join(filtered_tokens, separator=" ")

    return cleaned_text.numpy().decode('utf-8')

In [None]:
# Apply text cleaning

df["Cleaned_Review"] = df["Review"].apply(lambda x: clean_text_tf(x))

In [None]:
# Load BERT tokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Convert cleaned text into tokenized inputs for BERT

df["Tokenized_Review"] = df["Cleaned_Review"].apply(lambda x: tokenizer(x, padding="max_length", truncation=True, max_length=50, return_tensors="pt")["input_ids"].squeeze().tolist())

In [None]:
# Save processed data

df.to_csv("processed_reviews.csv", index=False)

print("Preprocessing complete! ✅")

Preprocessing complete! ✅


## GENERATING LABELS USING FACEBOOK'S BART

Source Code: HuggingFace.com

In [3]:
pip install transformers



In [4]:
from transformers import pipeline

In [5]:
# Load zero-shot classification model

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [6]:
# Example reviews

reviews = [
    "I love my job, it makes me feel fulfilled and happy!",
    "This is the worst place I have ever worked.",
    "It's an average workplace, nothing too special.",
]

In [7]:
# Define emotion labels

emotion_labels = ["happy", "joyous", "sad", "angry", "frustrated", "excited", "disappointed", "hopeful", "relieved", "surprised"]

In [8]:
# Classify reviews

for review in reviews:
    result = classifier(review, emotion_labels)
    print(f"Review: {review}")
    print(f"Predicted Emotion: {result['labels'][0]} (Confidence: {result['scores'][0]:.3f})\n")

Review: I love my job, it makes me feel fulfilled and happy!
Predicted Emotion: joyous (Confidence: 0.397)

Review: This is the worst place I have ever worked.
Predicted Emotion: disappointed (Confidence: 0.391)

Review: It's an average workplace, nothing too special.
Predicted Emotion: disappointed (Confidence: 0.520)



In [9]:
import pandas as pd

df = pd.read_csv("/content/processed_reviews_3.csv")

In [None]:
# # Define expanded emotion labels

# emotion_labels = ["happy", "joyous", "sad", "angry", "frustrated", "excited", "disappointed", "hopeful", "relieved", "surprised"]

# # Apply zero-shot classification

# df["Emotion"] = df["Review"].apply(lambda x: classifier(x, emotion_labels)["labels"][0])

# # Save labeled dataset

# df.to_csv("zero_shot_labeled_reviews.csv", index=False)

# print("Zero-shot classified dataset saved successfully!")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# import torch

# # Ensure GPU is used if available

# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Define expanded emotion labels
# emotion_labels = ["happy", "joyous", "sad", "angry", "frustrated", "excited", "disappointed", "hopeful", "relieved", "surprised"]

# # Apply zero-shot classification in batches
# def classify_batch(reviews):
#     results = classifier(reviews, emotion_labels)
#     return [res["labels"][0] for res in results]

# # Process data in chunks of 100 to speed up

# batch_size = 100
# df["Emotion"] = df["Review"].groupby(df.index // batch_size).transform(classify_batch)

# # Save labeled dataset

# df.to_csv("zero_shot_labeled_reviews.csv", index=False)

# print("Zero-shot classified dataset saved successfully!")

In [10]:
import torch
import pandas as pd
from transformers import pipeline

# Ensure GPU is used if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load zero-shot classification model on GPU if available
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if device == "cuda" else -1  # 0 = GPU, -1 = CPU
)

# Define expanded emotion labels
emotion_labels = ["happy", "joyous", "sad", "angry", "frustrated", "excited", "disappointed", "hopeful", "relieved", "surprised"]

# Function to classify a batch of reviews
def classify_batch(reviews):
    results = classifier(reviews, emotion_labels, truncation=True)  # Enable truncation for long text
    return [res["labels"][0] for res in results]

# Load dataset
df = pd.read_csv("/content/processed_reviews_3.csv")

# Process data in batches
batch_size = 100  # Adjust batch size based on your GPU/CPU
emotions = []

for i in range(0, len(df), batch_size):
    batch_reviews = df["Review"].iloc[i : i + batch_size].tolist()
    batch_emotions = classify_batch(batch_reviews)
    emotions.extend(batch_emotions)

# Assign results back to DataFrame
df["Emotion"] = emotions

# Save labeled dataset
df.to_csv("zero_shot_labeled_reviews.csv", index=False)

print("Zero-shot classified dataset saved successfully! ✅")

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Zero-shot classified dataset saved successfully! ✅


In [11]:
# Print the first 10 rows of the dataset

print(df.head(10))

                                Username  Total reviews  \
0                           Saman Karimi           18.0   
1                     Take a closer look           18.0   
2  <function <lambda> at 0x7b3256b0ec00>           10.0   
3                          Madeline Hall            1.0   
4                        Christy Mashore            2.0   
5                                     LA            1.0   
6                                 Laurie            3.0   
7                                     NI            1.0   
8                                 Nicole            4.0   
9                   See all10,872reviews            1.0   

                       Date  \
0  2025-03-11T02:19:00.000Z   
1  2025-03-11T20:29:17.000Z   
2  2020-05-17T00:48:31.000Z   
3  2020-05-17T00:42:15.000Z   
4  2020-05-16T22:38:27.000Z   
5  2020-05-16T17:14:29.000Z   
6  2020-05-16T16:40:40.000Z   
7  2020-05-16T15:50:54.000Z   
8  2020-05-16T11:29:42.000Z   
9  2020-05-15T20:27:38.000Z   

        

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Load manually labeled dataset

df = pd.read_csv("/content/zero_shot_labeled_reviews.csv")

In [None]:
# # Convert text labels to numerical labels

# label_map = {"happy": 0, "joyous": 1, "sad": 2, "angry": 3}
# df["label"] = df["Emotion"].map(label_map)

In [None]:
# Define the expanded list of emotion labels

emotion_labels = ["happy", "joyous", "sad", "angry", "frustrated", "excited", "disappointed", "hopeful", "relieved", "surprised"]

# Generate a dynamic mapping of emotions to numerical labels

label_map = {emotion: idx for idx, emotion in enumerate(emotion_labels)}

# Convert text labels to numerical labels using the updated mapping

df["label"] = df["Emotion"].map(label_map)

print("Updated Label Mapping:", label_map)  # Optional: Print mapping for verification

Updated Label Mapping: {'happy': 0, 'joyous': 1, 'sad': 2, 'angry': 3, 'frustrated': 4, 'excited': 5, 'disappointed': 6, 'hopeful': 7, 'relieved': 8, 'surprised': 9}


In [None]:
# Split data into training and validation sets

train_texts, val_texts, train_labels, val_labels = train_test_split(df["Review"], df["label"], test_size=0.2, random_state=42)

In [None]:
# Load BERT tokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the dataset

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

In [None]:
# Convert dataset to Hugging Face format

train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": train_labels.tolist()})
val_dataset = Dataset.from_dict({"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"], "labels": val_labels.tolist()})

In [None]:
# Load pre-trained BERT model for classification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training arguments

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # ✅ Disable W&B logging
)




In [None]:
# Define Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
print(set(train_dataset["labels"]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [None]:
!kill -9 $(nvidia-smi | awk '$2=="Processes:" {p=1} p && $2 ~ /^[0-9]+$/ {print $2}')

kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_PROJECT"] = "disabled"

# Now train the model
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_PROJECT"] = "disabled"

# Now train the model
trainer.train()

In [None]:
# Save the fine-tuned model

model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")

print("Fine-tuning completed! Model saved.")

Fine-tuning completed! Model saved.


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [None]:
# Load fine-tuned model

model = BertForSequenceClassification.from_pretrained("fine_tuned_bert")
tokenizer = BertTokenizer.from_pretrained("fine_tuned_bert")

In [None]:
# Define function to predict emotions

def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()
    label_map = {0: "happy", 1: "joyous", 2: "sad", 3: "angry"}
    return label_map[predicted_class]

In [None]:
# Example predictions

new_reviews = [
    "I feel so satisfied with my work!",
    "This place makes me miserable.",
]

for review in new_reviews:
    print(f"Review: {review} -> Predicted Emotion: {predict_emotion(review)}")

Review: I feel so satisfied with my work! -> Predicted Emotion: happy
Review: This place makes me miserable. -> Predicted Emotion: sad
