<a href="https://colab.research.google.com/github/HamzaWajid1/Amazon_Customer_Review_Sentiment_Detector/blob/master/Training_Bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SENTIMENT ANALYSIS USING THE POWER OF NLP**


In this notebooke I will be doing sentiment analysis using following techniques

1: Bag of word approach

2: Roberta pretrained Model

3: Huggingface Pipeline

**READING DATA**

Roberta Pretrained Model

Use a model trained of a large corpus of data.

Transformer model accounts for the words but also the context related to other words.

In [3]:
import pandas as pd
import os
import re
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [5]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [6]:
df=pd.read_csv("/content/drive/MyDrive/CSV Files/file.csv")

In [7]:
df.head(1)

Unnamed: 0,Id,neg,neu,pos,compound,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,0.0,0.695,0.305,0.9441,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...


**PREPARING DATA**

In [8]:
X=df['Text'].values
X
Y=df['Score'].values.reshape(-1,1)

Y

array([[5],
       [1],
       [4],
       ...,
       [5],
       [3],
       [5]])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val =train_test_split(X, Y, test_size=0.05, random_state=2020)

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Convert labels to one-hot encoding
encoder = OneHotEncoder(sparse=False)
labels_one_hot_train = encoder.fit_transform(y_train)
labels_one_hot_val = encoder.fit_transform(y_val)



In [11]:
import torch

In [12]:
labels_train = torch.tensor(labels_one_hot_train, dtype=torch.float32)
labels_train.shape

torch.Size([19000, 5])

In [13]:
labels_test = torch.tensor(labels_one_hot_val, dtype=torch.float32)
labels_test.shape

torch.Size([1000, 5])

**TEXT PREPROCESSING**

In [14]:
import nltk
# Uncomment to download "stopwords"
nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r'  ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**IMPORTING TORCH**

In [15]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


**USING BERT TOKENIDER AND CONVERT THE TEXT INTO REQUIRED FORMAT FOR MODEL FINETUNING**

In [16]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            #truncation=True,
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks










The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [17]:
# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in X_train]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


Max length:  2534


In [18]:
# Specify `MAX_LEN`
MAX_LEN = 508

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[50])
print('Token IDs: ', token_ids)
#print('Mask: ',mask)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.
Token IDs:  [101, 4149, 2195, 8995, 3012, 27141, 3899, 2833, 3688, 2179, 2204, 3737, 4031, 3504, 2066, 20717, 13995, 6240, 14747, 2488, 18604, 10346, 6799, 2100, 9120, 2015, 4031, 2488, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 



**NOW THE REAL MAGIC IS GOING TO BEGIN**

In [19]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler, SequentialSampler

In [20]:
train_dataset = TensorDataset(train_inputs, train_masks, labels_train)
val_dataset = TensorDataset(val_inputs, val_masks, labels_test)

In [21]:
# Specify batch size
batch_size = 8

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Check the size of the datasets
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Training dataset size: 19000
Validation dataset size: 1000


In [22]:
%pip install accelerate -U
%pip install transformers[torch]




In [23]:
!pip install transformers[torch] -U
!pip install accelerate -U



In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=5
)

training_arguments = TrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: vars() argument must have __dict__ attribute

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Assume you have train_encodings, train_labels, eval_encodings, eval_labels ready



# Model initialization
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=5
)

# Loss function
loss_fn = BCEWithLogitsLoss()

# Metrics function
def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    return {"f1_score": f1_score(labels, predictions > 0.5, average="micro")}

# Training arguments
training_arguments = TrainingArguments(
    output_dir="./content/drive/MyDrive/AI_models",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    save_total_limit=1,
    logging_dir="./your_logging_directory",
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    compute_loss=lambda model, inputs: loss_fn(model(**inputs).logits, inputs["labels"]),
)

# Start training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: Trainer.__init__() got an unexpected keyword argument 'compute_loss'

In [None]:
from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split


model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=5
)

training_arguments = TrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

In [None]:
from transformers import BertForSequenceClassification, AdamW

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # Adjust the number of labels based on your rating scale

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Set up the loss function (CrossEntropyLoss handles the softmax activation internally)
criterion = torch.nn.CrossEntropyLoss()

# Specify the number of training epochs
epochs = 3

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': torch.argmax(batch[2], dim=1)}  # Use argmax to get the class indices
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {average_loss}")

# Validation loop
model.eval()
total_val_loss = 0

with torch.no_grad():
    for batch in val_dataloader:
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': torch.argmax(batch[2], dim=1)}  # Use argmax to get the class indices
        outputs = model(**inputs)
        val_loss = outputs.loss
        total_val_loss += val_loss.item()

average_val_loss = total_val_loss / len(val_dataloader)
print(f"Average Validation Loss: {average_val_loss}")
