<a href="https://colab.research.google.com/github/Jaya2404/Text-Classification-Using-BERT/blob/main/TextClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get the data ready

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install Huggingface Transformers library
#!pip install transformers

# If you meet problems below, restart your kernel and try this instead
!pip install transformers[torch]



In [3]:
# Import necessary libraries
import os
from os.path import join
import numpy as np
import pandas as pd
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available

In [4]:
train_data = pd.read_csv("/content/drive/My Drive/Corona_NLP_train.csv",encoding='latin-1')
test_data = pd.read_csv("/content/drive/My Drive/Corona_NLP_test.csv",encoding='latin-1')

# New Section

In [5]:
# set the seed (very important)
seed = 35

In [6]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(seed)

In [7]:
# Extract the 'OriginalTweet' column as lists of strings
train_text = train_data['OriginalTweet'].tolist()
test_text = test_data['OriginalTweet'].tolist()

# Replace 'Extremely Positive' with 'Positive' and 'Extremely Negative' with 'Negative'
train_data['Sentiment'] = train_data['Sentiment'].replace({'Extremely Positive': 'Positive', 'Extremely Negative': 'Negative'})
test_data['Sentiment'] = test_data['Sentiment'].replace({'Extremely Positive': 'Positive', 'Extremely Negative': 'Negative'})

# Drop rows with 'Neutral' sentiment
train_data = train_data[train_data['Sentiment'] != 'Neutral']
test_data = test_data[test_data['Sentiment'] != 'Neutral']

# Convert 'Positive' to 1 and 'Negative' to 0
train_data['Sentiment'] = train_data['Sentiment'].map({'Positive': 1, 'Negative': 0})
test_data['Sentiment'] = test_data['Sentiment'].map({'Positive': 1, 'Negative': 0})

# Convert the 'Sentiment' column to numpy arrays
train_label = train_data['Sentiment'].values
test_label = test_data['Sentiment'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Sentiment'] = train_data['Sentiment'].map({'Positive': 1, 'Negative': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Sentiment'] = test_data['Sentiment'].map({'Positive': 1, 'Negative': 0})


In [8]:
# download pretrained BERT Tokenizer
from transformers import BertTokenizer
# credits to https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
model_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# download pretrained BERT model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Your Code Here
# Filling the number of labels in this classification task.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Information of BERT model
def get_model_layer(model):
  return model.config.num_hidden_layers

def get_hidden_size(model):
  return model.config.hidden_size

In [11]:
# Move the model to GPU
# Revisit the GPU tutorial if you meets error in this cell
model = model.to('cuda')

In [12]:
# Metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
  return {
      'accuracy': acc,
      "tn": tn,
      "fp": fp,
      "fn": fn,
      "tp": tp
  }

In [13]:
# Change the batch size here
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
                                     # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=100,               # log & save weights each logging_steps
    save_steps=100,
    evaluation_strategy="steps",     # evaluate each `logging_steps`?
)


In [14]:
# Encode the data

max_length = 128
train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_text, truncation=True, padding=True, max_length=max_length)

In [16]:
# Build the dataset

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = TweetDataset(train_encodings, train_label)
test_dataset = TweetDataset(test_encodings, test_label)

In [17]:
# Setup the trainer

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [18]:
# train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Tn,Fp,Fn,Tp
100,0.6972,0.697265,0.4838,29,1604,37,1509
200,0.6936,0.69516,0.484429,993,640,999,547
300,0.6952,0.712684,0.486316,0,1633,0,1546
400,0.6968,0.695161,0.489777,291,1342,280,1266
500,0.694,0.696961,0.486316,0,1633,0,1546
600,0.6957,0.698327,0.486316,0,1633,0,1546
700,0.6967,0.699595,0.486946,3,1630,1,1545
800,0.692,0.696408,0.486946,11,1622,9,1537
900,0.6938,0.699657,0.486316,0,1633,0,1546
1000,0.6919,0.695284,0.486946,52,1581,50,1496


TrainOutput(global_step=1569, training_loss=0.6916597827062856, metrics={'train_runtime': 2458.7697, 'train_samples_per_second': 40.806, 'train_steps_per_second': 0.638, 'total_flos': 6599614601594880.0, 'train_loss': 0.6916597827062856, 'epoch': 3.0})

In [19]:
# evaluate the model
trainer.evaluate()

{'eval_loss': 0.6951601505279541,
 'eval_accuracy': 0.4844290657439446,
 'eval_tn': 993,
 'eval_fp': 640,
 'eval_fn': 999,
 'eval_tp': 547,
 'eval_runtime': 24.6384,
 'eval_samples_per_second': 129.026,
 'eval_steps_per_second': 2.029,
 'epoch': 3.0}

In [20]:
def get_prediction(model, text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return probs