<a href="https://colab.research.google.com/github/Harshanand7/ASVA.AI/blob/main/bert_model_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## I have used the BERT LLM in this approach to do the text classification task. I first imported the base model and later on fine-tuned it.


In [None]:
!pip install transformers -U



In [None]:
!pip install accelerate -U



In [None]:
!pip install datasets



# Importing the necessary libraries and setting up the environment

In [None]:
import pandas as pd
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification

## DATASET:-
# I have used the OLID dataset for my task. OLID is the most widely used and best-known dataset for identifying OFFENSIVE language on social media. It was prepared from approximately 14,000 tweets. The version I used already contains the processed tweets, so I did not need to preprocess the dataset myself: the 'cleaned_tweet' column holds the processed tweets.

In [None]:


# Download the OLID dataset from the Hugging Face Hub repository "christophsonntag/OLID".
dataset = load_dataset("christophsonntag/OLID")

In [None]:
# Convert the "train" split to a pandas DataFrame.
df = dataset["train"].to_pandas()
# Keep only the tweet text and the subtask A label. The explicit .copy()
# makes this an independent frame, so assigning to its columns later does
# not raise pandas' SettingWithCopyWarning.
df = df[['cleaned_tweet', 'subtask_a']].copy()
df.head()

## Mapping of the dataset
# 1 as Offensive
# 0 as Non Offensive

In [None]:


# Encode the 'subtask_a' labels as integers: 'OFF' becomes 1, 'NOT' becomes 0.
label_ids = {'OFF': 1, 'NOT': 0}
df['subtask_a'] = df['subtask_a'].map(label_ids)

# Show the first few rows to confirm the change.
print(df.head())

In [None]:
# Persist the processed dataset (without the index column) for later reuse.
output_file = 'modified_tweets.csv'
df.to_csv(path_or_buf=output_file, index=False)

In [None]:

# Reload the processed dataset that was written to 'modified_tweets.csv'.
# The path is relative (same as the one used when saving), so the cell also
# works when the working directory is not Colab's /content.
data = pd.read_csv("modified_tweets.csv", engine="python")
data = data[['cleaned_tweet', 'subtask_a']]
# NOTE(review): empty tweet cells presumably come back from the CSV as NaN;
# confirm whether such rows should be dropped before training.
# The preview is the last expression so the notebook actually displays it.
data.head()

In [None]:

# Work with the first 2000 rows only (presumably to keep fine-tuning fast).
data = data.iloc[:2000]
data.head()

## Importing the base model and tokenizer

In [None]:

# Load the pretrained tokenizer and a BERT sequence classifier with a
# two-class head (non-offensive / offensive) from the same checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Display the model's layer structure.
model

## Memory Management
# For better and more efficient memory and processor management, I have used CUDA (the GPU).

In [None]:
# Move the model to the GPU when one is available; fall back to the CPU so
# the notebook does not crash on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# Quick tokenizer demo on two short sentences, using the same padding,
# truncation and max_length settings as the real data below.
sample_data = ["I am eating","I am playing "]
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

## Splitting the dataset into train and eval sets and converting the texts to strings for the tokenizer

In [None]:
# Pull the tweet texts and their 0/1 labels out of the frame as plain lists.
X = list(data["cleaned_tweet"])
y = list(data["subtask_a"])

# Stratified 80/20 train/validation split. The fixed random_state makes the
# split, and therefore the reported metrics, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


def _tokenize(texts):
    """Tokenize a list of texts with the shared padding/truncation settings.

    Every element is passed through ``str()`` first, because the tokenizer
    only accepts strings (presumably some cells are non-string, e.g. NaN).
    """
    return tokenizer(
        [str(text) for text in texts],
        padding=True,
        truncation=True,
        max_length=512,
    )


X_train_tokenized = _tokenize(X_train)
X_val_tokenized = _tokenize(X_val)

In [None]:
# List the fields the tokenizer produced for the training texts.
X_train_tokenized.keys()

In [None]:
# Print the attention mask of the first training example.
print(X_train_tokenized['attention_mask'][0])

In [None]:
# Number of training and validation examples after the split.
len(X_train),len(X_val)

In [None]:
# creating torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output and optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (e.g. the tokenizer output). It must contain ``"input_ids"``,
            which determines the dataset length.
        labels: Optional per-example labels (list, numpy array, tensor, ...).
            ``None`` or an empty sequence means the data is unlabeled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one entry per key in ``encodings`` and, when labels
        were supplied, an additional ``"labels"`` entry.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None / emptiness explicitly instead of using truthiness:
        # `if labels:` raises for numpy arrays and tensors with more than
        # one element because their truth value is ambiguous.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``"input_ids"``)."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized texts and their labels as torch datasets for the Trainer.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
# Look at one training example (index 5) as the Trainer will receive it.
train_dataset[5]

## Metrics for checking the model performance

In [None]:
def compute_metrics(p):
    """Compute binary classification metrics for the Trainer.

    Args:
        p: A pair ``(predictions, labels)``; ``predictions`` holds one row
            of class scores per example and ``labels`` the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    scores, labels = p
    # The predicted class is the index of the highest score in each row.
    pred = np.argmax(scores, axis=1)

    # zero_division=0 keeps the same 0.0 result sklearn returns by default
    # when a class is never predicted/present, but without the warning.
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred, zero_division=0),
        "recall": recall_score(y_true=labels, y_pred=pred, zero_division=0),
        "f1": f1_score(y_true=labels, y_pred=pred, zero_division=0),
    }

## Fine-tuning the base model using the Trainer from the transformers library

In [None]:
# Defining the Trainer
import os

# Request synchronous CUDA kernel launches (a debugging aid).
# NOTE(review): presumably only effective if set before CUDA is
# initialised - confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Hyper-parameters of the fine-tuning run
training_options = dict(
    output_dir="output",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    logging_steps=20,  # log every 20 steps
    report_to="none",  # no external reporting; rely on the printed logs
)
args = TrainingArguments(**training_options)

# Tie together the model, its data and the metric function
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Fine-tune the model on the training dataset
trainer.train()


In [None]:
# Evaluate the model on the validation dataset (metrics come from compute_metrics)
trainer.evaluate()


In [None]:
# Print numpy floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Check the fine-tuned model on an example prompt
text = "That was good point"
# Put the inputs on whatever device the model lives on, so this works on
# both GPU and CPU runtimes.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(model.device)
model.eval()  # disable dropout so the prediction is deterministic
with torch.no_grad():  # inference only: no autograd graph is needed
    outputs = model(**inputs)
print(outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
predictions = predictions.cpu().numpy()
# predictions has one row per input text. In each row the first element is
# the probability that the text is not offensive and the second element is
# the probability that it is offensive.
non_offensive_prob_1 = predictions[0][0]

In [None]:
# Decision rule: report the text as non-offensive only when the model gives
# the non-offensive class a probability of at least 0.67.
verdict = (
    "The given text is likely to be classified as NON Offensive text"
    if non_offensive_prob_1 >= 0.67
    else "The given text is probably OFFENSIVE text"
)
print(verdict)
# (classified input: text = "That was good point")

In [None]:
# Save the fine-tuned model to 'CustomModel' so it can be reused without repeating the training
trainer.save_model('CustomModel')

In [None]:
# Reload the fine-tuned model that was saved to 'CustomModel'
model_2 = BertForSequenceClassification.from_pretrained("CustomModel")
# Use the GPU when one is available; fall back to the CPU so the cell does
# not crash on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_2.to(device)

In [None]:
# Check the reloaded model on a test case
text = "you are a fool"
# Put the inputs on whatever device the model lives on, so this works on
# both GPU and CPU runtimes.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(model_2.device)
model_2.eval()  # disable dropout so the prediction is deterministic
with torch.no_grad():  # inference only: no autograd graph is needed
    outputs = model_2(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().numpy()
# predictions has one row per input text. Each row is an array of two
# elements: the first is the probability that the text is not offensive and
# the second is the probability that it is offensive.
non_offensive_prob = predictions[0][0]

In [None]:
# Decision rule: report the text as non-offensive only when the model gives
# the non-offensive class a probability of at least 0.67.
verdict = (
    "The given text is likely to be classified as NON Offensive text"
    if non_offensive_prob >= 0.67
    else "The given text is probably OFFENSIVE text"
)
print(verdict)
# (classified input: text = "you are a fool")