# Initialization

In [2]:
pip install datasets



In [3]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd
from tqdm import tqdm

from transformers import BertTokenizer, BertModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn

dataset = load_dataset("sst2")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})


In [4]:
df = pd.DataFrame(dataset["train"])
df

Unnamed: 0,idx,sentence,label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...
67344,67344,a delightful comedy,1
67345,67345,"anguish , anger and frustration",0
67346,67346,"at achieving the modest , crowd-pleasing goals...",1
67347,67347,a patient viewer,1


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1, stratify=df['label'])
train_df

Unnamed: 0,idx,sentence,label
18071,18071,poses for itself that one can forgive the film...,1
22251,22251,it 's tough to tell which is in more abundant ...,0
13938,13938,the first movie,1
58240,58240,charms,1
39721,39721,the narrator and the other characters try to c...,0
...,...,...,...
14608,14608,because we 've seen ( eddie ) murphy do the ge...,0
12394,12394,"so rhapsodize cynicism , with repetition and l...",0
36785,36785,"there is a beautiful , aching sadness to it al...",1
62276,62276,a true pleasure .,1


In [6]:
test_df

Unnamed: 0,idx,sentence,label
5536,5536,carvey 's considerable talents are wasted in it,0
31437,31437,wrap the proceedings up neatly,1
44037,44037,irritates and saddens,0
27492,27492,a magnetic performance,1
35062,35062,as plain and pedestrian as,0
...,...,...,...
42806,42806,they 're just a couple of cops in copmovieland...,1
2733,2733,the perfect festival film,1
55173,55173,sneak out of the theater,0
55483,55483,has never looked uglier,0


In [7]:
train_sentences = list(train_df['sentence'])
train_labels = list(train_df['label'])

# Fine Tuning the Bert Model for Classification



In [10]:
tokenizer = BertTokenizer.from_pretrained('PinkiKumari22/FinalPretrainedModel')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the dataset and dataloader
dataset_new = CustomDataset(train_sentences, train_labels, tokenizer)
data_loader = DataLoader(dataset_new, batch_size=16, shuffle=True)

In [None]:
from transformers import BertForSequenceClassification

In [None]:
model = BertForSequenceClassification.from_pretrained('PinkiKumari22/FinalPretrainedModel', num_labels=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
outputs = None
model.train()
criterion = nn.CrossEntropyLoss()
for epoch in range(3):  # Adjust the number of epochs
    epoch_loss = 0
    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch+1}", leave=False, disable=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        model.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_scores = outputs.logits
        loss = criterion(prediction_scores, labels)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Update progress bar
        progress_bar.set_description(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1}, Average Loss: {epoch_loss / len(data_loader)}")

In [None]:
# Save the model
model.save_pretrained("fine_tuned_bert_sst2_2")
model.push_to_hub("finetuned_classification")

# Evaluation after Fine Tuning

In [11]:
from transformers import BertForSequenceClassification, BertTokenizer
model = BertForSequenceClassification.from_pretrained('PinkiKumari22/finetuned_classification', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('PinkiKumari22/FinalPretrainedModel')

In [12]:
# Function to predict the sentiment
def predict_sentiment(model,input_text):
  # Tokenize the input text
  inputs = tokenizer.encode_plus(
      input_text,
      None,
      add_special_tokens=True,
      max_length=512,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
  )
  input_ids = inputs['input_ids']
  attention_mask = inputs['attention_mask']

  # Predict
  with torch.no_grad():
      # Use logits directly
      outputs = model(input_ids, attention_mask=attention_mask)
      prediction = torch.argmax(outputs.logits, dim=1).item()

  return prediction

In [28]:
def predict(model,dataset):
  predictions = []
  batch_size = 32
  # Iterate through the dataset in batches
  for i in range(0, len(dataset), batch_size):
      batch = dataset[i:i + batch_size]

      # Tokenize the batch
      batch_encoding = tokenizer(
          batch,
          max_length=512,
          padding='max_length',
          truncation=True,
          return_attention_mask=True,
          return_tensors='pt'
      )

      # Obtain input_ids and attention_mask
      input_ids = batch_encoding['input_ids']
      attention_mask = batch_encoding['attention_mask']

      # Predict
      with torch.no_grad():
          outputs = model(input_ids, attention_mask=attention_mask)
          predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
  return predictions

In [29]:
# Example usage
input_string = "Though he is not a good writter, he is a brilliant cricketer"
predicted_class = predict_sentiment(model,input_string)



In [32]:
pred = predict(model,list(df["sentence"][:1000]))

In [33]:
len(pred)

1000

In [34]:
test_labels = list(test_df["label"][:1000])

In [35]:
#!pip install evaluate
import evaluate

In [37]:
recall_metric = evaluate.load('recall')
recall = recall_metric.compute(references = test_labels, predictions = pred)

precision_metric = evaluate.load("precision")
prec = precision_metric.compute(references = test_labels, predictions = pred)

accuracy_metric = evaluate.load("accuracy")
acc = accuracy_metric.compute(references = test_labels, predictions = pred)

f1_metric = evaluate.load("f1")
F1 = f1_metric.compute(references = test_labels, predictions = pred)

In [38]:
recall, prec, acc, F1

({'recall': 1.0},
 {'precision': 0.551},
 {'accuracy': 0.551},
 {'f1': 0.7105093488072212})