<a href="https://colab.research.google.com/github/MRazin172/LLM_Lab4/blob/main/Razin_534_Lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [2]:
df = pd.read_csv('/content/data.csv')
df.head(5)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [3]:
# Convert sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['Sentiment'] = df['Sentiment'].map(label_mapping)

df.head(5)

# Reset index to ensure proper alignment
df = df.reset_index(drop=True)

In [4]:

# Preprocess the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Sentence'], df['Sentiment'], test_size=0.2)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

print(f"Train size: {len(train_texts)}, Validation size: {len(val_texts)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train size: 4673, Validation size: 1169


In [5]:
# Define Dataset
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)  # Use iloc for label indexing
        return item

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [6]:
# Initialize DataLoader
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)


In [7]:
# Load Pretrained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.cuda()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Define Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [9]:
# Training Loop
def train(model, train_loader, optimizer, scheduler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.cuda() for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Training loss: {avg_loss}")

def evaluate(model, val_loader):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.cuda() for key, val in batch.items()}
            outputs = model(**inputs)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).tolist())
            true_labels.extend(inputs['labels'].tolist())
    accuracy = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')
    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation F1 Score: {f1}")

for epoch in range(4):  # Assuming 4 epochs
    print(f"Epoch {epoch+1}")
    train(model, train_loader, optimizer, scheduler)
    evaluate(model, val_loader)

model.save_pretrained("fine-tuned-bert-sentiment")
tokenizer.save_pretrained("fine-tuned-bert-sentiment")

Epoch 1
Training loss: 0.6422398830334888
Validation Accuracy: 0.7818648417450813
Validation F1 Score: 0.7899971159911842
Epoch 2
Training loss: 0.3446295430737551
Validation Accuracy: 0.7989734816082121
Validation F1 Score: 0.7884691998662684
Epoch 3
Training loss: 0.2325158577711153
Validation Accuracy: 0.7904191616766467
Validation F1 Score: 0.7950424423958592
Epoch 4
Training loss: 0.18033815763649172
Validation Accuracy: 0.7887082976903336
Validation F1 Score: 0.7919462174190308


('fine-tuned-bert-sentiment/tokenizer_config.json',
 'fine-tuned-bert-sentiment/special_tokens_map.json',
 'fine-tuned-bert-sentiment/vocab.txt',
 'fine-tuned-bert-sentiment/added_tokens.json')

In [10]:
def predict_sentiment(text):
    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.cuda() for key, val in encoding.items()}
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction

text = "The company's stock price increased significantly after the announcement of a new product line"
print(predict_sentiment(text))

2
