<a href="https://colab.research.google.com/github/Guhan2348519/LLM-lab-tasks/blob/main/2348519_LLM_lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import f1_score
import os

In [11]:
!pip install transformers
!pip install tqdm
!pip install scikit-learn



In [13]:
df = pd.read_csv("smile-annotations-final.csv", names=['id', 'text', 'category'])
df.set_index('id', inplace=True)
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

label_dict = {label: idx for idx, label in enumerate(df.category.unique())}
df['label'] = df.category.replace(label_dict)

X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=17, stratify=df.label.values)
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [25]:
# Remove any rows where the category is 'nocode' or contains '|'
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

# Create a mapping from category names to indices
label_dict = {label: i for i, label in enumerate(df.category.unique())}

# Add the label column
df['label'] = df.category.replace(label_dict)

# Count the number of classes
num_classes = len(label_dict)
print(f"Number of classes: {num_classes}")

# Count the number of instances in each class
class_counts = df['category'].value_counts()
print("Number of instances in each class:")
print(class_counts)

Number of classes: 6
Number of instances in each class:
category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def encode_data(texts):
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

encoded_data_train = encode_data(df[df.data_type == 'train'].text.values)
encoded_data_val = encode_data(df[df.data_type == 'val'].text.values)

input_ids_train, attention_masks_train, labels_train = encoded_data_train['input_ids'], encoded_data_train['attention_mask'], torch.tensor(df[df.data_type == 'train'].label.values)
input_ids_val, attention_masks_val, labels_val = encoded_data_val['input_ids'], encoded_data_val['attention_mask'], torch.tensor(df[df.data_type == 'val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_dict))
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
batch_size = 32
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

In [17]:

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * 10)



In [18]:
def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        with torch.no_grad():
            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss, logits = outputs[:2]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = batch[2].detach().cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return loss_val_avg, predictions, true_vals

In [22]:

import os
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pandas as pd

# Ensure output directory exists
os.makedirs('Models', exist_ok=True)

# Task 2: Data Preprocessing
df = pd.read_csv("smile-annotations-final.csv", names=['id', 'text', 'category'])
df.set_index('id', inplace=True)
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

label_dict = {label: i for i, label in enumerate(df.category.unique())}
df['label'] = df.category.replace(label_dict)

X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values
)

df['data_type'] = ['not_set'] * df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Task 4: Tokenization and Encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Task 5: Setting up BERT Pretrained Model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

# Task 6: Creating Data Loaders
batch_size = 32
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

# Task 7: Setting Up Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs
)

# Task 8: Performance Metrics
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {labels_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label]) / len(y_true)}\n')

# Task 9: Training Loop
def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    # Ensure predictions and true_vals are not empty
    if predictions and true_vals:
        predictions = np.concatenate(predictions, axis=0)
        true_vals = np.concatenate(true_vals, axis=0)
    else:
        predictions, true_vals = np.array([]), np.array([])

    return loss_val_avg, predictions, true_vals

def train(model, dataloader_train, dataloader_val, epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss_train = 0

        for batch in tqdm(dataloader_train, desc=f"Epoch {epoch}"):
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            total_loss_train += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Training loss for epoch {epoch}: {total_loss_train / len(dataloader_train)}")

        val_loss, predictions, true_vals = evaluate(dataloader_val)
        print(f"Validation loss for epoch {epoch}: {val_loss}")

        # Debugging print statements
        print(f"Predictions shape: {predictions.shape}")
        print(f"True values shape: {true_vals.shape}")

        if predictions.size > 0 and true_vals.size > 0:
            f1 = f1_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
            print(f"F1 score (weighted) for epoch {epoch}: {f1}")
        else:
            print("Predictions or true values are empty, skipping F1 score calculation.")

        torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')

train(model, dataloader_train, dataloader_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 1: 1.1809483200311661


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 1: 0.8148801411901202
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 1: 0.6656119824269878


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 2: 0.7596929095685482


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 2: 0.7095231754439217
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 2: 0.7043271626231267


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 3: 0.6503830991685391


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 3: 0.6152347113404956
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 3: 0.7314489758853908


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 4: 0.5371360644698143


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 4: 0.5572104752063751
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 4: 0.7756946654933844


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 5: 0.4477928079664707


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 5: 0.5316080749034882
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 5: 0.7977705560901218


Epoch 6:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 6: 0.38095438964664935


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 6: 0.5594852992466518
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 6: 0.7873029351503148


Epoch 7:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 7: 0.33365769162774084


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 7: 0.5226479193993977
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 7: 0.8106426604434972


Epoch 8:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 8: 0.29681723453104497


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 8: 0.5064415016344616
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 8: 0.7983787512935495


Epoch 9:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 9: 0.2733803365379572


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 9: 0.5090330711432866
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 9: 0.7997960328576575


Epoch 10:   0%|          | 0/40 [00:00<?, ?it/s]

Training loss for epoch 10: 0.2567250721156597


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 10: 0.5110490300825664
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 10: 0.8127556107250772


In [24]:
# Task 10: Loading and Evaluating our Model
def load_and_evaluate(model_path, dataloader_val):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label_dict),
        output_attentions=False,
        output_hidden_states=False
    )

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    print(f"Validation loss: {val_loss}")

    if predictions.size > 0 and true_vals.size > 0:
        f1 = f1_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
        print(f"F1 score (weighted): {f1}")
        accuracy_per_class(predictions, true_vals)
    else:
        print("Predictions or true values are empty, skipping evaluation metrics.")


# Load and evaluate the model
load_and_evaluate('Models/BERT_ft_epoch1.model', dataloader_val)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5110490300825664
F1 score (weighted): 0.8127556107250772
Class: happy
Accuracy: 0.9590643274853801

Class: not-relevant
Accuracy: 0.625

Class: angry
Accuracy: 0.1111111111111111

Class: disgust
Accuracy: 0.0

Class: sad
Accuracy: 0.0

Class: surprise
Accuracy: 0.4



In [27]:
!pip install imbalanced-learn



In [None]:

import os
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tqdm.notebook import tqdm
import pandas as pd

# Ensure output directory exists
os.makedirs('Models', exist_ok=True)

# Task 2: Data Preprocessing
df = pd.read_csv("smile-annotations-final.csv", names=['id', 'text', 'category'])
df.set_index('id', inplace=True)
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

label_dict = {label: i for i, label in enumerate(df.category.unique())}
df['label'] = df.category.replace(label_dict)

X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values
)

df['data_type'] = ['not_set'] * df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Task 4: Tokenization and Encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

# Convert tensors to numpy arrays for SMOTE
input_ids_train_np = input_ids_train.numpy()
attention_masks_train_np = attention_masks_train.numpy()
labels_train_np = labels_train.numpy()

# Combine input_ids and attention_masks into one array
X_train_combined = np.concatenate((input_ids_train_np, attention_masks_train_np), axis=1)

# Apply SMOTE to handle class imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=17)
X_train_resampled, labels_train_resampled = smote.fit_resample(X_train_combined, labels_train_np)

# Split the combined features back into input_ids and attention_masks
input_ids_train_res = torch.tensor(X_train_resampled[:, :256])
attention_masks_train_res = torch.tensor(X_train_resampled[:, 256:])
labels_train_res = torch.tensor(labels_train_resampled)

dataset_train = TensorDataset(input_ids_train_res, attention_masks_train_res, labels_train_res)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Task 5: Setting up BERT Pretrained Model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

# Task 6: Creating Data Loaders
batch_size = 32
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

# Task 7: Setting Up Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs
)

# Task 8: Performance Metrics
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {labels_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label]) / len(y_true)}\n')

# Task 9: Training Loop
def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    # Ensure predictions and true_vals are not empty
    if predictions and true_vals:
        predictions = np.concatenate(predictions, axis=0)
        true_vals = np.concatenate(true_vals, axis=0)
    else:
        predictions, true_vals = np.array([]), np.array([])

    return loss_val_avg, predictions, true_vals

def train(model, dataloader_train, dataloader_val, epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss_train = 0

        for batch in tqdm(dataloader_train, desc=f"Epoch {epoch}"):
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            total_loss_train += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Training loss for epoch {epoch}: {total_loss_train / len(dataloader_train)}")

        val_loss, predictions, true_vals = evaluate(dataloader_val)
        print(f"Validation loss for epoch {epoch}: {val_loss}")

        # Debugging print statements
        print(f"Predictions shape: {predictions.shape}")
        print(f"True values shape: {true_vals.shape}")

        if predictions.size > 0 and true_vals.size > 0:
            f1 = f1_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
            print(f"F1 score (weighted) for epoch {epoch}: {f1}")
        else:
            print("Predictions or true values are empty, skipping F1 score calculation.")

        torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')

# Task 10: Loading and Evaluating our Model
def load_and_evaluate(model_path, dataloader_val):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label_dict),
        output_attentions=False,
        output_hidden_states=False
    )

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    print(f"Validation loss: {val_loss}")

    if predictions.size > 0 and true_vals.size > 0:
        f1 = f1_score(true_vals, np.argmax(predictions, axis=1), average='weighted')
        print(f"F1 score (weighted): {f1}")
        accuracy_per_class(predictions, true_vals)
    else:
        print("Predictions or true values are empty, skipping evaluation metrics.")

# Training the model
train(model, dataloader_train, dataloader_val)

# Load and evaluate the model
load_and_evaluate('Models/BERT_ft_epoch1.model', dataloader_val)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/182 [00:00<?, ?it/s]

Training loss for epoch 1: 1.3602359858188


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 1: 0.759885949747903
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 1: 0.6953185953656175


Epoch 2:   0%|          | 0/182 [00:00<?, ?it/s]

Training loss for epoch 2: 0.9902129546626584


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 2: 0.6377212660653251
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 2: 0.7297956220739534


Epoch 3:   0%|          | 0/182 [00:00<?, ?it/s]

Training loss for epoch 3: 0.8467175554443191


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss for epoch 3: 0.5848140248230526
Predictions shape: (223, 6)
True values shape: (223,)
F1 score (weighted) for epoch 3: 0.7464374688589935


Epoch 4:   0%|          | 0/182 [00:00<?, ?it/s]