https://medium.com/@somasunder/fine-tuning-bert-for-text-classification-a-step-by-step-guide-with-code-examples-0dea8513bcf2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification,RobertaTokenizer,RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score,recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#data = pd.read_excel("/content/drive/MyDrive/Experiment2/expanded_annotator_dataset.xlsx")

#data=pd.read_csv("/content/drive/MyDrive/Experiment2/sentiment_analysis.csv")
#file_path="/content/drive/MyDrive/Experiment2/EXIST2023_Task1_en.xlsx"
#file_path="/content/drive/MyDrive/Experiment2/Complete_agreement_records.xlsx"
#file_path="/content/drive/MyDrive/Experiment2/EXIST2023_Task1_age.xlsx"
#file_path="/content/drive/MyDrive/Experiment2/age3_agreement_records.xlsx"

# Workbook with the annotated tweets, given relative to the notebook's working directory.
file_path = "dataset/Complete_agreement_records.xlsx"

# Read the workbook into the DataFrame every later cell works on.
data = pd.read_excel(io=file_path)

# Encode labels into numeric format
#label_encoder = LabelEncoder()
#data["label_encoded"] = label_encoder.fit_transform(data["label_task1"])

# Split into training and validation sets
#train_texts, val_texts, train_labels, val_labels = train_test_split(data["tweet"].values, data["label_encoded"].values,test_size=0.2, random_state=42,)


In [3]:
# Rebuild the row index; drop=True discards the old index instead of keeping it as a column.
data = data.reset_index(drop=True)

In [4]:
#data = data.dropna(subset=["tweet", "majority_vote","female_vote","male_vote"])
#data = data.dropna(subset=["tweet", "majority_vote","age1_vote","age2_vote","age3_vote"])
# Keep only rows that have both a tweet and a majority label. The index is
# rebuilt afterwards because dropping rows would otherwise leave gaps in it,
# and label-based lookups would then no longer line up with row positions.
data = data.dropna(subset=["tweet", "majority_vote"]).reset_index(drop=True)

In [5]:
import re

def remove_urls_and_lower(text: str) -> str:
    """Lower-case ``text`` and remove every http(s) URL from it.

    Args:
        text: Raw text to clean.

    Returns:
        The lower-cased text with each ``http://...`` / ``https://...`` token
        (up to the next whitespace) replaced by an empty string, and with
        leading/trailing whitespace stripped. Whitespace that surrounded a
        removed URL in the middle of the text is left as it was.
    """
    # Lower-case first: matching the scheme on the raw text would let URLs
    # written as "HTTP://..." or "Https://..." slip through the filter.
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+', '', lowered)
    return without_urls.strip()

In [6]:
# Clean every tweet element-wise: URLs removed, text lower-cased and trimmed.
tweet_processed = data["tweet"].map(remove_urls_and_lower)

In [7]:
# Sanity check: how many cleaned tweets there are.
len(tweet_processed)

1090

In [8]:
# Sanity check: the number of labels should equal the number of cleaned tweets.
len(data["majority_vote"])

1090

In [9]:

# Hold out 20% of the cleaned tweets for validation; the fixed random_state
# keeps the partition the same from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweet_processed.values,
    data["majority_vote"].values,
    test_size=0.2,
    random_state=42,
)
#train_texts, val_texts, train_labels, val_labels = train_test_split(tweet_processed.values, data["female_vote"].values,test_size=0.2, random_state=42,)
#train_texts, val_texts, train_labels, val_labels = train_test_split(tweet_processed.values, data["male_vote"].values,test_size=0.2, random_state=42,)
#train_texts, val_texts, train_labels, val_labels = train_test_split(data["tweet"].values, data["age1_vote"].values,test_size=0.2, random_state=42,)
#train_texts, val_texts, train_labels, val_labels = train_test_split(data["tweet"].values, data["age2_vote"].values,test_size=0.2, random_state=42,)
#train_texts, val_texts, train_labels, val_labels = train_test_split(data["tweet"].values, data["age3_vote"].values,test_size=0.2, random_state=42,)

In [10]:
# Texts and labels must have the same length within each split.
print(f"{len(train_texts)} {len(train_labels)}")
print(f"{len(val_texts)} {len(val_labels)}")

872 872
218 218


Tokenizer and Dataset Class

In [11]:
# Tokenizer for the same "bert-base-uncased" checkpoint that the classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable applied to a single text. It is invoked with the
            keyword arguments ``add_special_tokens``, ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and must return
            a mapping containing ``"input_ids"`` and ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the ``idx``-th text.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (each with
            its first axis squeezed away) and ``"label"`` as a ``torch.long``
            tensor.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) removes the leading axis, which is expected to be the
        # size-1 batch axis added by return_tensors="pt".
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Tokenization and batching settings.
max_len = 128     # passed to the tokenizer as max_length (pad/truncate target)
batch_size = 16

# One dataset per split; each item is tokenized when it is accessed.
train_dataset = ClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=max_len
)
val_dataset = ClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=max_len
)

# Only the training batches are shuffled; validation keeps its original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

Model setup

In [13]:
# Model and Device Setup
# Seed PyTorch before anything random happens: the classifier head is newly
# initialised and the training DataLoader shuffles, so without a seed the
# reported metrics change from run to run even though the train/validation
# split is fixed. 42 matches the random_state used for the split.
# NOTE: some GPU kernels may still be non-deterministic; this makes runs
# repeatable as far as the random number generators are concerned.
torch.manual_seed(42)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = BertForSequenceClassification.from_pretrained( "bert-base-uncased",num_labels=len(label_encoder.classes_))
# Two output classes for the binary majority-vote label.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)
#model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)

model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training loop

In [14]:
# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()  # switch the model to training mode at the start of each epoch
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        # Move this batch's tensors to the device the model lives on.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        # Clear old gradients, run the forward pass with labels so that the
        # model output carries a loss, then backpropagate and update.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the epoch total and show the latest batch loss on the bar.
        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

    # Mean loss per batch over the epoch.
    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")


Epoch 0: 100%|██████████| 55/55 [01:59<00:00,  2.18s/it, loss=0.599]


Epoch 0 Loss: 0.5339977199381049


Epoch 1: 100%|██████████| 55/55 [02:05<00:00,  2.28s/it, loss=0.0577]


Epoch 1 Loss: 0.23253824724392458


Epoch 2: 100%|██████████| 55/55 [02:09<00:00,  2.35s/it, loss=0.0214] 

Epoch 2 Loss: 0.08195084397765723





Evaluation

In [15]:
# Evaluation
model.eval()
correct = 0
total = 0
all_predictions = []
all_labels = []

# Scoring needs no gradients, so autograd tracking is switched off.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        # Forward pass without labels; the predicted class is the arg-max
        # over the logits of each sample.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=1)

        # Running counts for accuracy.
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        # Collect every prediction and label for the sklearn metrics below.
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

# Macro-averaged F1, precision and recall over the whole validation set.
f1_macro = f1_score(all_labels, all_predictions, average='macro')
pre = precision_score(all_labels, all_predictions, average='macro')
recall = recall_score(all_labels, all_predictions, average='macro')

print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Precision (Macro): {pre:.4f}")
print(f"Recall (Macro): {recall:.4f}")

Validation Accuracy: 0.9450
F1 Score (Macro): 0.9359
Precision (Macro): 0.9519
Recall (Macro): 0.9237


The code below is meant to fix an error on GitHub.