In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer

# Paths
csv_path = "C:/Users/Rifat/Music/dsv1text/transcriptions2.csv"
model_save_path = "C:/Users/Rifat/Music/Models/ver0o1"

# Load data
data = pd.read_csv(csv_path)
texts = data['text'].values
labels = data['label'].values

# Splitting the dataset into train, validation, and test sets
from sklearn.model_selection import train_test_split
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class SarcasmDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


train_dataset = SarcasmDataset(train_texts, train_labels, tokenizer)
val_dataset = SarcasmDataset(val_texts, val_labels, tokenizer)
test_dataset = SarcasmDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

print("Data preprocessing complete. Train, validation, and test sets created.")


Data preprocessing complete. Train, validation, and test sets created.


In [2]:
from transformers import BertForSequenceClassification, AdamW
import torch.nn.functional as F

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training and validation
epochs = 30
for epoch in range(epochs):
    model.train()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{epochs}")

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1} | Batch {batch_idx} | Loss: {loss.item():.4f}")

    # Validation
    model.eval()
    correct = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            
            logits = outputs.logits
            predictions = torch.argmax(F.softmax(logits, dim=1), dim=1)
            correct += (predictions == labels).sum().item()
    
    val_accuracy = correct / len(val_dataset)
    print(f"Validation Loss: {val_loss / len(val_loader):.4f} | Validation Accuracy: {val_accuracy:.4f}")

    # Save the model after each epoch
    model_file_name = f"BERT_epoch{epoch+1}_valacc{val_accuracy:.4f}_text.pth"
    model_save_path_epoch = os.path.join(model_save_path, model_file_name)
    torch.save(model.state_dict(), model_save_path_epoch)
    print(f"Model saved to {model_save_path_epoch}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/30
Epoch 1 | Batch 0 | Loss: 0.7638
Epoch 1 | Batch 10 | Loss: 0.5767
Epoch 1 | Batch 20 | Loss: 0.6517
Epoch 1 | Batch 30 | Loss: 0.4777
Epoch 1 | Batch 40 | Loss: 0.6736
Epoch 1 | Batch 50 | Loss: 0.7309
Epoch 1 | Batch 60 | Loss: 0.8461
Epoch 1 | Batch 70 | Loss: 0.5729
Epoch 1 | Batch 80 | Loss: 0.5990
Epoch 1 | Batch 90 | Loss: 0.7428
Validation Loss: 0.6597 | Validation Accuracy: 0.6145
Model saved to C:/Users/Rifat/Music/Models/ver0o1\BERT_epoch1_valacc0.6145_text.pth

Epoch 2/30
Epoch 2 | Batch 0 | Loss: 0.7467
Epoch 2 | Batch 10 | Loss: 0.6773
Epoch 2 | Batch 20 | Loss: 0.5767
Epoch 2 | Batch 30 | Loss: 0.6770
Epoch 2 | Batch 40 | Loss: 0.8822
Epoch 2 | Batch 50 | Loss: 0.8185
Epoch 2 | Batch 60 | Loss: 0.5822
Epoch 2 | Batch 70 | Loss: 0.7067
Epoch 2 | Batch 80 | Loss: 0.6599
Epoch 2 | Batch 90 | Loss: 0.6941
Validation Loss: 0.6345 | Validation Accuracy: 0.6747
Model saved to C:/Users/Rifat/Music/Models/ver0o1\BERT_epoch2_valacc0.6747_text.pth

Epoch 3/30
Epoch 3 | 