In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

# Patterns used by preprocess_text, compiled once.
# DOTALL lets a bracketed span that crosses a line break still match.
_BRACKETED_RE = re.compile(r'\[.*?\]', re.DOTALL)
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Return a lowercased, de-noised copy of *text*.

    Steps, in order:
      1. lowercase (done first so the URL pattern also catches ``HTTP://``
         and ``WWW.`` spellings);
      2. drop ``[...]`` spans;
      3. drop URLs (before digit-words, so a URL is removed in one piece);
      4. drop every word that contains a digit;
      5. collapse whitespace runs to a single space and trim the ends.

    Whitespace is normalised last because each removal above can leave a
    doubled or dangling space behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).

    Raises:
        AttributeError: If *text* is not a string (e.g. a NaN float from a
            missing CSV cell, which has no ``lower`` method).
    """
    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGIT_WORD_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Load data
data = pd.read_csv('/content/webtext.test.csv')

# Handle missing values BEFORE preprocessing: pandas represents an empty cell
# as a NaN float, and preprocess_text only works on strings, so cleaning first
# would raise on the very rows this step is meant to discard.
data.dropna(subset=['text', 'ended'], inplace=True)

# Preprocess the text data
data['text'] = data['text'].apply(preprocess_text)

# Split the data into train and validation sets (80/20, fixed seed so the
# split is reproducible between runs)
X_train, X_val, y_train, y_val = train_test_split(data['text'], data['ended'], test_size=0.2, random_state=42)

# Load the tokenizer and the classifier from the same pretrained checkpoint.
# num_labels=1 makes the classification head emit one logit per example.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

# Tokenize and encode the training and validation data
def tokenize_encode(texts, tokenizer, max_length=512):
    """Encode *texts* with *tokenizer* and return its result unchanged.

    Args:
        texts: The texts to encode, passed to the tokenizer as-is.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever *tokenizer* returns for the padding, truncation,
        ``max_length`` and ``return_tensors='pt'`` options set below.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

# Encode the training split first, then the validation split.
train_encodings, val_encodings = (
    tokenize_encode(split.tolist(), tokenizer) for split in (X_train, X_val)
)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(y_train.values),
)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    torch.tensor(y_val.values),
)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Fine-tuning the model: run on the GPU when one is available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the values the transformers class used by default, so the
# optimiser's hyper-parameters stay what they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# The model emits a single raw logit per example, so use the
# sigmoid + binary-cross-entropy loss that operates directly on logits.
criterion = torch.nn.BCEWithLogitsLoss()

# Training function
def train(model, train_loader, val_loader, optimizer, criterion, epochs=3):
    """Fine-tune *model* for *epochs* passes over *train_loader*.

    After every epoch the mean training loss is printed and ``evaluate`` is
    run on *val_loader*. Batches are moved to the module-level ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches used for the gradient updates.
        val_loader: Batches handed to ``evaluate`` after each epoch.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Number of passes over *train_loader*.
    """
    model.train()
    for epoch_idx in range(epochs):
        running_loss = 0
        for ids, mask, targets in train_loader:
            ids, mask, targets = ids.to(device), mask.to(device), targets.to(device)
            # The loss needs float targets with an explicit trailing
            # dimension: [batch_size, 1].
            targets = targets.float().unsqueeze(1)

            optimizer.zero_grad()
            batch_loss = criterion(model(ids, attention_mask=mask).logits, targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch_idx+1}, Training Loss: {mean_loss:.4f}')
        evaluate(model, val_loader, criterion)

# Evaluation function
def evaluate(model, val_loader, criterion):
    """Score *model* on *val_loader*; print and return loss and accuracy.

    The model is put in eval mode for the duration of the call and then
    returned to whichever mode (train or eval) it was in beforehand, so
    calling this on a model that is already in eval mode no longer switches
    it to training mode. Batches are moved to the module-level ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        val_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        A ``(avg_val_loss, accuracy)`` tuple: the mean per-batch loss and the
        accuracy of the predictions thresholded at probability 0.5.

    Raises:
        ZeroDivisionError: If *val_loader* yields no batches.
    """
    was_training = model.training
    model.eval()
    val_preds = []
    val_labels = []
    total_val_loss = 0
    try:
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                # The loss needs float targets shaped [batch_size, 1].
                labels = labels.float().unsqueeze(1)
                logits = model(input_ids, attention_mask=attention_mask).logits
                total_val_loss += criterion(logits, labels).item()
                # Flatten to plain 1-D lists so the metric receives ordinary
                # label vectors rather than column vectors.
                val_preds.extend(torch.sigmoid(logits).view(-1).cpu().tolist())
                val_labels.extend(labels.view(-1).cpu().tolist())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)
    avg_val_loss = total_val_loss / len(val_loader)
    val_preds = (np.array(val_preds) > 0.5).astype(int)
    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}')
    return avg_val_loss, accuracy

# Fine-tune the model (prints the training loss and validation metrics after each epoch)
train(model, train_loader, val_loader, optimizer, criterion)

# After training, score the validation set once more.
# This repeats the prediction loop of evaluate() but computes accuracy only, no loss.
model.eval()
val_preds = []
val_labels = []
with torch.no_grad():  # inference only, so skip gradient tracking
    for batch in val_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        labels = labels.float().unsqueeze(1)  # Float, shape [batch_size, 1] - same conversion as in evaluate(); no loss is computed here
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_preds.extend(torch.sigmoid(logits).cpu().numpy())  # sigmoid turns logits into probabilities
        val_labels.extend(labels.cpu().numpy())
val_preds = np.array(val_preds)
val_preds = (val_preds > 0.5).astype(int)  # probability above 0.5 -> class 1, otherwise class 0
accuracy = accuracy_score(val_labels, val_preds)
print(f'Final Validation Accuracy: {accuracy:.4f}')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.4291
Validation Loss: 0.3576, Validation Accuracy: 0.7910
Epoch 2, Training Loss: 0.3396
Validation Loss: 0.3220, Validation Accuracy: 0.8270
Epoch 3, Training Loss: 0.3123
Validation Loss: 0.3071, Validation Accuracy: 0.8360
Final Validation Accuracy: 0.8360
