In [1]:
from transformers import RobertaModel, RobertaTokenizer
from torch import nn

# Load the tokenizer for the 'roberta-base' checkpoint. This module-level
# instance is the one handed to DataFrameDataset in the training cell.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# define the model
class RobertaWithClassifier(nn.Module):
    """RoBERTa encoder with a single-logit linear head on top.

    ``forward`` takes the encoder's ``last_hidden_state``, selects index 0
    along the second axis (the slot the original code calls the CLS output),
    and maps it through ``nn.Linear(hidden_size, 1)``. The returned values are
    raw scores; no sigmoid is applied here.
    """

    def __init__(self, model_name='roberta-base'):
        super().__init__()
        # The encoder is asked to expose every layer's hidden states, although
        # forward() below only reads ``last_hidden_state``.
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=True,
        )
        hidden_dim = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw score per example, with the trailing size-1 axis removed."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoded.last_hidden_state[:, 0, :]
        scores = self.classifier(first_position)
        return scores.squeeze(-1)



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class DataFrameDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: Source frame; one row per example.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and expected to return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded / truncated to.
        text_col: Column holding the text. A plain ``int`` is a position
            (default ``0``, the original behaviour); anything else is looked
            up as a column label.
        label_col: Column holding the label, resolved the same way
            (default position ``1``).
    """

    def __init__(self, dataframe, tokenizer, max_len=128, text_col=0, label_col=1):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Resolve both columns to positions once so __getitem__ can keep using
        # purely positional ``iloc`` access.
        self._text_pos = self._resolve_column(dataframe, text_col)
        self._label_pos = self._resolve_column(dataframe, label_col)

    @staticmethod
    def _resolve_column(dataframe, column):
        """Turn a column position (int) or column label into a position."""
        if isinstance(column, int):
            return column
        return dataframe.columns.get_loc(column)

    @staticmethod
    def _as_text(value):
        """Normalise a cell to ``str``; missing cells (None / float NaN) become ''."""
        if isinstance(value, str):
            return value
        # ``value != value`` is only true for NaN, which is how an empty CSV
        # cell shows up in an object column.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``labels`` tensor for row ``idx``."""
        text = self._as_text(self.dataframe.iloc[idx, self._text_pos])
        label = self.dataframe.iloc[idx, self._label_pos]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            # The tokenizer is called on a single text with return_tensors='pt';
            # squeeze(0) drops the leading axis so batching can add its own.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [3]:
def train_model(model, dataloader, optimizer, loss_fn, device, epochs=3):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Each batch is a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` entries, all moved to ``device`` before use. After every
    epoch the summed batch loss divided by ``len(dataloader)`` is printed.
    Nothing is returned; the model and optimizer are updated in place.
    """
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        batch_losses = []

        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # Clear stale gradients, then forward / backward / update.
            optimizer.zero_grad()
            step_loss = loss_fn(model(input_ids=ids, attention_mask=mask), targets)
            step_loss.backward()
            optimizer.step()

            batch_losses.append(step_loss.item())

        avg_loss = sum(batch_losses) / len(dataloader)
        print(f"Average Loss: {avg_loss:.4f}")
        
def evaluate_model(model, dataloader, device):
    """Compute binary accuracy of ``model`` over ``dataloader``.

    Each batch is a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` entries. Model outputs are treated as logits: a sigmoid is
    applied and values above 0.5 count as the positive class.

    Returns:
        Accuracy as a Python float, or NaN when the dataloader yields no
        samples (previously this case raised ``AttributeError``).
    """
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs)
            predictions = (probs > 0.5)

            # Accumulate as a plain int so the counter has one type whether or
            # not the loop body ever runs.
            correct_predictions += int(torch.sum(predictions == labels).item())
            total_predictions += labels.size(0)

    if total_predictions == 0:
        accuracy = float('nan')
    else:
        accuracy = correct_predictions / total_predictions
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [None]:
import pandas as pd
from Sampler import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load datasets
data1 = pd.read_csv('data/DGHD.csv')
data2 = pd.read_csv('data/MHS.csv')   # DGHD and MHS are mixed into the training set
fox_df = pd.read_csv('data/fox-comment.csv')  # fox-comment is used only as the test set

# Define the mix ratios from 100% DGHD to 50% DGHD in steps of 10%
ratios = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
total_count = 50000

# The test set does not depend on the mix ratio, so build its dataset and
# loader once instead of on every iteration.
test_dataset = DataFrameDataset(dataframe=fox_df, tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Test accuracy for each DGHD ratio, kept so results can be compared afterwards.
accuracy_by_ratio = {}

# Loop through each ratio
for ratio in ratios:
    # Calculate the number of samples from each dataset.
    # int((1 - ratio) * total_count) truncates float error, e.g.
    # (1 - 0.9) * 50000 == 4999.999999999999 -> 4999. Round the first count
    # and take the complement so the two always sum to total_count.
    num_data1_samples = round(ratio * total_count)
    num_data2_samples = total_count - num_data1_samples

    # Sample the datasets accordingly (balanced_fixedcount is presumably
    # provided by the star import from Sampler; verify).
    sampled_data1 = balanced_fixedcount(data1, num_data1_samples, 'label')
    sampled_data2 = balanced_fixedcount(data2, num_data2_samples, 'label')

    # Concatenate the sampled datasets to create the mixed dataset
    mixed_df = pd.concat([sampled_data1, sampled_data2], ignore_index=True)

    # Start every ratio from a freshly initialized model
    model = RobertaWithClassifier().to(device)

    # Set up the optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.BCEWithLogitsLoss()

    # Prepare the training dataset and dataloader
    train_dataset = DataFrameDataset(dataframe=mixed_df, tokenizer=tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

    # Print the current mixing settings
    print(f"Training with {ratio*100:.0f}% DGHD and {(1-ratio)*100:.0f}% MHS")

    # Train and evaluate the model
    train_model(model, train_loader, optimizer, loss_fn, device, epochs=3)
    accuracy_by_ratio[ratio] = evaluate_model(model, test_loader, device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with 100% DGHD and 0% MHS
Epoch 1/3
