In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification
from transformers import BertModel, DistilBertModel, RobertaModel
from transformers import BertTokenizer, DistilBertTokenizer, RobertaTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
import pandas as pd
import random
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Experiment configuration shared by the cells in this file.
# Seed passed to random / numpy / torch and to StratifiedKFold's random_state.
seed_val = 42
# Number of stratified cross-validation splits.
k_folds = 10
# Passed as num_labels when the classification models are instantiated.
num_classes = 2
# Token length every text is padded / truncated to by the tokenizer.
max_length = 512
# Batch size used for both the training and the evaluation DataLoader.
batch_size = 8
# Epochs trained per fold; also used to size the linear LR schedule.
num_epochs = 20

In [None]:
# Load the labelled corpus into a DataFrame.
# NOTE(review): 'Dataset path' appears to be a placeholder - replace it with the real CSV location.
# Code in this file reads the 'text' and 'deceptive' columns of this frame.
data = pd.read_csv('''Dataset path''')

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes texts on access and pairs them with binary labels.

    The ``text`` column of ``data`` supplies the inputs. A row is labelled 1 when
    its ``deceptive`` column equals the string "deceptive" and 0 otherwise.
    """

    def __init__(self, data, tokenizer, max_length):
        """Keep the source table and tokenizer settings; extract texts and labels as lists."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = data['text'].tolist()
        self.labels = [int(value == "deceptive") for value in data['deceptive'].tolist()]

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors together with the label.

        Returns a dict with the flattened ``input_ids`` and ``attention_mask``
        produced by the tokenizer (padded / truncated to ``max_length``) and a
        scalar ``label`` tensor.
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            max_length=self.max_length,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch dimension; flatten it away.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict with ``input_ids``, ``attention_mask`` and ``label``.
    For every batch the cross-entropy loss on the first model output is
    back-propagated, gradients are clipped to a norm of 1.0, and both the
    optimizer and the scheduler are stepped.

    Returns:
        (mean batch loss, list of predicted class ids, list of true labels)
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    batch_losses, predicted, expected = [], [], []

    for batch in data_loader:
        optimizer.zero_grad()

        targets = batch['label'].to(device)
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )[0]

        loss = criterion(logits, targets)
        batch_losses.append(loss.item())

        # Predicted class = index of the largest logit per row.
        predicted += torch.max(logits, dim=1).indices.cpu().tolist()
        expected += targets.cpu().tolist()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return np.mean(batch_losses), predicted, expected

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Each batch is a dict with ``input_ids``, ``attention_mask`` and ``label``.
    The model is put in eval mode and gradients are disabled for the whole pass.

    Returns:
        (mean batch loss, list of predicted class ids, list of true labels)
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )[0]

            batch_losses.append(criterion(logits, targets).item())

            # Predicted class = index of the largest logit per row.
            predicted += torch.max(logits, dim=1).indices.cpu().tolist()
            expected += targets.cpu().tolist()

    return np.mean(batch_losses), predicted, expected

In [None]:
# Grid experiment: every backbone x learning rate is trained from the pretrained
# checkpoint on each stratified fold, and per-epoch metrics are appended to
# `training_stats` (one dict per model / learning rate / fold / epoch).

# Parallel lists: model class, checkpoint name and tokenizer class for each backbone.
sModels = [BertForSequenceClassification, RobertaForSequenceClassification, DistilBertForSequenceClassification]
model_list = ['bert-base-uncased', 'roberta-base', 'distilbert-base-uncased']
tokenizer_list = [BertTokenizer, RobertaTokenizer, DistilBertTokenizer]
learning_rate_list = [1e-5, 3e-5, 5e-5]

training_stats = []
kf = StratifiedKFold(n_splits = k_folds, shuffle = True, random_state = seed_val)

# Run-invariant settings are applied once instead of once per fold.
# The allocator option is set up front so it is in place before any CUDA memory is allocated.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholders so the previous run's objects can be released at the start of each fold.
model = optimizer = scheduler = None

for sModel, model_name, tokenizer_name in zip(sModels, model_list, tokenizer_list):
    # Tokenizer and dataset depend only on the backbone, so build them once per
    # backbone rather than once per learning rate.
    tokenizer = tokenizer_name.from_pretrained(model_name, do_lower_case = True)
    dataset = TextClassificationDataset(data, tokenizer, max_length)

    for learning_rate in learning_rate_list:
        for fold, (train_idx, test_idx) in enumerate(kf.split(data['text'], data['deceptive'])):
            # Re-seed so every fold starts from the same RNG state.
            random.seed(seed_val)
            np.random.seed(seed_val)
            torch.manual_seed(seed_val)
            torch.cuda.manual_seed_all(seed_val)

            # Drop the previous fold's model / optimizer / scheduler before allocating
            # new ones, otherwise two full models are resident on the device at once.
            model = optimizer = scheduler = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            train_dataset = Subset(dataset, train_idx)
            val_dataset = Subset(dataset, test_idx)
            train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
            # Evaluation does not need shuffling; a fixed order keeps the batch
            # composition (and therefore the averaged test loss) reproducible.
            test_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

            model = sModel.from_pretrained(model_name, num_labels = num_classes, output_attentions = False,
                                           output_hidden_states = False).to(device)
            # NOTE(review): AdamW here is the transformers implementation imported at the top
            # of the file; transformers has deprecated it in favour of torch.optim.AdamW.
            optimizer = AdamW(model.parameters(), lr = learning_rate)
            # The scheduler is stepped once per batch, so it spans all batches of all epochs.
            total_steps = len(train_dataloader) * num_epochs
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

            for epoch in range(num_epochs):
                training_loss, training_predictions, training_actual_labels = train(model, train_dataloader, optimizer, scheduler, device)
                training_accuracy_score = accuracy_score(training_actual_labels, training_predictions)

                test_loss, test_predictions, test_actual_labels = evaluate(model, test_dataloader, device)
                test_accuracy_score = accuracy_score(test_actual_labels, test_predictions)
                test_precision_score = precision_score(test_actual_labels, test_predictions)
                test_recall_score = recall_score(test_actual_labels, test_predictions)
                # NOTE(review): precision / recall use sklearn's default averaging while F1 is
                # macro-averaged - confirm this mix is intended.
                test_f1_score = f1_score(test_actual_labels, test_predictions, average = 'macro')
                # NOTE(review): AUC is computed from hard class predictions because evaluate()
                # returns labels only; a score-based AUC would need class probabilities.
                test_roc_auc = roc_auc_score(test_actual_labels, test_predictions)

                print(f"Model: {model_name}, Learning rate: {learning_rate}, Fold: {fold + 1}, Epoch: {epoch + 1}, Training Accuracy: {training_accuracy_score:.4f}, Test Accuracy: {test_accuracy_score:.4f}")

                training_stats.append({'Model': model_name,
                                       'Learning Rate': learning_rate,
                                       'Fold': fold + 1,
                                       'Epoch': epoch + 1,
                                       'Training Loss': training_loss,
                                       'Test Loss': test_loss,
                                       'Training Accuracy': training_accuracy_score,
                                       'Test Accuracy': test_accuracy_score,
                                       'Test Precision': test_precision_score,
                                       'Test Recall': test_recall_score,
                                       'Test F-Score': test_f1_score,
                                       'Test Roc Auc': test_roc_auc,
                                       })