In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification
from transformers import BertModel, DistilBertModel, RobertaModel
from transformers import BertTokenizer, DistilBertTokenizer, RobertaTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
import pandas as pd
import random
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Reproducibility / cross-validation ---
# seed_val seeds random, numpy and torch at the start of every fold and is also
# the random_state of the StratifiedKFold splitter.
seed_val = 42
k_folds = 10

# --- Model / tokenisation ---
num_classes = 2    # width of the final classification layer
max_length = 512   # sequences are padded / truncated to this many tokens

# --- Optimisation ---
num_epochs = 20
batch_size = 8

In [None]:
# Load the corpus into a DataFrame. 'Dataset path' is a placeholder and has to be
# replaced with the location of the CSV file before this cell can run.
# TextClassificationDataset reads the columns 'text', 'deceptive' and
# 'sentences_emotion_label' from this frame.
data = pd.read_csv('''Dataset path''')

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenises one text per item on access.

    Args:
        data: frame providing the columns 'text', 'deceptive' and
            'sentences_emotion_label'.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every sequence is padded / truncated to.

    Each item is a dict with the flattened 'input_ids' and 'attention_mask'
    tensors plus a scalar 'label' tensor (1 when the 'deceptive' column equals
    the string "deceptive", otherwise 0).
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = data['text'].tolist()
        # Binary target: exact match on the string "deceptive" -> 1, anything else -> 0.
        self.labels = [int(flag == "deceptive") for flag in data['deceptive'].tolist()]
        # Stored but not used by __getitem__.
        self.emotion = data['sentences_emotion_label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens = True,
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )

        # return_tensors='pt' yields tensors with a leading batch axis; flatten it away.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(self.labels[idx])
        return item

In [None]:
class classifier(nn.Module):
    """Pretrained encoder followed by a two-layer classification head.

    The encoder *class* is taken from the notebook-level name ``sModel``, which
    must be bound (e.g. to ``DistilBertModel``) before this class is instantiated.

    Args:
        bert_model_name: checkpoint name or path passed to ``sModel.from_pretrained``.
        num_classes: number of output logits.
        dropout_val: dropout probability applied before the final linear layer.
    """

    def __init__(self, bert_model_name, num_classes, dropout_val):
        super().__init__()
        # Use the checkpoint named by the argument. Previously the parameter was
        # ignored and the notebook-global `model_name` was read instead.
        self.fModel = sModel.from_pretrained(bert_model_name)
        # Take the encoder width from its config when available; fall back to the
        # former hard-coded 768 otherwise.
        hidden_size = getattr(self.fModel.config, 'hidden_size', 768)
        self.dropout = nn.Dropout(dropout_val)
        self.preclassifier = nn.Linear(hidden_size, 192)
        self.classifier = nn.Linear(192, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits produced from the first-position hidden state."""
        outputs = self.fModel(input_ids = input_ids, attention_mask = attention_mask)
        # outputs[0] is indexed as [batch, position, ...]; keep position 0 only.
        pooled = outputs[0][:, 0]
        pooled = torch.relu(self.preclassifier(pooled))
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is a mapping with 'input_ids', 'attention_mask' and 'label'
    tensors. For every batch the cross-entropy loss is back-propagated, the
    gradient norm is clipped to 1.0, and both optimizer and scheduler are stepped.

    Returns:
        (mean batch loss, predicted class ids, true class ids) -- the two lists
        are in the order the batches were consumed.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    batch_losses, predicted, targets = [], [], []

    for batch in data_loader:
        optimizer.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids = ids, attention_mask = mask)
        loss = criterion(logits, labels)

        # Record loss and predictions before the weights are updated.
        batch_losses.append(loss.item())
        predicted += torch.max(logits, dim = 1)[1].cpu().tolist()
        targets += labels.cpu().tolist()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()

    return np.mean(batch_losses), predicted, targets

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Each batch is a mapping with 'input_ids', 'attention_mask' and 'label'
    tensors. The model is switched to eval mode and gradients are disabled.

    Returns:
        (mean batch cross-entropy loss, predicted class ids, true class ids).
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    batch_losses, predicted, targets = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids = ids, attention_mask = mask)
            batch_losses.append(criterion(logits, labels).item())

            # Index of the largest logit per row is the predicted class.
            predicted += torch.max(logits, dim = 1)[1].cpu().tolist()
            targets += labels.cpu().tolist()

    return np.mean(batch_losses), predicted, targets

In [None]:
# Search space for the experiment.
# sModels, model_list and tokenizer_list are parallel lists: index i selects the
# encoder class, the checkpoint name and the tokenizer class of one candidate.
# The commented-out lines hold the wider grid; the active lines restrict the run
# to DistilBERT only.
# sModels = [BertModel, RobertaModel, DistilBertModel]
sModels = [DistilBertModel]
# model_list = ['bert-base-uncased', 'roberta-base', 'distilbert-base-uncased']
model_list = ['distilbert-base-uncased']
# tokenizer_list = [BertTokenizer, RobertaTokenizer, DistilBertTokenizer]
tokenizer_list = [DistilBertTokenizer]
# dropout_list = [0.2, 0.3, 0.4]
dropout_list = [0.3]
# learning_rate_list = [1e-5, 3e-5, 5e-5, 7e-5, 9e-5]
learning_rate_list = [1e-5, 3e-5, 5e-5]


# Result accumulators filled by the grid-search loop:
#   training_stats      - one record per (configuration, fold, epoch)
#   best_testing_stats  - one record per configuration, fold-averaged best-epoch metrics
#   last_testing_stats  - one record per configuration, fold-averaged last-epoch metrics
training_stats = []
best_testing_stats = []
last_testing_stats = []

# Stratified splitter; with a fixed random_state every configuration is evaluated
# on the same folds.
kf = StratifiedKFold(n_splits = k_folds, shuffle = True, random_state = seed_val)

# Grid search: for every (checkpoint, dropout, learning rate) combination run a
# stratified k-fold cross-validation, log per-epoch statistics to training_stats
# and store the fold-averaged last-epoch / best-epoch test metrics.

# Loop-invariant setup, done once before any model is moved to the GPU.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Column names of the test metrics, in the order they appear in every record.
metric_names = ['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F-Score', 'Test Roc Auc']

for i in range(len(model_list)):
    # classifier.__init__ looks up `sModel` as a notebook-level name, so it has
    # to be bound here before any classifier is built.
    sModel = sModels[i]
    model_name = model_list[i]
    tokenizer_name = tokenizer_list[i]

    # Tokenizer and dataset wrapper depend only on the checkpoint, not on
    # dropout / learning rate, so build them once per checkpoint.
    tokenizer = tokenizer_name.from_pretrained(model_name, do_lower_case = True)
    dataset = TextClassificationDataset(data, tokenizer, max_length)

    for dropout in dropout_list:
        for learning_rate in learning_rate_list:
            # Running sums over folds; divided by k_folds when reported.
            last_epoch_sums = dict.fromkeys(metric_names, 0.0)
            best_epoch_sums = dict.fromkeys(metric_names, 0.0)

            for fold, (train_idx, test_idx) in enumerate(kf.split(data['text'], data['deceptive'])):
                # Re-seed at the start of every fold so each fold starts from the same RNG state.
                random.seed(seed_val)
                np.random.seed(seed_val)
                torch.manual_seed(seed_val)
                torch.cuda.manual_seed_all(seed_val)
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False

                train_dataset = Subset(dataset, train_idx)
                val_dataset = Subset(dataset, test_idx)
                train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
                # Evaluation does not need shuffling; a shuffled loader would also
                # draw from the global RNG between training epochs.
                test_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

                model = classifier(model_name, num_classes, dropout).to(device)
                optimizer = AdamW(model.parameters(), lr = learning_rate)
                total_steps = len(train_dataloader) * num_epochs
                scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

                # NOTE(review): every metric's maximum is tracked independently and is
                # selected on the test fold itself, so the "best" numbers are optimistic
                # and may come from different epochs.
                best_metrics = dict.fromkeys(metric_names, 0.0)
                epoch_metrics = None

                for epoch in range(num_epochs):
                    training_loss, training_predictions, training_actual_labels = train(model, train_dataloader, optimizer, scheduler, device)
                    training_accuracy_score = accuracy_score(training_actual_labels, training_predictions)

                    test_loss, test_predictions, test_actual_labels = evaluate(model, test_dataloader, device)
                    test_accuracy_score = accuracy_score(test_actual_labels, test_predictions)
                    epoch_metrics = {
                        'Test Accuracy': test_accuracy_score,
                        # Precision / recall use sklearn's default averaging, F-score is macro-averaged.
                        'Test Precision': precision_score(test_actual_labels, test_predictions),
                        'Test Recall': recall_score(test_actual_labels, test_predictions),
                        'Test F-Score': f1_score(test_actual_labels, test_predictions, average = 'macro'),
                        # NOTE(review): computed from hard class predictions, not from scores / probabilities.
                        'Test Roc Auc': roc_auc_score(test_actual_labels, test_predictions),
                    }

                    print(f"Model: {model_name}, Dropout: {dropout}, Learning rate: {learning_rate}, Fold: {fold + 1}, Epoch: {epoch + 1}, Training Accuracy: {training_accuracy_score:.4f}, Test Accuracy: {test_accuracy_score:.4f}")

                    training_stats.append({'Model': model_name,
                                           'Dropout': dropout,
                                           'Learning Rate': learning_rate,
                                           'Fold': fold + 1,
                                           'Epoch': epoch + 1,
                                           'Training Loss': training_loss,
                                           'Test Loss': test_loss,
                                           'Training Accuracy': training_accuracy_score,
                                           **epoch_metrics,
                                           })

                    for name, value in epoch_metrics.items():
                        if value > best_metrics[name]:
                            best_metrics[name] = value

                # After the final epoch: add this fold's last-epoch and best-epoch metrics.
                if epoch_metrics is not None:
                    for name in metric_names:
                        last_epoch_sums[name] += epoch_metrics[name]
                        best_epoch_sums[name] += best_metrics[name]

                # Release this fold's model and optimizer state before the next fold
                # allocates a fresh model.
                del model, optimizer, scheduler
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

            # Report fold averages (the sums were previously printed undivided).
            print(f"Best Test Accuracy: {best_epoch_sums['Test Accuracy'] / k_folds:.4f}, Last Test Accuracy: {last_epoch_sums['Test Accuracy'] / k_folds:.4f}")

            configuration = {'Model': model_name,
                             'Dropout': dropout,
                             'Learning Rate': learning_rate,
                             }
            last_testing_stats.append({**configuration,
                                       **{name: last_epoch_sums[name] / k_folds for name in metric_names},
                                       })
            best_testing_stats.append({**configuration,
                                       **{name: best_epoch_sums[name] / k_folds for name in metric_names},
                                       })

In [None]:
# Turn the three lists of result records into DataFrames (one row per record).
train_stats, best_test_stats, last_test_stats = (
    pd.DataFrame(data = records)
    for records in (training_stats, best_testing_stats, last_testing_stats)
)

In [None]:
# Write each result table to its CSV file, without the index and with floats
# formatted to five decimals.
for frame, csv_name in (
    (train_stats, 'exp3_train_6.csv'),
    (best_test_stats, 'exp3_test_best_6.csv'),
    (last_test_stats, 'exp3_test_last_6.csv'),
):
    frame.to_csv(csv_name, index = False, float_format='%.5f')