In [1]:
import nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve


import torch
import torchtext
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizerFast, BertModel, AdamW

import time
from tqdm.notebook import tqdm


device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)



cuda:0


In [None]:
df = pd.read_csv('/kaggle/input/combined-dataset/data.csv', \
                            usecols=['text', 'generated'], )
size = 1000
df_0 = df[df['generated'] == 0].sample(n=size//2, random_state=42)
df_1 = df[df['generated'] == 1].sample(n=size//2, random_state=42)
df = pd.concat([df_0, df_1]).reset_index(drop=True)
# stratified_sample = df.groupby('generated').apply(lambda x: x.sample(frac=0.5, random_state=42))

# df = stratified_sample.sample(frac=1).reset_index(drop=True)[:1000]
# df['generated'].sum()
df

## Transformer

In [None]:

# initialguess for the archetecture is [823, 512, 256, 128, 1]
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim):
        super(MLP, self).__init__()
        self.fc1  = nn.Linear(input_dim, hidden_dim1)
        self.fc2  = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3  = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4  = nn.Linear(hidden_dim3, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim1)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim2)
        self.batchnorm3 = nn.BatchNorm1d(hidden_dim3)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.fc4(x)
        return x
            
        
class CombinedModel(nn.Module):
    def __init__(self, bert_model, mlp):
        super(CombinedModel, self).__init__()
        self.bert = bert_model
        self.mlp = mlp

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state[:, 0, :]
        logits = self.mlp(last_hidden_state)
        return logits
            
        
def train(model, train_loader, optimizer, loss_fn):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask) #, labels=labels)
        loss = loss_fn(outputs.to(device), labels.type(torch.LongTensor).to(device))
        loss.backward()
        optimizer.step()
        train_loss += loss
        train_correct += (torch.max(outputs.data, dim=1)[1] == labels).sum().item()
        train_total += len(labels)
        
    train_loss = train_loss / train_total
    train_accuracy = 100 * train_correct / train_total
    return train_loss, train_accuracy


def validation(model, val_loader, loss_fn):
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]
            outputs = model(input_ids, attention_mask=attention_mask) #, labels=labels)
            loss = loss_fn(outputs.to(device), labels.type(torch.LongTensor).to(device))
            val_loss += loss
            val_correct += (torch.max(outputs.data, dim=1)[1] == labels).sum().item()
            val_total += len(labels)
            
            y_pred.extend(torch.softmax(outputs.data, dim=1)[:, 1].cpu().numpy())  # Assuming class 1 probability
            y_true.extend(labels.cpu().numpy())

            val_correct += (torch.max(outputs.data, dim=1)[1] == labels).sum().item()
            val_total += len(labels)
    
    y_pred_label = [int(num >= 0.5) for num in y_pred]
    roc_auc = roc_auc_score(y_true, y_pred_label)
    precision = precision_score(y_true, y_pred_label)
    recall = recall_score(y_true, y_pred_label)
    f1 = f1_score(y_true, y_pred_label)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred) 
    val_loss = val_loss / val_total
    val_accuracy = 100 * val_correct / val_total
    return val_loss, val_accuracy, roc_auc, precision, recall, f1, fpr, tpr

def tokenize_and_encode(tokenizer, comments, labels, max_length=128): 
    input_ids = [] 
    attention_masks = [] 

    for comment in comments: 
        encoded_dict = tokenizer.encode_plus( 
            comment,  
            add_special_tokens=True, 
            max_length=max_length, 
            pad_to_max_length=True, 
            return_attention_mask=True, 
            return_tensors='pt'
        ) 

        input_ids.append(encoded_dict['input_ids']) 
        attention_masks.append(encoded_dict['attention_mask']) 

    input_ids = torch.cat(input_ids, dim=0) 
    attention_masks = torch.cat(attention_masks, dim=0) 

    labels = torch.tensor(labels, dtype=torch.float32) 
    
    return input_ids, attention_masks, labels 



In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)
bert_model = BertModel.from_pretrained('bert-base-uncased')
num_layers = len(bert_model.encoder.layer)  # Adjust based on your BERT model
for i in range(0, num_layers):
    if i >= num_layers - 0:
        for param in bert_model.encoder.layer[i].parameters():
            param.requires_grad = True
    else:
        for param in bert_model.encoder.layer[i].parameters():
            param.requires_grad = False



mlp = MLP(768, 512, 256, 128, 2)
model = CombinedModel(bert_model, mlp).to(device)

In [None]:

# size = 10000
df = pd.read_csv('/kaggle/input/combined-dataset/data.csv', usecols=['text', 'generated'], )
# df_0 = df[df['generated'] == 0].sample(n=size//2, random_state=42)
# df_1 = df[df['generated'] == 1].sample(n=size//2, random_state=42)
# df = pd.concat([df_0, df_1]).reset_index(drop=True)

train_texts, val_texts, train_labels, val_labels = train_test_split(df.text.tolist(), df.generated.tolist(), \
                                                  test_size=0.3, random_state=42, \
                                                  stratify=df.generated.tolist())

MAX_LEN = 400
BATCH_SIZE = 32
NUM_EPOCHS = 10


train_input_ids, train_attention_masks, train_labels = tokenize_and_encode( 
    tokenizer, 
    train_texts, 
    train_labels,
    max_length = MAX_LEN
) 

val_input_ids, val_attention_masks, val_labels = tokenize_and_encode( 
    tokenizer, 
    val_texts, 
    val_labels,
    max_length = MAX_LEN
) 


train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels) 
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels) 

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

for epoch in tqdm(range(NUM_EPOCHS)):
    Tr_loss, Tr_acc, Vl_loss, Vl_acc = [], [], [], []
    Vl_roc_auc, Vl_precision, Vl_recall, Vl_f1 = [], [], [], []
    Vl_fpr, Vl_tpr = [], []

    print(f"Epoch {epoch+1}")

    train_loss, train_acc = train(model, train_loader, optimizer, loss_fn)
    val_loss, val_acc, roc_auc, precision, recall, f1, fpr, tpr = validation(model, val_loader, loss_fn)
    
    Tr_loss.append(train_loss)
    Tr_acc.append(train_acc)
    Vl_loss.append(val_loss)
    Vl_acc.append(val_acc)
    Vl_roc_auc.append(roc_auc)
    Vl_precision.append(precision)
    Vl_recall.append(recall)
    Vl_f1.append(f1)
    Vl_tpr.append(tpr)
    Vl_fpr.append(fpr)

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    print(f"AOC ROC: {roc_auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ")
    print("\n")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 1
Train Loss: 0.0051, Train Acc: 95.5424, Val Loss: 0.0012, Val Acc: 97.9818
AOC ROC: 0.9816, Precision: 0.9588, Recall: 0.9891, F1: 0.9737, 


Epoch 2


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 2
Train Loss: 0.0015, Train Acc: 99.0671, Val Loss: 0.0007, Val Acc: 98.8396
AOC ROC: 0.9889, Precision: 0.9786, Recall: 0.9910, F1: 0.9848, 


Epoch 3


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 3
Train Loss: 0.0008, Train Acc: 99.5274, Val Loss: 0.0007, Val Acc: 98.6449
AOC ROC: 0.9884, Precision: 0.9689, Recall: 0.9962, F1: 0.9823, 


Epoch 4


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 4
Train Loss: 0.0005, Train Acc: 99.7251, Val Loss: 0.0007, Val Acc: 98.6882
AOC ROC: 0.9887, Precision: 0.9698, Recall: 0.9964, F1: 0.9829, 


Epoch 5


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 5
Train Loss: 0.0003, Train Acc: 99.8147, Val Loss: 0.0006, Val Acc: 98.8900
AOC ROC: 0.9904, Precision: 0.9745, Recall: 0.9968, F1: 0.9855, 


Epoch 6


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 6
Train Loss: 0.0002, Train Acc: 99.9073, Val Loss: 0.0004, Val Acc: 99.2792
AOC ROC: 0.9935, Precision: 0.9848, Recall: 0.9964, F1: 0.9905, 


Epoch 7


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 7
Train Loss: 0.0001, Train Acc: 99.9228, Val Loss: 0.0003, Val Acc: 99.5099
AOC ROC: 0.9951, Precision: 0.9922, Recall: 0.9949, F1: 0.9935, 


Epoch 8


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 8
Train Loss: 0.0001, Train Acc: 99.9351, Val Loss: 0.0004, Val Acc: 99.3729
AOC ROC: 0.9942, Precision: 0.9875, Recall: 0.9960, F1: 0.9917, 


Epoch 9


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 9
Train Loss: 0.0001, Train Acc: 99.9537, Val Loss: 0.0003, Val Acc: 99.4810
AOC ROC: 0.9946, Precision: 0.9924, Recall: 0.9939, F1: 0.9931, 


Epoch 10


  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch 10
Train Loss: 0.0001, Train Acc: 99.9629, Val Loss: 0.0004, Val Acc: 99.3225
AOC ROC: 0.9938, Precision: 0.9860, Recall: 0.9962, F1: 0.9911, 




In [8]:
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': Tr_loss,
    'train_acc': Tr_acc,
    'val_loss': Vl_loss,
    'val_acc': Vl_acc,
    'val_roc_auc': Vl_roc_auc,
    'val_precision': Vl_precision,
    'val_recall': Vl_recall,
    'val_f1': Vl_f1,
    'val_tpr': Vl_tpr,
    'val_fpr': Vl_fpr,

}
torch.save(checkpoint, 'checkpoint.pth')
