In [None]:
#using libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load datasets
# The CSVs live on Google Drive, so the drive has to be mounted before they
# can be read. On a fresh kernel (Restart & Run All) nothing has mounted it
# yet when this cell runs, which is why the mount happens here.
from google.colab import drive
drive.mount('/content/drive')

# Single place to change if the competition folder moves.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/KAGGLE_llm-detection"
training_data = pd.read_csv(f"{DATA_DIR}/train_essays.csv")
testing_data = pd.read_csv(f"{DATA_DIR}/test_essays.csv")

# Initial data exploration
training_data.describe()

In [None]:
# Define a function for text cleaning
def preprocess_text(text, stop_words=None):
    """Lowercase `text`, strip non-word characters and drop stopwords.

    Args:
        text: The raw string to clean.
        stop_words: Optional container of lowercase words to remove. When
            omitted, the NLTK English stopword list is used.

    Returns:
        The surviving purely-alphabetic tokens joined by single spaces.
    """
    if stop_words is None:
        # Build the lookup set once per call instead of once per word: the
        # membership test below runs for every token of the essay.
        stop_words = set(stopwords.words('english'))
    # Every non-word character (punctuation, whitespace, ...) becomes a space,
    # then the whole string is lowercased.
    lowered = re.sub(r'\W', ' ', text).lower()
    # isalpha() also discards tokens containing digits or underscores, which
    # \W does not strip.
    kept_words = [
        word for word in lowered.split()
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(kept_words)

# Clean every training essay and keep the result in a new column
raw_essays = training_data['text']
training_data['processed_text'] = raw_essays.apply(preprocess_text)

In [None]:
# Hold out 20% of the essays for validation; the fixed seed keeps the split repeatable
X_train, X_validate, y_train, y_validate = train_test_split(
    training_data['processed_text'],
    training_data['generated'],
    test_size=0.2,
    random_state=42,
)

# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def encode_texts(tokenizer, texts):
    """Run `tokenizer` over `texts` with padding and truncation enabled.

    The tokenizer is asked for PyTorch tensors (`return_tensors='pt'`); its
    return value is passed back unchanged.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Convert each split to a plain list and tokenize it
train_texts = X_train.tolist()
train_encodings = encode_texts(bert_tokenizer, train_texts)
validate_texts = X_validate.tolist()
validate_encodings = encode_texts(bert_tokenizer, validate_texts)

In [1]:
# Mount Google Drive at /content/drive (Colab only). The CSV paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Wrap tokenizer output and labels as a PyTorch dataset
def create_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one `TensorDataset`.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask' tensors.
        labels: Object exposing `.values` (e.g. a pandas Series); it is
            converted with `torch.tensor`.

    Returns:
        A `TensorDataset` yielding `(input_ids, attention_mask, label)` triples.
    """
    token_ids = encodings['input_ids']
    attention = encodings['attention_mask']
    label_tensor = torch.tensor(labels.values)
    return TensorDataset(token_ids, attention, label_tensor)

# One dataset per split, pairing each encoding with its labels
train_dataset = create_dataset(encodings=train_encodings, labels=y_train)
validate_dataset = create_dataset(encodings=validate_encodings, labels=y_validate)

In [None]:
# Batch both datasets; only the training loader shuffles its samples
batch_size = 16
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
validate_dataloader = DataLoader(
    dataset=validate_dataset,
    batch_size=batch_size,
)

In [None]:
# Initialize BERT model with a 2-class classification head
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Optimizer configuration
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it. `eps` and `weight_decay` are set explicitly to the values the
# transformers class used by default (PyTorch's own defaults differ), so the
# optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    bert_model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Training function
def train_model(model, dataloader, optimizer, epochs=10):
    """Train `model` on `dataloader` and print the mean loss after each epoch.

    Args:
        model: Module called as `model(input_ids, attention_mask=..., labels=...)`
            whose result exposes a `.loss` tensor.
        dataloader: Iterable of `(input_ids, attention_mask, labels)` tensor batches.
        optimizer: Optimizer that updates `model`'s parameters.
        epochs: Number of full passes over `dataloader`.

    Returns:
        None. Progress is reported through `print`.
    """
    # Follow the model's own device instead of relying on a notebook-level
    # `device` variable, so the function works on its own.
    model_device = next(model.parameters()).device
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            inputs, masks, labels = (item.to(model_device) for item in batch)

            model.zero_grad()
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Avoid exploding gradients
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves: an empty dataloader then reports nan
        # instead of raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {mean_loss:.4f}")

In [None]:
# Fine-tune BERT on the training batches (default number of epochs)
train_model(model=bert_model, dataloader=train_dataloader, optimizer=optimizer)

In [None]:
def evaluate_model(model, dataloader):
  """Return the accuracy of `model` over every batch in `dataloader`.

  Args:
    model: Module called as `model(input_ids, attention_mask=...)` whose
      result exposes `.logits` with one row per sample.
    dataloader: Iterable of `(input_ids, attention_mask, labels)` tensor batches.

  Returns:
    The value of `accuracy_score(true_labels, predictions)` computed on the
    predictions collected from all batches.
  """
  model.eval()
  # Follow the model's own device instead of a notebook-level `device` variable.
  model_device = next(model.parameters()).device
  predictions, true_labels = [], []
  with torch.no_grad():
    for batch in dataloader:
      inputs, masks, labels = (item.to(model_device) for item in batch)
      outputs = model(inputs, attention_mask=masks)
      logits = outputs.logits
      # Accumulate inside the loop: collecting after it would score only the
      # final batch.
      predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
      true_labels.extend(labels.cpu().numpy())
  accuracy = accuracy_score(true_labels, predictions)
  return accuracy

In [None]:
# Score the trained model on the held-out validation batches
validation_accuracy = evaluate_model(model=bert_model, dataloader=validate_dataloader)
print(f"Validation Accuracy: {validation_accuracy:.4f}")

In [None]:
# Clean the test essays with the same preprocessing that produced the
# training text ('processed_text'); feeding raw text to a model trained on
# cleaned text would create a train/inference mismatch.
test_texts = testing_data['text'].apply(preprocess_text).tolist()
test_encodings = encode_texts(bert_tokenizer, test_texts)
# No labels here: the dataset only carries token ids and attention masks.
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
def predict_test_data(model, dataloader):
  """Return the predicted probability of class 1 for every sample in `dataloader`.

  Args:
    model: Module called as `model(input_ids, attention_mask=...)` whose
      result exposes `.logits` with one row per sample.
    dataloader: Iterable of `(input_ids, attention_mask)` tensor batches.

  Returns:
    A list with one softmax probability (column 1 of the logits) per sample,
    in dataloader order.
  """
  model.eval()
  # Follow the model's own device instead of a notebook-level `device` variable.
  model_device = next(model.parameters()).device
  test_predictions = []
  with torch.no_grad():
    for inputs, masks in dataloader:
      inputs, masks = inputs.to(model_device), masks.to(model_device)
      logits = model(inputs, attention_mask=masks).logits
      # Softmax over the class axis, then keep the class-1 column.
      class_one_probs = torch.softmax(logits, dim=1)[:, 1]
      test_predictions.extend(class_one_probs.cpu().numpy())
  return test_predictions

# Class-1 probabilities for every test essay
test_predictions = predict_test_data(model=bert_model, dataloader=test_dataloader)

In [None]:
# Pair each test essay id with its predicted probability
submission_columns = {
    'id': testing_data['id'],
    'generated': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file next to the competition data, without the index column
submission_path = '/content/drive/MyDrive/Colab Notebooks/KAGGLE_llm-detection/submission.csv'
submission_df.to_csv(submission_path, index=False)