In [None]:
# Print the packages installed in this runtime (and their versions) so the
# environment the notebook ran in is recorded alongside its results.
!pip list

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import pandas as pd

# Filesystem locations used throughout the notebook.
# Read-only directory holding the competition CSV files.
input_path = "/kaggle/input/llm-detect-ai-generated-text"
# Writable directory where the fine-tuned artifacts are stored.
output_path = "/kaggle/working/"
# Local directory containing the pretrained tokenizer and model weights.
model_path = "/kaggle/input/distilbert-sst-2/my_model"

In [None]:
# 0. Load the training CSVs from the competition input directory.
# NOTE(review): train_prompts_df is not referenced by the cells visible here.
essays_csv = f'{input_path}/train_essays.csv'
prompts_csv = f'{input_path}/train_prompts.csv'
train_essays_df = pd.read_csv(essays_csv)
train_prompts_df = pd.read_csv(prompts_csv)

In [None]:
# 1. Restore the sequence-classification model and its matching tokenizer.
# Both are read from the same local directory (`model_path`); the two loads
# are independent of each other.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# 2. Turn the raw essays into model inputs and the targets into a tensor.
# All essays are tokenized in one call with padding and truncation enabled and
# the result is returned as PyTorch tensors.
essay_texts = list(train_essays_df['text'])
tokenized_data = tokenizer(
    essay_texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# The 'generated' column is the classification target.
labels = train_essays_df['generated'].values
labels_tensor = torch.tensor(labels)

In [None]:
# 3. Hold out 10% of the examples for validation.
# The three arrays are split together so ids, masks and labels stay aligned;
# the fixed random_state makes the partition repeatable.
split_parts = train_test_split(
    tokenized_data['input_ids'],
    tokenized_data['attention_mask'],
    labels_tensor,
    random_state=42,
    test_size=0.1,
)
(
    train_input_ids, val_input_ids,
    train_attention_mask, val_attention_mask,
    train_labels, val_labels,
) = split_parts

In [None]:
# 4. Fine-tune the model with the prepared dataset: first define the Dataset wrapper the Trainer consumes
class EssaysDataset(Dataset):
    """Map-style dataset pairing tokenized essay features with their labels.

    Args:
        encodings: Mapping from feature name to an indexable container that
            holds one entry per example.
        labels: Indexable container of targets, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx``: every feature plus a 'labels' entry."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [None]:
# Wrap each split in an EssaysDataset so the Trainer can index it.
# Training split
train_encodings = dict(input_ids=train_input_ids, attention_mask=train_attention_mask)
train_dataset = EssaysDataset(train_encodings, train_labels)

# Validation split
val_encodings = dict(input_ids=val_input_ids, attention_mask=val_attention_mask)
val_dataset = EssaysDataset(val_encodings, val_labels)

In [None]:
import os
from transformers import Trainer, TrainingArguments

# Switch off Weights & Biases: once through the environment flag here and
# once through `report_to` in the training configuration below.
os.environ["WANDB_DISABLED"] = "true"

# Hyperparameters and output locations for the fine-tuning run.
training_config = dict(
    # Where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=3,
    # NOTE(review): confirm 500 warmup steps is below the total number of
    # optimisation steps for this dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)
training_args = TrainingArguments(**training_config)



In [None]:
# Initialize the Trainer: it fine-tunes `model` with `training_args` on the
# training split and uses the held-out validation split for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Run the fine-tuning loop. Weights & Biases reporting was already disabled
# when the training arguments were created, so no `!wandb disabled` call is
# needed here.
trainer.train()

In [None]:
# 5. Evaluate the model's performance on the evaluation dataset given to the
# Trainer (the validation split) and show the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# 6. Save the fine-tuned model and its tokenizer into one directory so both
# can be reloaded together later with `from_pretrained`.
# os.path.join is used because `output_path` already ends with '/', and plain
# string formatting produced a doubled separator ('/kaggle/working//my_model').
save_dir = os.path.join(output_path, 'my_model')
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# 7. Use the fine-tuned model to predict on the test data
from torch.utils.data import Dataset, DataLoader

# Assuming the same tokenizer and model from earlier are still in scope and have been trained

# Load the test data from the same competition directory as the training
# data, so the dataset location only has to be changed in one place.
test_essays_path = f'{input_path}/test_essays.csv'
test_essays_df = pd.read_csv(test_essays_path)

# Tokenize the test essays with the same padding/truncation settings that were
# used for the training essays.
test_encodings = tokenizer(list(test_essays_df['text']), padding=True, truncation=True, return_tensors="pt")

# Create a test dataset
class TestDataset(Dataset):
    """Map-style dataset over tokenized features that carries no labels.

    Args:
        encodings: Mapping from feature name to an indexable container with
            one entry per example; must contain an 'input_ids' key, which
            determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, measured on 'input_ids'."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return every feature of the example at ``idx`` as a dict."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        return sample

# Wrap the tokenized test essays so they can be fed to a DataLoader.
test_dataset = TestDataset(test_encodings)


In [None]:
# Create a DataLoader for the test dataset.
# shuffle=False keeps the batches in the same order as the rows of
# test_essays_df, which the submission relies on when pairing ids with
# predictions.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Predict: score every test batch and collect the probability of class 1.
model.eval()  # evaluation mode for inference
predictions = []
with torch.no_grad():  # gradients are not needed for inference
    for batch in test_loader:
        # Inputs are moved to whichever device the model lives on.
        outputs = model(
            batch['input_ids'].to(model.device),
            attention_mask=batch['attention_mask'].to(model.device),
        )
        # Normalise the logits over the class axis, then keep column 1
        # (the "generated" class).
        probabilities = outputs.logits.softmax(dim=-1)
        predictions += probabilities[:, 1].tolist()


In [None]:
# Assemble the submission table: one row per test essay, holding its id and
# the predicted probability for the "generated" class.
submission_columns = {'id': test_essays_df['id'], 'generated': predictions}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Show the submission table as a quick sanity check before writing it out.
print(submission_df)

In [None]:
# Save the predictions to a CSV file without the DataFrame index column.
submission_path = 'submission.csv'  # relative path: written to the current working directory
submission_df.to_csv(submission_path, index=False)