# LLM Competition Submission

This notebook is my submission to the [LLM - Detect AI Generated Text](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/overview) competition on Kaggle. Because it is a code competition, the notebook was written and run on Kaggle itself, so there may be some inconsistencies (for example, in file paths) with the other code files and notebooks.

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from datasets import Dataset, DatasetDict
import os
# Switch off the Weights & Biases integration through its environment flag
os.environ["WANDB_DISABLED"] = "true"

# List every file under the Kaggle input directory. The joined path was
# previously computed and thrown away, which made this walk a no-op; printing
# it makes the available datasets/checkpoints visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Register tqdm's pandas integration (`progress_apply` is used further below)
tqdm.pandas()

# Run on the first CUDA device when one is available, otherwise on the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# # Getting the data
# training_data = pd.read_csv('../input/prepared-data-llm-competition/prepared_training_set.csv')
# training_data.head()

In [None]:
# # Renaming the columns
# training_data.rename(columns={'essay':'text','LLM_written':'labels'},inplace=True)

# # Changing labels to float
# training_data['labels'] = training_data['labels'].astype('float')

In [None]:
# # Putting training data into a Dataset for Hugging Face
# training = Dataset.from_pandas(training_data[['text','labels']])

In [None]:
# # Getting the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

In [None]:
# # Function for tokenizing
# def tokenize_function(example):
#     return tokenizer(example['text'],padding='max_length',truncation=True,max_length=512)

In [None]:
# # Tokenizing
# tokenized_data = training.map(tokenize_function,batched=True,batch_size=128)

In [None]:
# Load the base RoBERTa checkpoint with a single-output classification head
# (num_labels=1) and move it onto the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    '../input/roberta-base',
    num_labels=1,
)
model.to(device)

In [None]:
# training_args = TrainingArguments(output_dir='test_trainer',evaluation_strategy='no',learning_rate=2e-5,weight_decay=0,num_train_epochs=1)

In [None]:
# trainer = Trainer(model=model,args=training_args,train_dataset=tokenized_data,compute_metrics=roc_auc)

In [None]:
# # Fine tuning
# trainer.train()

In [None]:
# # Saving the model
# model.save_pretrained("fine-tuned-roberta")

In [None]:
# # Zipping the model
# !zip -r roberta.zip /kaggle/working/fine-tuned-roberta

In [None]:
# # Defining a function for inference
# def inference(essay:str) -> float:
#   # Tokenizing the input essay
#   inputs = tokenizer(essay,padding='max_length',truncation=True,max_length=512,return_tensors='pt').to(device)

#   # Getting the logits
#   with torch.no_grad():
#     logits = model(**inputs).logits
#     probability = nn.functional.sigmoid(logits)
#   return probability

In [None]:
# # Running the examples through the model
# train_predictions = training_data['text'].progress_apply(inference)

In [None]:
# # Making predictions on training data and evaluating
# print('ROC AUC on Training Set:')
# roc_auc_score(training_data['labels'],train_predictions)

## Submission

In [None]:
# Load the classifier saved under `model_path` together with the tokenizer.
# Note that the tokenizer is read from the base roberta-base directory,
# not from `model_path`.
model_path = '../input/fine-tuned-roberta-for-llm-detection/kaggle/working/fine-tuned-roberta'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')
model.to(device)

In [None]:
# Inference helper used to score one essay at a time
def inference(essay: str) -> float:
    """Return the sigmoid of the model's logit for a single essay.

    The essay is tokenized with padding/truncation to exactly 512 tokens,
    moved to ``device`` and passed through the module-level ``model`` with
    gradient tracking disabled.

    Args:
        essay: Raw essay text to score.

    Returns:
        The sigmoid-activated logit as a Python float (between 0 and 1).
    """
    encoded = tokenizer(
        essay,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # No gradients are needed when only predicting
    with torch.no_grad():
        score = torch.sigmoid(model(**encoded).logits)

    # .item() only works on a one-element tensor, i.e. one essay and one logit
    return score.item()

In [None]:
# Read the competition's test essays and preview the first rows
testing_data = pd.read_csv(
    '../input/llm-detect-ai-generated-text/test_essays.csv',
)
testing_data.head()

In [None]:
# Score every essay in the 'text' column one at a time;
# progress_apply behaves like apply but reports progress through tqdm
test_predictions = testing_data.loc[:, 'text'].progress_apply(inference)

In [None]:
# Pair each essay id with its predicted score in a two-column frame
# (columns in order: 'id', 'generated'; rows aligned on the shared index)
submission = pd.DataFrame({
    'id': testing_data['id'],
    'generated': test_predictions,
})

In [None]:
# Write the submission table to submission.csv without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Show the submission table as the cell output for a final visual check
submission