<a href="https://www.kaggle.com/code/blohorn/llm-detect-generated-text?scriptVersionId=157881296" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/config.json
/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/tokenizer_config.json
/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/pytorch_model.bin
/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/model.safetensors
/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/special_tokens_map.json
/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/vocab.txt
/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/tokenizer-and-models/roberta-base/config.json
/kaggle/input/tokenizer-and-models/roberta-base/merges.txt
/kaggle/input/tokenizer-and-models/roberta-base/tokenizer.json
/kaggle/input/tokenizer-and-models/roberta-base/vocab.json
/kaggle/input/tokenizer-and-models/rob

In [2]:
#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
#!pip install evaluate -q

In [4]:
import pandas as pd 
import numpy as np
import torch
from transformers import (
    AdamW,
    DistilBertTokenizer,
    DistilBertModel,
    DistilBertForSequenceClassification,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments,
)
from sklearn.metrics import roc_auc_score
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict


class Hf():
    """Helpers for fine-tuning a DistilBERT sequence classifier and scoring text.

    The tokenizer and model loaded by ``distilbert_base_uncased`` are remembered
    on the instance so the other methods do not depend on notebook-level
    globals. For backward compatibility, methods still fall back to
    module-level ``tokenizer`` / ``model`` variables when nothing has been
    loaded on (or assigned to) the instance.
    """

    def __init__(self):
        # Filled in by distilbert_base_uncased(); None until then.
        self.tokenizer = None
        self.model = None

    def _get_tokenizer(self):
        """Return the instance tokenizer, else the legacy module-level ``tokenizer``."""
        own = getattr(self, "tokenizer", None)
        if own is not None:
            return own
        try:
            return tokenizer  # legacy path: notebook-level global
        except NameError:
            raise RuntimeError(
                "No tokenizer available: call distilbert_base_uncased() first."
            ) from None

    def _get_model(self):
        """Return the instance model, else the legacy module-level ``model``."""
        own = getattr(self, "model", None)
        if own is not None:
            return own
        try:
            return model  # legacy path: notebook-level global
        except NameError:
            raise RuntimeError(
                "No model available: call distilbert_base_uncased() first."
            ) from None

    def split_train_test(self, data, test_size=.2, random_state=42):
        """Split ``data`` into a training part and a held-out part.

        Parameters
        ----------
        data : array-like or DataFrame accepted by ``train_test_split``.
        test_size : fraction (or count) of rows placed in the held-out part.
        random_state : seed for the shuffle; a fixed default makes the split
            reproducible across kernel restarts. Pass ``None`` for a fresh
            random split on every call.

        Returns
        -------
        (traindf, testdf)
        """
        traindf, testdf = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        return traindf, testdf

    def convert_to_dict_dataset(self, traindf, testdf):
        """Bundle the two splits into a ``DatasetDict`` ('train' / 'validation').

        pandas DataFrames are converted to ``Dataset`` objects first (index
        dropped, matching ``pipeline``); anything else is stored unchanged.
        """
        def as_dataset(split):
            if isinstance(split, pd.DataFrame):
                return Dataset.from_pandas(split, preserve_index=False)
            return split

        return DatasetDict({
            'train': as_dataset(traindf),
            'validation': as_dataset(testdf),
        })

    def distilbert_base_uncased(self, model):
        """Load the DistilBERT tokenizer and a 2-label classification model.

        ``model`` is the checkpoint name or local directory passed to
        ``from_pretrained``. The loaded objects are stored on the instance and
        also returned as ``(tokenizer, model)``.
        """
        checkpoint = model
        self.tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            checkpoint, num_labels=2
        )
        return self.tokenizer, self.model

    def save_model_tokenizer(self, model_path):
        """Save the current tokenizer and model under ``model_path``."""
        self._get_tokenizer().save_pretrained(model_path)
        self._get_model().save_pretrained(model_path)

    def preprocess_function(self, examples):
        """Tokenize the 'text' field of a batch (padded, truncated to 256 tokens)."""
        tokenize = self._get_tokenizer()
        return tokenize(examples['text'], padding=True,
                        truncation=True, max_length=256)

    def pipeline(self, dataframe, col):
        """
        Prepares the dataframe so that it can be given to the transformer model
        in -> pandas dataframe (must contain a 'text' column)
             col: column name(s) to remove after tokenization
        out -> tokenized dataset: the original columns minus `col`, plus
               whatever fields the tokenizer returns
        """
        data = Dataset.from_pandas(dataframe, preserve_index=False)
        tokenized_ds = data.map(self.preprocess_function, batched=True)
        tokenized_ds = tokenized_ds.remove_columns(col)
        return tokenized_ds

    def extract_hidden_states(self, batch):
        """Return the first-token ([CLS]) hidden state for every row of ``batch``.

        ``batch`` is a mapping of column name to tensor; values must support
        ``.to(device)`` (presumably the dataset was put in torch format by the
        caller - verify). Only the tokenizer's model input columns are
        forwarded to the model.

        Returns ``{"hidden_state": ndarray}`` with one vector per row.
        """
        net = self._get_model()
        tok = self._get_tokenizer()

        # Use the device the model actually lives on instead of an undefined
        # global `device`.
        target = getattr(net, "device", None)
        if target is None:
            target = next(net.parameters()).device

        # Place model inputs on the model's device
        inputs = {k: v.to(target) for k, v in batch.items()
                  if k in tok.model_input_names}

        # Extract last hidden states. Classification-head outputs expose no
        # `last_hidden_state`, so request all hidden states and fall back to
        # the final layer when the attribute is missing.
        with torch.no_grad():
            outputs = net(**inputs, output_hidden_states=True)
        last_hidden_state = getattr(outputs, "last_hidden_state", None)
        if last_hidden_state is None:
            last_hidden_state = outputs.hidden_states[-1]

        # Return vector for [CLS] token
        return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

    def compute_metrics(self, eval_pred):
        """Compute accuracy from a ``(predictions, labels)`` pair.

        ``predictions`` holds per-class scores; the arg-max over axis 1 is
        taken as the predicted class.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        accuracy = accuracy_score(y_true=labels, y_pred=predictions)
        return {"accuracy": accuracy}

    def training(self, model, train, validation):
        """Fine-tune ``model`` on ``train``, evaluating on ``validation`` each epoch.

        Checkpoints are written to ./results once per epoch and the best one
        is reloaded at the end of training. Returns the ``Trainer``.
        """
        tok = self._get_tokenizer()
        data_collator = DataCollatorWithPadding(tokenizer=tok)

        # The fused AdamW kernel needs CUDA; fall back to the regular torch
        # implementation when no GPU is available so the cell still runs.
        optim_name = "adamw_torch_fused" if torch.cuda.is_available() else "adamw_torch"

        training_args = TrainingArguments(
            output_dir="./results",
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=2,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy='epoch',
            load_best_model_at_end=True,
            push_to_hub=False,
            #torch_compile=True,
            optim=optim_name,
            report_to="none"
        )

        trainer = Trainer(
            tokenizer=tok,
            model=model,
            args=training_args,
            train_dataset=train,
            eval_dataset=validation,
            compute_metrics=self.compute_metrics,
            data_collator=data_collator,
        )
        trainer.train()

        return trainer

    def predict(self, text, model, tokenizer, max_length=None):
        """Return the probability of the last class (label 1 for a 2-label model).

        Parameters
        ----------
        text : the string to score.
        model : classifier whose output exposes ``.logits``.
        tokenizer : callable producing the model inputs as tensors.
        max_length : optional padding/truncation length forwarded to the
            tokenizer; ``None`` leaves the choice to the tokenizer.

        Returns
        -------
        float : softmax probability of the last class for the single input.
        """
        inputs = tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )

        model.eval()

        # Follow the model to whatever device it is on (a specific GPU index,
        # CPU, ...) rather than assuming the default 'cuda' device.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Row 0 is the only input; the last column is the positive class.
        return probabilities[0, -1].item()




In [5]:
# Single helper instance; its methods are called by the cells below.
hf = Hf()

In [6]:
# Load the competition files plus the external DAIGT v2 training set.
# The competition's target column is renamed from "generated" to "label" at
# load time so it matches the column name used by the external set.
train_df = (
    pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
    .rename(columns={"generated": "label"})
)
test_df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sample = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
train_ext = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

In [7]:
# Stack the competition and external training rows. ignore_index=True gives
# the combined frame a fresh 0..n-1 index instead of carrying over the two
# overlapping source indexes (duplicate labels make .loc lookups ambiguous).
df_final = pd.concat(
    [train_df[["text", 'label']], train_ext[["text", "label"]]],
    ignore_index=True,
)

In [8]:
# Hold out 20% of the combined data; the held-out part is used as the validation set during training.
train, test = hf.split_train_test(df_final)

In [9]:
# Local (offline) Kaggle dataset directory holding the DistilBERT config, weights and vocab files.
model_path = '/kaggle/input/distilbert-base-uncased/distilbert_base_uncased/'

In [10]:
# Load the tokenizer and the 2-label classification model from model_path.
# These two notebook-level names are also read by later cells.
tokenizer, model = hf.distilbert_base_uncased(model_path)

In [11]:
# Tokenize both splits; the raw 'text' column is removed once it has been encoded.
text_columns = ['text']
tokenized_train = hf.pipeline(train, text_columns)
tokenized_test = hf.pipeline(test, text_columns)

  if _pandas_api.is_sparse(col):


  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [12]:
# Fine-tune the model on the training split, evaluating on the held-out split; keep the Trainer for later use.
trainer = hf.training(model, tokenized_train, tokenized_test)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0095,0.006502,0.998811
2,0.004,0.009693,0.998595


In [13]:
def _score_essay(essay_text):
    """Return the model's probability for the last class (label 1) for one essay."""
    return hf.predict(essay_text, model, tokenizer)


# Score every test essay one at a time and store the probability per row.
test_df['generated'] = test_df['text'].map(_score_essay)
test_df['generated']

0    0.999997
1    0.999986
2    0.999962
Name: generated, dtype: float64

In [14]:
# Drop the columns that are not written to the submission file.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it is a no-op rather than a KeyError.
test_df = test_df.drop(columns=['prompt_id', 'text'], errors='ignore')

In [15]:
# Write the remaining columns to submission.csv in the working directory, without the index column.
test_df.to_csv('submission.csv', index=False)