# Imports

In [None]:
import peft
import transformers
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import (
    Trainer, DataCollatorWithPadding, BitsAndBytesConfig, AutoTokenizer,
    LlamaForSequenceClassification, DataCollatorWithPadding, Trainer
)
from datasets import Dataset
from sklearn.metrics import roc_auc_score
import torch
import numpy as np
import pandas as pd
import gc
import time

# Record the library versions this notebook was executed with.
print(f"Torch Version: {torch.__version__}")
print(transformers.__version__, peft.__version__, sep="\n")



Torch Version: 2.0.0
4.35.0
0.6.0


# Mistral Model Inference

In [None]:
# Load the fine-tuned Mistral model
class MistralModelInference:
    """Score texts with a 4-bit quantized base model plus a fine-tuned PEFT adapter.

    The base checkpoint is loaded through ``LlamaForSequenceClassification``
    with a single output logit (``NUM_LABELS = 1``); the adapter found at
    ``adapter_path`` is then attached with ``PeftModel.from_pretrained``.
    ``inference`` maps that logit through a sigmoid to a score in [0, 1].

    Args:
        model_path: Path to the base model checkpoint.
        adapter_path: Path to the fine-tuned adapter.
    """

    def __init__(self, model_path, adapter_path):
        self.MODEL_PATH = model_path  # The path to Mistral model
        self.ADAPTER_PATH = adapter_path  # The path to the fine-tuned adapter
        self.NUM_LABELS = 1     # single logit -> sigmoid score
        self.MAX_LENGTH = 512   # tokens kept per text; longer inputs are truncated
        self.load_model()

    def load_model(self):
        """Load the tokenizer, the quantized base model and the adapter.

        Sets ``self.tokenizer`` and ``self.model`` and prints the elapsed time.
        """
        start = time.time()
        # Load the tokenizer of LLM model; the EOS token doubles as padding token.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_PATH, use_fast=False)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        # Load base/pretrained LLM model.
        # NOTE(review): a Mistral checkpoint is instantiated through the Llama
        # class here (transformers warns about it) — presumably to match how the
        # adapter was trained; confirm before switching classes.
        base_model = LlamaForSequenceClassification.from_pretrained(
            self.MODEL_PATH,
            num_labels=self.NUM_LABELS,
            quantization_config=bnb_config,
        )

        # Per the LlamaConfig docs, pretraining_tp > 1 enables an experimental
        # sliced-matmul path that mimics tensor-parallel pretraining; 1 selects
        # the ordinary linear layers.
        base_model.config.pretraining_tp = 1
        # The classification head needs a pad token id to locate the last real token.
        base_model.config.pad_token_id = self.tokenizer.pad_token_id

        # Load the fine-tuned adapter layer on top of base model
        self.model = PeftModel.from_pretrained(base_model, self.ADAPTER_PATH)
        print(f"Complete loading pretrained LLM model {time.time() - start:.1f} seconds")

    def preprocess_function(self, examples):
        """Clean and tokenize one batch of texts (used with ``Dataset.map``).

        Args:
            examples: Batch mapping with a ``"text"`` list.

        Returns:
            The tokenizer output for the cleaned texts, truncated to
            ``MAX_LENGTH`` and padded within the batch.
        """
        # NOTE(review): `pre_processing_text` is not defined in this class; it
        # must exist as a notebook-level function before inference is called.
        examples["text"] = [pre_processing_text(text) for text in examples["text"]]
        return self.tokenizer(examples["text"], truncation=True,
                              max_length=self.MAX_LENGTH, padding=True)

    def sigmoid(self, x):
        """Map logits to (0, 1) with a numerically stable logistic function.

        The input is promoted to float64 first: evaluating ``exp(-x)`` directly
        in half precision overflows for moderately negative logits and collapses
        the score to exactly 0.

        Args:
            x: Scalar or array-like of logits.

        Returns:
            float64 value(s) ``1 / (1 + exp(-x))`` with the same shape as ``x``.
        """
        x = np.asarray(x, dtype=np.float64)
        # sigmoid(x) = exp(-log(1 + exp(-x))); logaddexp evaluates the log term
        # without overflowing for large |x|.
        return np.exp(-np.logaddexp(0.0, -x))

    def inference(self, test_texts):
        """Return one score in [0, 1] per input text.

        Args:
            test_texts: Sequence of raw text strings.

        Returns:
            1-D float64 array of sigmoid-transformed logits, in input order
            (empty when ``test_texts`` is empty).
        """
        if len(test_texts) == 0:
            # Nothing to score; skip building a dataset and a Trainer.
            return np.empty(0, dtype=np.float64)

        test_data = pd.DataFrame({'text': test_texts})
        test_dataset = Dataset.from_pandas(test_data)
        test_tokenized_ds = test_dataset.map(self.preprocess_function, batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, padding="longest")
        trainer = Trainer(model=self.model,
                          tokenizer=self.tokenizer,
                          data_collator=data_collator)

        pred_output = trainer.predict(test_tokenized_ds)
        logits = np.asarray(pred_output.predictions)
        # Column 0 is the only logit (NUM_LABELS == 1): probability of LLM-generated text.
        return self.sigmoid(logits[:, 0])

In [None]:
# Base Mistral-7B checkpoint.
model_path = "/mistral/pytorch/7b-v0.1-hf/1"  # Mistral
# Fine-tuned adapter produced by the training notebook to improve the base model.
# (The variable name is misspelled; it is kept unchanged so existing references keep working.)
adpater_path = "/mistral-7b-tpu-trained-checkpoint/mistral_7b/mistral_7b_TPU"
mistral_inference = MistralModelInference(model_path=model_path, adapter_path=adpater_path)

# Smoke test: score two placeholder texts before touching the real test set.
test_texts = ["Your test text goes here.", "Another test text."]
predicted_probs = mistral_inference.inference(test_texts)

print("Predicted Probabilities:", predicted_probs)

You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/mistral/pytorch/7b-v0.1-hf/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Complete loading pretrained LLM model 143.6 seconds


  0%|          | 0/1 [00:00<?, ?ba/s]

[[12.47]
 [10.56]]
Predicted Probabilities: [1. 1.]


In [None]:
# Load test data
# Read the competition test essays (comma-separated, the pandas default).
# The rename only has an effect when a 'generated' column is present; the frame
# displayed below has only id / prompt_id / text.
test_df = (
    pd.read_csv("/llm-detect-ai-generated-text/test_essays.csv")
    .rename(columns={'generated': 'label'})
)

display(test_df)

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [None]:
# Infer the probabilities of texts in the testing dataset
# Score every essay in the test set with the Mistral classifier.
probs = mistral_inference.inference(test_df['text'].tolist())

# Pair each essay id with its score, in the row order of test_df.
IDs = test_df['id'].values
predictions = []
for essay_id, essay_score in zip(IDs, probs):
    print(f"ID {essay_id}, prob = {essay_score}")
    predictions.append(dict(id=essay_id, generated=essay_score))
print(f"predictions = {predictions}")

  0%|          | 0/1 [00:00<?, ?ba/s]

[[6.332]
 [1.454]
 [7.03 ]]
ID 0000aaaa, prob = 0.998046875
ID 1111bbbb, prob = 0.810546875
ID 2222cccc, prob = 0.9990234375
predictions = [{'id': '0000aaaa', 'generated': 0.998}, {'id': '1111bbbb', 'generated': 0.8105}, {'id': '2222cccc', 'generated': 0.999}]


In [None]:
# Mistral scores as a frame with columns id / generated, in test_df row order.
mistral_res = pd.DataFrame(predictions)

# gc.collect() alone cannot reclaim the quantized 7B model while
# `mistral_inference` still references it, so drop the reference first and then
# return the cached GPU memory before the next models are loaded.
del mistral_inference
gc.collect()
torch.cuda.empty_cache()

113

### DistilRoBERTa + DeBERTa

In [None]:
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer

In [None]:
# Fine-tuned DistilRoBERTa checkpoint (2-class sequence classifier).
model_checkpoint = "/detect-llm-models/distilroberta-finetuned_v5/checkpoint-49654"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    """Tokenize a batch of essays, truncated to 512 tokens and padded within the batch."""
    return tokenizer(examples['text'], max_length=512, padding=True, truncation=True)


num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Move the model to the GPU when one is available.
model.to(device);
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)

# Essays are scored in id order; `test` keeps its original row labels as index.
test = pd.read_csv('/llm-detect-ai-generated-text/test_essays.csv').sort_values('id')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions

# Softmax over the class axis. Subtracting the row maximum first keeps np.exp
# from overflowing to inf (which would turn the ratio into nan) on large logits;
# the result is mathematically unchanged.
shifted_logits = logits - logits.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
# NOTE(review): column 0 is used as the "generated" score here, whereas the
# DeBERTa cell uses column 1 — confirm the label mapping of this checkpoint.
probs = (exp_logits / exp_logits.sum(axis=-1, keepdims=True))[:, 0]

# Assigning test['id'] first gives `res` the index of `test`, so the
# positionally assigned scores stay attached to the right row labels.
res = pd.DataFrame()
res['id'] = test['id']
res['generated'] = probs
res = res.sort_values('id')
res.head()

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,id,generated
0,0000aaaa,0.001225
1,1111bbbb,0.001146
2,2222cccc,0.001189


In [None]:
# Drop every reference created by the DistilRoBERTa cell so its memory can be reclaimed.
del model, trainer
del test_ds, test_ds_enc
del tokenizer, test_preds
# The imported class names are removed too; they are imported again in the next cell.
del Trainer, TrainingArguments
import gc
gc.collect()
# Hand cached CUDA blocks back to the driver.
torch.cuda.empty_cache()

In [None]:
import random
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Reload the test essays ordered by id; the original row labels are kept as index.
df = pd.read_csv("/llm-detect-ai-generated-text/test_essays.csv")
df = df.sort_values('id')
# Only the text column is handed to the DeBERTa pipeline.
dataset = Dataset.from_pandas(df[["text"]])

In [None]:
tokenizer = AutoTokenizer.from_pretrained('/huggingfacedebertav3variants/deberta-v3-small')


def tokenize_function(examples):
    """Tokenize a batch of essays, padding/truncating each one to exactly 512 tokens."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Fine-tuned DeBERTa-v3-small classifier checkpoint.
deberta_checkpoint = '/debertav3-small-llm-trained/checkpoint-24579-20240118T191843Z-001/checkpoint-24579'
model = AutoModelForSequenceClassification.from_pretrained(deberta_checkpoint)

In [None]:
# Prediction-only configuration: no training, and the last partial batch is kept.
test_args = TrainingArguments(
    output_dir='deberta-small',
    do_train=False,
    do_predict=True,
    dataloader_drop_last=False,
)

# The Trainer is used purely as a batched prediction loop.
trainer = Trainer(model=model, args=test_args)

test_results = trainer.predict(tokenized_dataset)



In [None]:
# Turn the raw DeBERTa logits into per-class probabilities (softmax across dim 1).
deberta_logits = torch.from_numpy(test_results.predictions)
probas = torch.nn.functional.softmax(deberta_logits, dim=1).numpy()

In [None]:
# `probas` rows follow `df`, which is sorted by id but still carries the
# original row labels as its index. Building the frame on df.index keeps every
# score attached to its own essay. Assigning the array to an empty frame first
# would create a fresh 0..n-1 index, and the label-aligned `id` column (and the
# later index-aligned ensemble) would then pair scores with the wrong essays
# whenever the CSV is not already sorted by id.
deb_res = pd.DataFrame(
    {
        'generated': probas.T[1],  # probability of class 1
        'id': df['id'],
    },
    index=df.index,
)

In [None]:
# Blend weights, in order: Mistral, DistilRoBERTa, DeBERTa (they sum to 0.90).
WEIGHTS = [0.30, 0.35, 0.25]

# pandas aligns the three score Series on their index labels before adding;
# rows missing from any of them become NaN and are dropped below.
weighted_scores = (
    mistral_res['generated'] * WEIGHTS[0]
    + res['generated'] * WEIGHTS[1]
    + deb_res['generated'] * WEIGHTS[2]
)

submission = pd.DataFrame({'id': test["id"], 'generated': weighted_scores}).dropna()

In [None]:
# Keyword lists for the score adjustment further down in this cell.
# hkw: each hit lowers the score of essays at or below the median blended score.
# mkw: each hit raises the score of essays above the median blended score.
# (presumably "human" / "machine" keywords — confirm with the author)
# count_keywords matches substrings, so overlapping entries such as
# 'failure'/'failures' or 'gun'/'guns' can both count for a single word.
hkw = ['because', 'then', 'dont', 'texting', 'probably', 'almost']
mkw = ['additionally', 'significant', 'attitude', 'failure', 'climate', 'ensures', 'address', 'achieving', 'graduating', 'engagement', 'determination', 'impression', 'drawbacks', 'modes', 'enthusiasm', 'kindness', 'prioritize', 'urban', 'commitments', 'embrace', 'reliance', 'supportive', 'fulfilling', 'stricter', 'adopting', 'argues', 'conservation', 'gun', 'artificial', 'violent', 'foster', 'failures', 'initial', 'employers', 'stability', 'meat', 'monitoring', 'aim', 'libraries', 'geological', 'committing', 'external', 'maintenance', 'footprint', 'undemocratic', 'platforms', 'consumption', 'shaping', 'biases', 'highlights', 'invaluable', 'societal', 'infrastructure', 'integral', 'diet', 'populous', 'insights', 'chronic', 'profound', 'sustainable', 'livable', 'internships', 'constitutional', 'setbacks', 'codes', 'inclusive', 'align', 'successes', 'delays', 'densely', 'marine', 'ethical', 'belonging', 'guns', 'addressing', 'appreciation', 'trump', 'smith', 'fosters', 'wage', 'firsthand', 'cyberbullying', 'emphasizes', 'embracing']

def count_keywords(text, keywords, whole_word=False):
    """Count how many of ``keywords`` occur in ``text`` (case-insensitive on the text).

    Each keyword contributes at most 1, however often it occurs. Keywords are
    compared as given against the lower-cased text, so they should be lower-case.

    Args:
        text: The text to scan. Non-string values (e.g. NaN/None from a CSV
            with missing essays) are treated as empty and yield 0.
        keywords: Iterable of keyword strings.
        whole_word: When False (default), a keyword matches anywhere inside the
            text, so 'gun' also matches 'begun'. When True, a keyword only
            matches a complete word.

    Returns:
        int: Number of keywords found.
    """
    import re  # local import keeps this helper self-contained

    if not isinstance(text, str):
        return 0

    ltext = text.lower()
    if whole_word:
        words = set(re.findall(r"\w+", ltext))
        return sum(kw in words for kw in keywords)
    return sum(kw in ltext for kw in keywords)

# Per-essay keyword hit counts for both keyword lists.
df["h_keyword_count"] = df['text'].apply(count_keywords, keywords=hkw)
df["m_keyword_count"] = df['text'].apply(count_keywords, keywords=mkw)

# prob_df is the same object as `submission`, not a copy.
prob_df = submission

# Essays at or below the median blended score lose 10% of their score per
# h-keyword hit; essays above the median gain 10% per m-keyword hit.
median_score = prob_df['generated'].quantile(0.50)
at_or_below_median = prob_df['generated'] <= median_score
lowered_score = prob_df['generated'] - df['h_keyword_count'] * (prob_df['generated'] * 0.10)
raised_score = prob_df['generated'] + df['m_keyword_count'] * (prob_df['generated'] * 0.10)
prob_df['new_prob'] = np.where(at_or_below_median, lowered_score, raised_score)

In [None]:
# Replace the blended score with the keyword-adjusted one. `prob_df` was bound to
# `submission` without copying, so `submission` is updated as well.
prob_df['generated'] = prob_df['new_prob']

In [None]:
# Remove the temporary column so only id / generated remain, then display the frame.
# Not idempotent: running this cell twice raises KeyError because the column is gone.
del prob_df['new_prob']
prob_df

Unnamed: 0,id,generated
0,0000aaaa,0.549987
1,1111bbbb,0.493563
2,2222cccc,0.550219


In [None]:
# Write the final id / generated table without the index column.
# NOTE(review): the file is named 'sheet.csv' — confirm this is the filename the consumer expects.
prob_df.to_csv('sheet.csv', index=False)