In [1]:
import sys
import os
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    SentencePieceBPETokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, sum_models
from xgboost import XGBClassifier
from xgboost import DMatrix
from sklearn.svm import SVC

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from scipy import sparse
from joblib import Parallel, delayed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 10
ROOT = "../input"

# Read the v3 training CSV and keep every row whose `source` is anything other
# than "train_essays". The frame is not shuffled at load time.
train = pd.read_csv(f"{ROOT}/daigt-v3-train-dataset/train_v3_drcat_02.csv", sep=",")
train = train[train["source"] != "train_essays"]

In [3]:
# Distinct values of the `model` column, sorted by np.unique. Stored under its
# own name so that the `models` module imported from `tokenizers` is not
# overwritten by this array.
model_names = np.unique(train["model"])
model_names

array(['ada', 'babbage', 'claude', 'cohere', 'curie', 'davinci', 'falcon',
       'gpt', 'human', 'llama', 'mistral', 'palm'], dtype=object)

How should I divide K-folds for proper validation?
- model -> prompt? (seems more appropriate) -> then match human labels with corresponding prompts?
- Balance the labels within each prompt (by removing the excess samples)


In [4]:
# One "<model>_<prompt_name>" label per row, in row order. Walking the two
# columns together avoids materialising a Series for every row, which is what
# `train.iloc[i]` does inside a Python loop.
model_prompts = [
    f"{model_name}_{prompt_name}"
    for model_name, prompt_name in zip(train["model"], train["prompt_name"])
]
# Sorted distinct labels.
model_prompt_species = np.unique(model_prompts)

In [5]:
# model_prompt_indices: species label -> [row positions, row positions]; both
#   lists start empty and are filled by later cells.
# prompt_to_species_map: text after the last "_" of a label -> labels sharing it.
# Labels containing "human" are left out of both dictionaries.
model_prompt_indices = {}
prompt_to_species_map = {}
for species in model_prompt_species:
    if "human" in species:
        continue
    model_prompt_indices[species] = [[], []]
    prompt = species.split("_")[-1]
    prompt_to_species_map.setdefault(prompt, []).append(species)

In [6]:
# Record the position of every non-human row in slot 0 of its species entry.
for position, species in enumerate(model_prompts):
    if "human" in species:
        continue
    model_prompt_indices[species][0].append(position)

In [7]:
# Hand each human row to the first species with the same prompt that still has
# fewer entries in slot 1 than in slot 0. A human row for which no species has
# room is not recorded anywhere.
for position, species in enumerate(model_prompts):
    if "human" not in species:
        continue
    prompt = species.split("_")[-1]
    for candidate in prompt_to_species_map[prompt]:
        generated_rows, human_rows = model_prompt_indices[candidate]
        if len(generated_rows) > len(human_rows):
            human_rows.append(position)
            break

In [8]:
# A species is kept only when both of its row lists have the same length and
# that length is at least this value.
MIN_SAMPLES_PER_SPECIES = 10

# Iterate over a snapshot of the keys: entries are deleted inside the loop, and
# re-running the cell must not look up species that were removed earlier.
for species in list(model_prompt_indices):
    n_model_samples = len(model_prompt_indices[species][0])
    n_human_samples = len(model_prompt_indices[species][1])
    if (
        n_model_samples != n_human_samples
        or n_model_samples < MIN_SAMPLES_PER_SPECIES
    ):
        del model_prompt_indices[species]

# Report the sizes of the two row lists for every species that survived.
for species, (model_rows, human_rows) in model_prompt_indices.items():
    if "human" not in species:
        print(f"{species}: {len(model_rows)} | {len(human_rows)}")

ada_"A Cowboy Who Rode the Waves": 99 | 99
ada_Car-free cities: 97 | 97
ada_Does the electoral college work?: 99 | 99
ada_Driverless cars: 98 | 98
ada_Exploring Venus: 100 | 100
ada_Facial action coding system: 100 | 100
ada_The Face on Mars: 99 | 99
babbage_"A Cowboy Who Rode the Waves": 99 | 99
babbage_Car-free cities: 100 | 100
babbage_Does the electoral college work?: 100 | 100
babbage_Driverless cars: 100 | 100
babbage_Exploring Venus: 99 | 99
babbage_Facial action coding system: 100 | 100
babbage_The Face on Mars: 100 | 100
claude_"A Cowboy Who Rode the Waves": 141 | 141
claude_Car-free cities: 97 | 97
claude_Cell phones at school: 132 | 132
claude_Community service: 118 | 118
claude_Distance learning: 121 | 121
claude_Does the electoral college work?: 130 | 130
claude_Driverless cars: 155 | 155
claude_Exploring Venus: 138 | 138
claude_Facial action coding system: 148 | 148
claude_Grades for extracurricular activities: 126 | 126
claude_Mandatory extracurricular activities: 147 | 

In [9]:
model_species = np.unique(train.model)

# Group the non-human species labels by the text before their first "_".
model_to_species_map = {}
for species in model_prompt_species:
    if "human" in species:
        continue
    model_name = species.split("_")[0]
    model_to_species_map.setdefault(model_name, []).append(species)

In [10]:
model_species = np.unique(train.model)

# One fold id per row position; -1 marks rows that are not assigned to a fold.
fold_indices = -np.ones(len(train), dtype=int)
remain_model_prompt_keys = list(model_prompt_indices.keys())
for i, model_name in enumerate(model_species):
    if model_name == "human":
        continue
    # The fold id is the model's position in `model_species`. Because "human"
    # is skipped, its position is never used as a fold id.
    for candidate in model_to_species_map[model_name]:
        # Species removed from model_prompt_indices contribute no rows.
        if candidate not in model_prompt_indices:
            continue
        generated_rows, human_rows = model_prompt_indices[candidate]
        fold_indices[generated_rows] = i
        fold_indices[human_rows] = i
np.unique(fold_indices, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  9, 10]),
 array([27047,  1384,  1396,  4000,   698,  1392,  4198,  9072,  8116,
         3940,  2646]))

In [11]:
# Relabel the assigned folds as consecutive integers starting at 0, leaving the
# -1 "unassigned" marker untouched. Each fold id is replaced by its rank among
# the ids actually in use, so the result does not depend on where the gap in
# the numbering is, and running the cell a second time changes nothing.
assigned = fold_indices >= 0
used_folds = np.unique(fold_indices[assigned])
fold_indices[assigned] = np.searchsorted(used_folds, fold_indices[assigned])
np.unique(fold_indices)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [12]:
# Attach the fold ids, keep only rows with a fold, shuffle with a fixed seed,
# drop two columns, rename `label` to `generated`, then write the result out.
train = (
    train.assign(fold=fold_indices)
    .loc[lambda frame: frame.fold > -1]
    .sample(frac=1.0, random_state=SEED)
    .drop(columns=["source", "RDizzl3_seven"])
    .rename(columns={"label": "generated"})
)
train.to_csv("../input/20230115_v3_model-split_balanced.csv", index=False)

In [13]:
# Distinct fold ids in `train` and the number of rows in each.
np.unique(train.fold, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1384, 1396, 4000,  698, 1392, 4198, 9072, 8116, 3940, 2646]))

In [4]:
# Load two split files from ../input: the balanced model split into `train`
# and a 10-fold model-wise split into `train_prev`.
train = pd.read_csv("../input/20230115_v3_model-split_balanced.csv")
train_prev = pd.read_csv("../input/model-wise-split_10folds.csv")

In [5]:
# Display the rows whose fold id is 9.
train.loc[train.fold == 9]

Unnamed: 0,text,generated,prompt_name,model,fold
17,"As an eighth-grade student, I have many goals ...",1,Summer projects,mistral,9
24,The school board should not add an additional...,1,Mandatory extracurricular activities,mistral,9
57,"Dear Principal,\n\nCommunity service is great ...",0,Community service,human,9
65,"Thomas Jefferson's words, ""Determine never to...",1,Mandatory extracurricular activities,mistral,9
67,"Dear Principal,\n\nI would like to take positi...",0,Community service,human,9
...,...,...,...,...,...
36730,Do I agree or do I disagree with my principals...,0,Mandatory extracurricular activities,human,9
36752,It has been said that some school's are requir...,0,Summer projects,human,9
36764,"Dear Hiring Manager,\n\nI am writing to expres...",1,Community service,mistral,9
36776,Inactivity has a significant impact on a perso...,1,Mandatory extracurricular activities,mistral,9


In [15]:
# Print the per-fold row counts of both frames, one tuple per line.
for frame in (train, train_prev):
    print(np.unique(frame.fold, return_counts=True))

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([1384, 1396, 4000,  698, 1392, 4198, 9072, 8116, 3940, 2646]))
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([1384, 1396, 4000,  698, 1392, 4198, 9072, 8116, 3940, 2646]))


In [12]:
# Replace `train` with the contents of the all-data model-wise split file.
train = pd.read_csv("../input/20230112_model-wise-split_all-data.csv")

In [20]:
fold_models = []
folds = sorted(np.unique(train.fold))
target_model = "ada"
target_model_fold = 0

# For fold ids 0 .. len(folds) - 1, print every model name found in that fold
# except "human", one name per line.
for fold_id in range(len(folds)):
    rows_in_fold = train.loc[train.fold == fold_id]
    for name in np.unique(rows_in_fold.model):
        if name == "human":
            continue
        print(name)

ada
babbage
claude
cohere
curie
davinci
falcon
gpt
llama
mistral
palm


In [25]:
# Reload the all-data split and choose the model name stored in `target_model`.
train = pd.read_csv("../input/20230112_model-wise-split_all-data.csv")
target_model = "babbage"


def last_fold_changer(train, model_name_last_fold):
    """Move one generator model into the highest-numbered fold.

    Rows currently in the last fold are relabelled with the fold that held
    ``model_name_last_fold``; then every row of that model is relabelled with
    the last fold. Afterwards the model names of each fold, except "human",
    are printed one per line in ascending fold order.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``model`` column and a ``fold`` column. It is modified
        in place.
    model_name_last_fold : str
        Value of ``model`` whose rows should end up in the last fold.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with ``fold`` updated.

    Raises
    ------
    ValueError
        If no row has ``model == model_name_last_fold``.

    Notes
    -----
    Only rows of the requested model are moved into the last fold. Other rows
    that shared its previous fold (for example "human" rows) keep that fold.
    """
    is_target = train.model == model_name_last_fold
    if not is_target.any():
        # Without this check the last fold would be relabelled even though
        # nothing takes its place, silently dropping a fold.
        raise ValueError(f"model {model_name_last_fold!r} not found in train.model")

    folds = sorted(np.unique(train.fold))
    last_fold = np.max(folds)
    # If the model is spread over several folds, use the highest one.
    target_fold = train.loc[is_target, "fold"].max()

    # The last-fold mask is evaluated before any relabelling, so rows moved
    # out of the last fold and rows moved into it are never confused.
    in_last_fold = train.fold == last_fold
    train.loc[in_last_fold, "fold"] = target_fold
    train.loc[is_target, "fold"] = last_fold

    for fold in folds:
        for model in np.unique(train.loc[train.fold == fold, "model"]):
            if model != "human":
                print(model)
    return train


# Pass the model configured in `target_model` instead of a separate literal,
# so the requested model and the configured one cannot disagree.
train = last_fold_changer(train, target_model)

ada
palm
claude
cohere
curie
davinci
falcon
gpt
llama
mistral
babbage


In [26]:
# Display `train` after the fold swap.
train

Unnamed: 0,text,generated,prompt_name,model,fold
0,"As a scientist at NASA, I've had the privilege...",1,The Face on Mars,llama,8
1,Have you ever been forced to go somewhere you ...,0,Mandatory extracurricular activities,human,3
2,"Dear Principal,\n\nI personally think or belie...",0,Community service,human,4
3,Some people do extracurricular activities and ...,0,Mandatory extracurricular activities,human,2
4,Driverless Cars are Coming: Navigating the Roa...,1,Driverless cars,falcon,6
...,...,...,...,...,...
63884,Title: The Face on Mars: A Natural Geological ...,1,The Face on Mars,llama,8
63885,Would you like to be able to not drive you car...,0,Driverless cars,human,1
63886,Luke Bomberger was just an ordinary high schoo...,1,"""A Cowboy Who Rode the Waves""",falcon,6
63887,"""Resent studies suggest that americans are buy...",0,Car-free cities,human,4


In [28]:
# Distinct `generated` values among the rows whose model is "ada".
np.unique(train.loc[train.model == "ada"].generated)

array([1])

In [16]:
# Show the first five rows of `train_prev`.
train_prev.head(5)

Unnamed: 0,text,generated,prompt_name,model,fold
0,I don't like the idea that students forget al...,1,Summer projects,mistral,9
1,The Challenge of Exploring Venus\n\nIntroducti...,1,Exploring Venus,falcon,6
2,Have you ever participated in a extracurricula...,0,Mandatory extracurricular activities,human,2
3,When to choose between the two either online o...,0,Distance learning,human,2
4,"Summer Projects\n\nSummer, one of the greatest...",0,Summer projects,human,2
