In [1]:
import csv
import os
from typing import Dict, List

import numpy as np
import pandas as pd
from tqdm import tqdm

from ex_params import (
    DATA_AI_PATH,
    DATA_HUMAN_PATH,
    DATASETS,
    DATASETS_PATH,
    STATS_PATH,
    SEED,
)
from ex_utils import get_csv_paths

np.random.seed(SEED)
BATCH_SIZE = 256

In [2]:
def get_master_stats(stats: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    master_stats = {
        "data": [],
        "model": [],
        "num_samples": [],
        "num_sentences": [],
        "num_words": [],
        "num_chars": [],
        "num_tokens": [],
    }
    for k, v in stats.items():
        data, model = k.split("_")
        master_stats["data"].append(data)
        master_stats["model"].append(model)
        master_stats["num_samples"].append(len(v))
        for col in v.columns:
            master_stats[col].append(v[col].sum())
    df = pd.DataFrame(master_stats)
    return df

In [4]:
paths = get_csv_paths(STATS_PATH + "data_ai/", recursive=True) + get_csv_paths(
    STATS_PATH + "data_human/"
)

In [5]:
len(paths)

220

In [112]:
stats = dict(
    {
        f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(
            path
        )
        for path in paths
    }
)

In [113]:
df = get_master_stats(stats)

In [126]:
DATASETS_PATH

'../../../data/datasets/'

In [68]:
df = pd.read_csv(DATASETS_PATH + "detect-gpt-4.1-nano-2025-04-14/train_idx.csv")
cols_c0 = ["gpt-4.1-nano-2025-04-14"]

In [69]:
df

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,154642,10,235,1099,242,nyt-comments,Qwen2.5-14B-Instruct
1,56305,3,65,323,68,nyt-comments,Qwen2.5-14B-Instruct
2,143092,3,56,248,54,nyt-comments,Qwen2.5-14B-Instruct
3,154098,1,11,44,11,nyt-comments,Qwen2.5-14B-Instruct
4,150720,1,12,44,14,nyt-comments,Qwen2.5-14B-Instruct
...,...,...,...,...,...,...,...
404895,16987,2,42,198,41,nyt-comments,gpt-4.1-nano-2025-04-14
404896,200915,3,55,265,57,nyt-comments,gpt-4.1-nano-2025-04-14
404897,49826,3,63,289,64,nyt-comments,gpt-4.1-nano-2025-04-14
404898,57638,2,36,177,36,nyt-comments,gpt-4.1-nano-2025-04-14


In [70]:
df["is_human"] = np.where(df["model"].isin(cols_c0), "human", "ai")

In [71]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,10081945676,1459990,28931227,121580982,30198626,nyt-commentsnyt-commentsnyt-commentsnyt-commen...,Qwen2.5-14B-InstructQwen2.5-14B-InstructQwen2....
human,16218696146,1464612,28657397,127956096,29803284,nyt-commentsnyt-commentsnyt-commentsnyt-commen...,gpt-4.1-nano-2025-04-14gpt-4.1-nano-2025-04-14...


In [72]:
df = pd.read_csv(DATASETS_PATH + "detect-gpt-4.1-nano-2025-04-14/val_idx.csv")
cols_c0 = ["gpt-4.1-nano-2025-04-14"]

In [73]:
df["is_human"] = np.where(df["model"].isin(cols_c0), "human", "ai")

In [74]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,4862240517,419059,8583209,36973989,8960557,writingpromptswritingpromptswritingpromptswrit...,Meta-Llama-3.1-70B-Instruct-AWQ-INT4Meta-Llama...
human,15925424002,495112,8743743,40454444,9047797,xsumxsumxsumxsumxsumxsumxsumxsumxsumxsumxsumxs...,gpt-4.1-nano-2025-04-14gpt-4.1-nano-2025-04-14...
