In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [3]:
df_main = pd.read_csv('../data/stats/data_stats_master.csv')
df_main["avg_sent_per_sample"] = df_main["num_sentences"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_sent_per_sample
0,blogs,Phi-3-small-128k-instruct,28836,677403,903150,57362602,15187889,23.491573
1,blogs,Llama-3.2-3B-Instruct,28836,447683,701052,42284112,10675655,15.525142
2,blogs,Qwen2.5-14B-Instruct,28836,330384,391804,24062475,6315173,11.457345
3,blogs,Ministral-8B-Instruct-2410,28836,742094,929327,42635432,12117456,25.734984
4,blogs,Phi-3-medium-128k-instruct,28836,939614,1822958,71160696,18987094,32.584755
...,...,...,...,...,...,...,...,...
58,essays,Qwen2.5-14B-Instruct,2638,91191,44632,4982535,1394185,34.568234
59,essays,Phi-3.5-mini-instruct,2638,129167,73580,6103995,1806459,48.963988
60,blogs,human,576774,8370715,9554000,560257945,165283569,14.512989
61,nyt-articles,human,15813,21318,272825,1759817,421260,1.348131


In [None]:
def create_dataset_idx(max_tokens, batch_size, stats, df_main, cols_c0, save_path):

    for ds in ["blogs", "essays", "nyt-articles"]:
        mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
        mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))
        
        df_main.loc[mask_c1, "prob"] = 1 / df_main.loc[mask_c1, "avg_sent_per_sample"] / (1 / df_main.loc[mask_c1, "avg_sent_per_sample"]).sum()
        
        avg_h = df_main.loc[mask_c0, "avg_sent_per_sample"].values[0]
        avg_ai = (df_main.loc[mask_c1, "avg_sent_per_sample"] * df_main.loc[mask_c1, "prob"]).sum()
        
        c = 1 / (1 + avg_ai / avg_h)
        p = 1 - c
        
        df_main.loc[mask_c1, "prob"] *= c
        df_main.loc[mask_c0, "prob"] = p

    weights = [
        1 / (df_main.loc[df_main["data"] == ds, "avg_sent_per_sample"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
        for ds in ["blogs", "essays", "nyt-articles"]
    ]
    probs = np.array(weights) / np.sum(weights)

    total_tokens = 0
    total_sentences = 0
    total_samples = 0
    cnt = 0
    while total_tokens < max_tokens:
        data = np.random.choice(["blogs", "essays", "nyt-articles"], p=probs)
        tmp = df_main[(df_main["data"] == data)]
        model = np.random.choice(tmp["model"], p=tmp["prob"])

        stat = stats[f"{data}_{model}"]

        slct = stat.sample(n=batch_size)
        stat.drop(slct.index, inplace=True)

        total_tokens += slct.sum()["num_tokens"]
        total_sentences += slct.sum()["num_sentences"]
        total_samples += batch_size


        # save data, model, slct.index to csv
        slct["data"] = data
        slct["model"] = model
        slct.reset_index(inplace=True)
        # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
        slct.to_csv(save_path, mode='a', header=not os.path.exists(save_path), index=False)

        cnt += 1
        if cnt % 1000 == 0:
            print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

    print(
        f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
    )


In [26]:
DATA_HUMAN_PATH = "../data/stats/data_human"
DATA_AI_PATH = "../data/stats/data_ai"
DATASET_IDX_PATH = "../data/datasets/test2_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [27]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})

In [None]:
max_tokens = 1e6
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 16
cols_c0 = "human"

In [None]:
for ds in ["blogs", "essays", "nyt-articles"]:
    mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
    mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))
    
    df_main.loc[mask_c1, "prob"] = 1 / df_main.loc[mask_c1, "avg_sent_per_sample"] / (1 / df_main.loc[mask_c1, "avg_sent_per_sample"]).sum()
    
    avg_h = df_main.loc[mask_c0, "avg_sent_per_sample"].values[0]
    avg_ai = (df_main.loc[mask_c1, "avg_sent_per_sample"] * df_main.loc[mask_c1, "prob"]).sum()
    
    c = 1 / (1 + avg_ai / avg_h)
    p = 1 - c
    
    df_main.loc[mask_c1, "prob"] *= c
    df_main.loc[mask_c0, "prob"] = p

weights = [
    1 / (df_main.loc[df_main["data"] == ds, "avg_sent_per_sample"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
    for ds in ["blogs", "essays", "nyt-articles"]
]
probs = np.array(weights) / np.sum(weights)

total_tokens = 0
total_sentences = 0
total_samples = 0
cnt = 0
while total_tokens < max_tokens:
    data = np.random.choice(["blogs", "essays", "nyt-articles"], p=probs)
    tmp = df_main[(df_main["data"] == data)]
    model = np.random.choice(tmp["model"], p=tmp["prob"])

    stat = stats[f"{data}_{model}"]

    slct = stat.sample(n=batch_size)
    stat.drop(slct.index, inplace=True)

    total_tokens += slct.sum()["num_tokens"]
    total_sentences += slct.sum()["num_sentences"]
    total_samples += batch_size


    # save data, model, slct.index to csv
    slct["data"] = data
    slct["model"] = model
    slct.reset_index(inplace=True)
    # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
    slct.to_csv(DATASET_IDX_PATH, mode='a', header=not os.path.exists(DATASET_IDX_PATH), index=False)

    cnt += 1
    if cnt % 1000 == 0:
        print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

print(
    f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
)


Final samples: 9488, Final sentences: 52319, Final tokens: 1013086


In [None]:
create_dataset_idx(
    max_tokens=max_tokens,
    batch_size=batch_size,
    stats=stats,
    df_main=df_main,
    col_c0="human",
    save_path=DATASET_IDX_PATH
)

total_tokens: 1829673, total_sentences: 91767, total_samples: 16000


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,8956,1,21,89,23,nyt-articles,human
1,3736,2,1,41,14,nyt-articles,human
2,3133,1,19,83,29,nyt-articles,human
3,10762,1,22,124,27,nyt-articles,human
4,13421,2,14,102,26,nyt-articles,human


In [None]:
df["is_human"] = np.where(df["model"] == "human", "human", "ai")

In [None]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,193876359,233805,650686,22550325,5322657,nyt-articlesnyt-articlesnyt-articlesnyt-articl...,Qwen2-7B-InstructQwen2-7B-InstructQwen2-7B-Ins...
human,2165432127,255290,1180776,17303355,4677614,nyt-articlesnyt-articlesnyt-articlesnyt-articl...,humanhumanhumanhumanhumanhumanhumanhumanhumanh...


In [None]:
df.groupby(["data", "is_human"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,num_sentences,num_words,num_chars,num_tokens,model
data,is_human,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
blogs,ai,66290103,77207,165177,6004061,1633273,Qwen2.5-7B-InstructQwen2.5-7B-InstructQwen2.5-...
blogs,human,1686301471,84363,95677,5621581,1654614,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
essays,ai,2718590,76446,30331,4363208,1197528,phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4p...
essays,human,2534817,90067,46643,4993122,1420840,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
nyt-articles,ai,124867666,80152,455178,12183056,2491856,Qwen2-7B-InstructQwen2-7B-InstructQwen2-7B-Ins...
nyt-articles,human,476595839,80860,1038456,6688652,1602160,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
