In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [84]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [85]:
df_main = pd.read_csv('../../data/stats/data_stats_master.csv')
df_main["avg_token_per_sample"] = df_main["num_tokens"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample
0,blogs,Phi-3-small-128k-instruct,28836,677403,903150,57362602,15187889,526.698883
1,blogs,Llama-3.2-3B-Instruct,28836,447683,701052,42284112,10675655,370.219691
2,blogs,Qwen2.5-14B-Instruct,28836,330384,391804,24062475,6315173,219.003086
3,blogs,Ministral-8B-Instruct-2410,28836,742094,929327,42635432,12117456,420.219725
4,blogs,Phi-3-medium-128k-instruct,28836,939614,1822958,71160696,18987094,658.451033
...,...,...,...,...,...,...,...,...
58,essays,Qwen2.5-14B-Instruct,2638,91191,44632,4982535,1394185,528.500758
59,essays,Phi-3.5-mini-instruct,2638,129167,73580,6103995,1806459,684.783548
60,blogs,human,576774,8370715,9554000,560257945,165283569,286.565568
61,nyt-articles,human,15813,21318,272825,1759817,421260,26.640106


In [94]:
DATA_HUMAN_PATH = "../../data/stats/data_human"
DATA_AI_PATH = "../../data/stats/data_ai"
DATASET_IDX_PATH = "../../data/datasets/test3_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [95]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})

In [96]:
max_tokens = 1e7
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 16
cols_c0 = ["human", "phi-4"]

In [None]:
for ds in ["blogs", "essays", "nyt-articles"]:
    df_main.loc[df_main["data"].values == ds, "prob"] = df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"].values / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"].sum()
    mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
    mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))

    class0 = df_main[mask_c0]
    class1 = df_main[mask_c1]

    s1 = (class0["avg_token_per_sample"] * class0["prob"]).sum()
    s2 = (class1["avg_token_per_sample"] * class1["prob"]).sum()
    p1 = class0["prob"].sum()
    p2 = class1["prob"].sum()

    c1 = 1 / (s2 / s1 * p1 + p2)
    c0 = c1 * s2 / s1

    df_main.loc[mask_c0, "prob"] *= c0
    df_main.loc[mask_c1, "prob"] *= c1

weights = [df_main.loc[df_main["data"] == ds, "num_tokens"].sum() for ds in ["blogs", "essays", "nyt_articles"]]
# weights = [
#     (df_main.loc[df_main["data"] == ds, "num_tokens"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
#     for ds in ["blogs", "essays", "nyt-articles"]
# ]
probs = np.array(weights) / np.sum(weights)

total_tokens = 0
total_sentences = 0
total_samples = 0
cnt = 0
while total_tokens < max_tokens:
    data = np.random.choice(["blogs", "essays", "nyt-articles"], p=probs)
    tmp = df_main[(df_main["data"] == data)]
    model = np.random.choice(tmp["model"], p=tmp["prob"])

    stat = stats[f"{data}_{model}"]

    slct = stat.sample(n=batch_size)
    stat.drop(slct.index, inplace=True)

    total_tokens += slct.sum()["num_tokens"]
    total_sentences += slct.sum()["num_sentences"]
    total_samples += batch_size


    # save data, model, slct.index to csv
    slct["data"] = data
    slct["model"] = model
    slct.reset_index(inplace=True)
    # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
    slct.to_csv(DATASET_IDX_PATH, mode='a', header=not os.path.exists(DATASET_IDX_PATH), index=False)

    cnt += 1
    if cnt % 1000 == 0:
        print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

print(
    f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
)


total_tokens: 5096328, total_sentences: 276924, total_samples: 16000
total_tokens: 9984802, total_sentences: 534353, total_samples: 32000
Final samples: 32080, Final sentences: 535166, Final tokens: 10001120


In [98]:
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,1676,6,17,789,145,nyt-articles,phi-4
1,14657,4,23,620,138,nyt-articles,phi-4
2,3933,5,32,854,176,nyt-articles,phi-4
3,12750,4,43,772,137,nyt-articles,phi-4
4,12113,5,29,784,146,nyt-articles,phi-4


In [99]:
df["is_human"] = np.where(df["model"].isin(cols_c0), "human", "ai")

In [100]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,98415559,277924,412635,19663385,5024247,nyt-articlesnyt-articlesnyt-articlesnyt-articl...,Phi-4-mini-instructPhi-4-mini-instructPhi-4-mi...
human,1100025314,257242,448043,20184135,4976873,nyt-articlesnyt-articlesnyt-articlesnyt-articl...,phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4p...


In [101]:
df.groupby(["data", "is_human"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,num_sentences,num_words,num_chars,num_tokens,model
data,is_human,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
blogs,ai,46350724,93033,147816,5977502,1683878,Phi-4-mini-instructPhi-4-mini-instructPhi-4-mi...
blogs,human,1001385855,94281,96866,6095521,1750351,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
essays,ai,2533925,123203,93296,5776686,1621302,Phi-3-medium-128k-instructPhi-3-medium-128k-in...
essays,human,3101982,106476,41152,5545363,1551255,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
nyt-articles,ai,49530910,61688,171523,7909197,1719067,Phi-4-mini-instructPhi-4-mini-instructPhi-4-mi...
nyt-articles,human,95537477,56485,310025,8543251,1675267,phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4phi-4p...
