In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [3]:
df_main = pd.read_csv('../../data/stats/data_stats_master.csv')
df_main = df_main[df_main["data"] != "natural-questions"]
df_main = df_main[df_main["model"] != "gpt-4.1-nano-2025-04-14"]
df_main["avg_token_per_sample"] = df_main["num_tokens"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample
0,nyt-comments,human,4223213,18713462,75699056,1418028952,367081295,86.919910
1,blogs,human,576731,8328335,11967700,557323671,164358740,284.983363
2,raid,human,138244,1808789,7756169,215270586,95663743,691.992007
4,writingprompts,human,303140,13802625,4407470,721933659,209316368,690.494056
5,essays,human,2638,123010,67709,6702698,1910966,724.399545
...,...,...,...,...,...,...,...,...
190,blogs,Phi-3-medium-128k-instruct,28836,928790,2403937,70950486,18821223,652.698814
191,blogs,Qwen2-7B-Instruct,28836,447564,759965,41207063,10143808,351.775836
192,blogs,Qwen2-72B-Instruct-AWQ,28835,590847,561720,45952728,11875452,411.841581
193,blogs,Llama-3.1-8B-Instruct,28836,471158,895753,43924826,11125419,385.817000


In [4]:
DATA_HUMAN_PATH = "../../data/stats/data_human"
DATA_AI_PATH = "../../data/stats/data_ai"
DATASET_IDX_PATH = "../../data/datasets/test3_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [5]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})

In [6]:
max_tokens = 5e7
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 16
cols_c0 = ["human"]

In [29]:
for ds in ["blogs", "essays"]:
    df_main.loc[df_main["data"].values == ds, "prob"] =  (1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"].values) /  (1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"]).sum()
    mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
    mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))

    class0 = df_main[mask_c0]
    class1 = df_main[mask_c1]

    s1 = (class0["avg_token_per_sample"] * class0["prob"]).sum()
    s2 = (class1["avg_token_per_sample"] * class1["prob"]).sum()
    p1 = class0["prob"].sum()
    p2 = class1["prob"].sum()

    c1 = 1 / (s2 / s1 * p1 + p2)
    c0 = c1 * s2 / s1

    df_main.loc[mask_c0, "prob"] *= c0
    df_main.loc[mask_c1, "prob"] *= c1

weights = [
    (df_main.loc[df_main["data"] == ds, "num_tokens"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
    for ds in df_main["data"].unique()
]
probs = np.array(weights) / np.sum(weights)

# total_tokens = 0
# total_sentences = 0
# total_samples = 0
# cnt = 0
# while total_tokens < max_tokens:
#     data = np.random.choice(df_main["data"].unique(), p=probs)
#     tmp = df_main[(df_main["data"] == data)]
#     model = np.random.choice(tmp["model"], p=tmp["prob"])

#     stat = stats[f"{data}_{model}"]

#     slct = stat.sample(n=batch_size)
#     stat.drop(slct.index, inplace=True)

#     total_tokens += slct.sum()["num_tokens"]
#     total_sentences += slct.sum()["num_sentences"]
#     total_samples += batch_size


#     # save data, model, slct.index to csv
#     slct["data"] = data
#     slct["model"] = model
#     slct.reset_index(inplace=True)
#     # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
#     slct.to_csv(DATASET_IDX_PATH, mode='a', header=not os.path.exists(DATASET_IDX_PATH), index=False)

#     cnt += 1
#     if cnt % 1000 == 0:
#         print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

# print(
#     f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
# )


In [63]:
import pandas as pd
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,22550,34,20,2522,636,blogs,Llama-3.1-8B-Instruct
1,9473,15,26,1787,405,blogs,Llama-3.1-8B-Instruct
2,20600,11,26,1480,323,blogs,Llama-3.1-8B-Instruct
3,23943,6,78,1058,244,blogs,Llama-3.1-8B-Instruct
4,20857,22,24,1790,478,blogs,Llama-3.1-8B-Instruct


In [64]:
df["is_human"] = np.where(df["model"].isin(cols_c0), "human", "ai")

In [65]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,989425257,1245943,1864094,91922494,24886382,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,Llama-3.1-8B-InstructLlama-3.1-8B-InstructLlam...
human,24154483641,1286201,1777485,85321878,25115887,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,humanhumanhumanhumanhumanhumanhumanhumanhumanh...


In [74]:
df.groupby(["model"]).sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,is_human
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Falcon3-3B-Instruct,40449566,56955,43858,4635235,1226797,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Falcon3-7B-Instruct,68720793,66238,89868,4854747,1283953,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Llama-3.1-8B-Instruct,41411787,48073,89742,4495386,1152977,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Llama-3.2-3B-Instruct,43652666,46631,70223,4399048,1108830,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Meta-Llama-3.1-70B-Instruct-AWQ-INT4,63983411,61399,104453,5374332,1370300,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Meta-Llama-3.3-70B-Instruct-AWQ-INT4,59456762,53363,94822,4889710,1242738,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Ministral-8B-Instruct-2410,46685710,84620,91459,4686317,1337393,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Mistral-Nemo-Instruct-2407,77235804,72022,141751,4969484,1406950,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Phi-3-medium-128k-instruct,29849886,65417,128841,5067603,1349274,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Phi-3-mini-128k-instruct,25448181,90611,214916,5202073,1401653,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
