In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [3]:
df_main = pd.read_csv('../../data/stats/data_stats_master.csv')
df_main["avg_token_per_sample"] = df_main["num_tokens"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample
0,nyt-comments,human,4223213,18713269,342590681,1418281599,367129360,86.931291
1,blogs,human,576731,8328325,150710195,557327652,164361476,284.988107
2,raid,human,138244,1808791,46559309,215280947,95664352,691.996412
3,natural-questions,human,231628,544546,12325923,52668992,14758408,63.715993
4,writingprompts,human,303140,13802625,196423260,721935184,209317891,690.499080
...,...,...,...,...,...,...,...,...
215,blogs,phi-4,28836,412861,6932437,27219979,7414709,257.133756
216,blogs,gpt-4.1-nano-2025-04-14,28836,388053,7112835,27563134,7398632,256.576224
217,blogs,Qwen2.5-7B-Instruct,28836,405020,6880299,26436051,7130797,247.288008
218,blogs,Mistral-Nemo-Instruct-2407,28836,370692,6602922,24712562,6961997,241.434214


In [4]:
DATA_HUMAN_PATH = "../../data/stats/data_human"
DATA_AI_PATH = "../../data/stats/data_ai"
DATASET_IDX_PATH = "../../data/datasets/test3_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [5]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})
for k, v in stats.items():
    stats[k] = v[v["num_tokens"] <= 8192]

In [6]:
from typing import Dict, List
def get_master_stats(stats: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    master_stats = {
        "data": [],
        "model": [],
        "num_samples": [],
        "num_sentences": [],
        "num_words": [],
        "num_chars": [],
        "num_tokens": [],
    }
    for k, v in stats.items():
        data, model = k.split("_")
        master_stats["data"].append(data)
        master_stats["model"].append(model)
        master_stats["num_samples"].append(len(v))
        for col in v.columns:
            master_stats[col].append(v[col].sum())
    df = pd.DataFrame(master_stats)
    return df


def calculate_probs(df_main: pd.DataFrame, cols_c0: List[str]) -> pd.DataFrame:
    df_main["avg_token_per_sample"] = df_main["num_tokens"] / df_main["num_samples"]

    for ds in df_main["data"].unique():
        df_main.loc[df_main["data"].values == ds, "prob"] = (
            1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"].values
        ) / (
            1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"]
        ).sum()
        mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
        mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))

        class0 = df_main[mask_c0]
        class1 = df_main[mask_c1]

        s1 = (class0["avg_token_per_sample"] * class0["prob"]).sum()
        s2 = (class1["avg_token_per_sample"] * class1["prob"]).sum()
        p1 = class0["prob"].sum()
        p2 = class1["prob"].sum()

        c1 = 1 / (s2 / s1 * p1 + p2)
        c0 = c1 * s2 / s1

        df_main.loc[mask_c0, "prob"] *= c0
        df_main.loc[mask_c1, "prob"] *= c1

    return df_main

In [7]:
df_main = get_master_stats(stats)
df_main = calculate_probs(df_main, ["human"])

In [8]:
df_main[df_main["data"] == "blogs"]

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample,prob
7,blogs,human,576632,8278264,149799988,553852318,163310829,283.214995,0.506773
10,blogs,Ministral-8B-Instruct-2410,28423,327060,5780431,21880227,5967332,209.947296,0.032554
11,blogs,Phi-3-small-128k-instruct,28556,486577,9732865,39712544,9995757,350.040517,0.019525
12,blogs,Qwen2-7B-Instruct,28817,442256,9554373,40416854,9864740,342.323628,0.019965
13,blogs,Qwen2.5-3B-Instruct,28691,369457,6297449,24656943,6610644,230.408281,0.029663
14,blogs,Phi-3-medium-128k-instruct,28374,626548,12211407,49949200,12643982,445.618594,0.015337
15,blogs,gpt-4.1-nano-2025-04-14,28831,387962,6981435,27314590,7234827,250.939163,0.027236
16,blogs,phi-4,28807,400179,6657379,25892557,6965826,241.810185,0.028264
17,blogs,Llama-3.2-3B-Instruct,28818,441771,10186381,41664330,10422019,361.649629,0.018898
18,blogs,Qwen2.5-72B-Instruct-AWQ,28818,414282,7389139,27489484,7602582,263.813658,0.025907


In [16]:
max_tokens = 5e7
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 16
cols_c0 = ["human"]

In [33]:
for ds in df_main["data"].unique():
    df_main.loc[df_main["data"].values == ds, "prob"] =  (1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"].values) /  (1 / df_main.loc[df_main["data"].values == ds, "avg_token_per_sample"]).sum()
    mask_c0 = (df_main["data"].values == ds) & (df_main["model"].isin(cols_c0))
    mask_c1 = (df_main["data"].values == ds) & (~df_main["model"].isin(cols_c0))

    class0 = df_main[mask_c0]
    class1 = df_main[mask_c1]

    s1 = (class0["avg_token_per_sample"] * class0["prob"]).sum()
    s2 = (class1["avg_token_per_sample"] * class1["prob"]).sum()
    p1 = class0["prob"].sum()
    p2 = class1["prob"].sum()

    c1 = 1 / (s2 / s1 * p1 + p2)
    c0 = c1 * s2 / s1

    df_main.loc[mask_c0, "prob"] *= c0
    df_main.loc[mask_c1, "prob"] *= c1

weights = [
    (df_main.loc[df_main["data"] == ds, "num_tokens"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
    for ds in df_main["data"].unique()
]
probs = np.array(weights) / np.sum(weights)

# total_tokens = 0
# total_sentences = 0
# total_samples = 0
# cnt = 0
# while total_tokens < max_tokens:
#     data = np.random.choice(df_main["data"].unique(), p=probs)
#     tmp = df_main[(df_main["data"] == data)]
#     model = np.random.choice(tmp["model"], p=tmp["prob"])

#     stat = stats[f"{data}_{model}"]

#     slct = stat.sample(n=batch_size)
#     stat.drop(slct.index, inplace=True)

#     total_tokens += slct.sum()["num_tokens"]
#     total_sentences += slct.sum()["num_sentences"]
#     total_samples += batch_size


#     # save data, model, slct.index to csv
#     slct["data"] = data
#     slct["model"] = model
#     slct.reset_index(inplace=True)
#     # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
#     slct.to_csv(DATASET_IDX_PATH, mode='a', header=not os.path.exists(DATASET_IDX_PATH), index=False)

#     cnt += 1
#     if cnt % 1000 == 0:
#         print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

# print(
#     f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
# )


In [35]:
df_tmp = df_main.groupby("data").sum().reset_index()
df_tmp

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample,prob
0,blogs,humanMinistral-8B-Instruct-2410Phi-3-small-128...,1178202,17831146,330865381,1278414439,351283690,6857.325439,1.0
1,essays,humanphi-4Ministral-8B-Instruct-2410gpt-4.1-na...,57578,2145711,33391899,119943300,32774134,12512.632963,1.0
2,natural-questions,humanQwen2-7B-InstructMeta-Llama-3.3-70B-Instr...,472560,1337769,28095003,120576958,33254831,1686.200278,1.0
3,nyt-articles,humanFalcon3-7B-InstructPhi-4-mini-instructQwe...,347831,2150590,59458545,303014797,63217304,3999.040963,1.0
4,nyt-comments,humanQwen2.5-14B-InstructPhi-3.5-mini-instruct...,8655021,34636096,674001938,2957694720,709465295,1709.175772,1.0
5,raid,humanPhi-3-small-128k-instructPhi-4-mini-instr...,862420,10457196,250151060,1144041635,309592286,7051.87586,1.0
6,reddit,humanPhi-3-mini-128k-instructMeta-Llama-3.3-70...,3398891,11007460,188317684,768644650,199528671,1328.587421,1.0
7,tweets,humanFalcon3-7B-InstructFalcon3-3B-InstructLla...,3663591,7641000,85125304,328381727,95481119,580.272067,1.0
8,writingprompts,humanQwen2-72B-Instruct-AWQQwen2.5-7B-Instruct...,621132,23004906,374931823,1447750598,394793174,12940.854513,1.0
9,xsum,humanQwen2.5-14B-InstructPhi-3-small-128k-inst...,939348,12779150,337633028,1570106286,358037119,7897.9742,1.0


In [36]:
df_tmp["weight"] = weights
df_tmp["probability"] = probs
df_tmp

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_token_per_sample,prob,weight,probability
0,blogs,humanMinistral-8B-Instruct-2410Phi-3-small-128...,1178202,17831146,330865381,1278414439,351283690,6857.325439,1.0,175843000.0,0.366148
1,essays,humanphi-4Ministral-8B-Instruct-2410gpt-4.1-na...,57578,2145711,33391899,119943300,32774134,12512.632963,1.0,719427.1,0.001498
2,natural-questions,humanQwen2-7B-InstructMeta-Llama-3.3-70B-Instr...,472560,1337769,28095003,120576958,33254831,1686.200278,1.0,33102850.0,0.068928
3,nyt-articles,humanFalcon3-7B-InstructPhi-4-mini-instructQwe...,347831,2150590,59458545,303014797,63217304,3999.040963,1.0,1649805.0,0.003435
4,nyt-comments,humanQwen2.5-14B-InstructPhi-3.5-mini-instruct...,8655021,34636096,674001938,2957694720,709465295,1709.175772,1.0,6393383.0,0.013313
5,raid,humanPhi-3-small-128k-instructPhi-4-mini-instr...,862420,10457196,250151060,1144041635,309592286,7051.87586,1.0,99572580.0,0.207334
6,reddit,humanPhi-3-mini-128k-instructMeta-Llama-3.3-70...,3398891,11007460,188317684,768644650,199528671,1328.587421,1.0,7530097.0,0.015679
7,tweets,humanFalcon3-7B-InstructFalcon3-3B-InstructLla...,3663591,7641000,85125304,328381727,95481119,580.272067,1.0,86873020.0,0.180891
8,writingprompts,humanQwen2-72B-Instruct-AWQQwen2.5-7B-Instruct...,621132,23004906,374931823,1447750598,394793174,12940.854513,1.0,20742110.0,0.04319
9,xsum,humanQwen2.5-14B-InstructPhi-3-small-128k-inst...,939348,12779150,337633028,1570106286,358037119,7897.9742,1.0,47825380.0,0.099584


In [37]:
df_tmp[["data", "num_tokens", "weight", "probability"]]

Unnamed: 0,data,num_tokens,weight,probability
0,blogs,351283690,175843000.0,0.366148
1,essays,32774134,719427.1,0.001498
2,natural-questions,33254831,33102850.0,0.068928
3,nyt-articles,63217304,1649805.0,0.003435
4,nyt-comments,709465295,6393383.0,0.013313
5,raid,309592286,99572580.0,0.207334
6,reddit,199528671,7530097.0,0.015679
7,tweets,95481119,86873020.0,0.180891
8,writingprompts,394793174,20742110.0,0.04319
9,xsum,358037119,47825380.0,0.099584


In [63]:
import pandas as pd
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,22550,34,20,2522,636,blogs,Llama-3.1-8B-Instruct
1,9473,15,26,1787,405,blogs,Llama-3.1-8B-Instruct
2,20600,11,26,1480,323,blogs,Llama-3.1-8B-Instruct
3,23943,6,78,1058,244,blogs,Llama-3.1-8B-Instruct
4,20857,22,24,1790,478,blogs,Llama-3.1-8B-Instruct


In [64]:
df["is_human"] = np.where(df["model"].isin(cols_c0), "human", "ai")

In [65]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,989425257,1245943,1864094,91922494,24886382,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,Llama-3.1-8B-InstructLlama-3.1-8B-InstructLlam...
human,24154483641,1286201,1777485,85321878,25115887,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,humanhumanhumanhumanhumanhumanhumanhumanhumanh...


In [74]:
df.groupby(["model"]).sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,is_human
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Falcon3-3B-Instruct,40449566,56955,43858,4635235,1226797,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Falcon3-7B-Instruct,68720793,66238,89868,4854747,1283953,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Llama-3.1-8B-Instruct,41411787,48073,89742,4495386,1152977,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Llama-3.2-3B-Instruct,43652666,46631,70223,4399048,1108830,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Meta-Llama-3.1-70B-Instruct-AWQ-INT4,63983411,61399,104453,5374332,1370300,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Meta-Llama-3.3-70B-Instruct-AWQ-INT4,59456762,53363,94822,4889710,1242738,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Ministral-8B-Instruct-2410,46685710,84620,91459,4686317,1337393,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Mistral-Nemo-Instruct-2407,77235804,72022,141751,4969484,1406950,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Phi-3-medium-128k-instruct,29849886,65417,128841,5067603,1349274,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
Phi-3-mini-128k-instruct,25448181,90611,214916,5202073,1401653,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,aiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiaiai...
