In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [3]:
df_main = pd.read_csv('../data/stats/data_stats_master.csv')
df_main["avg_sent_per_sample"] = df_main["num_sentences"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_sent_per_sample
0,blogs,Phi-3-small-128k-instruct,28836,677403,4577975,57362602,15187889,23.491573
1,blogs,Llama-3.2-3B-Instruct,28836,447683,875182,42284112,10675655,15.525142
2,blogs,Qwen2.5-14B-Instruct,28836,330384,501842,24062475,6315173,11.457345
3,blogs,Ministral-8B-Instruct-2410,28836,742094,1352648,42635432,12117456,25.734984
4,blogs,Qwen2.5-3B-Instruct,28836,486262,1051199,31826561,9011862,16.863018
...,...,...,...,...,...,...,...,...
59,reddit,human,655485,1817602,11207395,117002414,32581910,2.772912
60,nyt-comments,human,4223213,18713269,75762131,1418281599,367129360,4.431050
61,blogs,human,576774,8370715,11971512,560257945,165283569,14.512989
62,nyt-articles,human,15813,21318,316981,1759817,421260,1.348131


In [21]:
def create_dataset_idx(max_tokens, batch_size, stats, df_main, col_c0, save_path):

    for ds in ["blogs", "essays", "nyt-articles"]:
        mask_c1 = (df_main["data"].values == ds) & (df_main["model"].values != col_c0)
        mask_c0 = (df_main["data"].values == ds) & (df_main["model"].values == col_c0)
        
        df_main.loc[mask_c1, "prob"] = 1 / df_main.loc[mask_c1, "avg_sent_per_sample"] / (1 / df_main.loc[mask_c1, "avg_sent_per_sample"]).sum()
        
        avg_h = df_main.loc[mask_c0, "avg_sent_per_sample"].values[0]
        avg_ai = (df_main.loc[mask_c1, "avg_sent_per_sample"] * df_main.loc[mask_c1, "prob"]).sum()
        
        c = 1 / (1 + avg_ai / avg_h)
        p = 1 - c
        
        df_main.loc[mask_c1, "prob"] *= c
        df_main.loc[mask_c0, "prob"] = p

    weights = [
        1 / (df_main.loc[df_main["data"] == ds, "avg_sent_per_sample"] * df_main.loc[df_main["data"] == ds, "prob"]).sum()
        for ds in ["blogs", "essays", "nyt-articles"]
    ]
    probs = np.array(weights) / np.sum(weights)

    total_tokens = 0
    total_sentences = 0
    total_samples = 0
    cnt = 0
    while total_tokens < max_tokens:
        data = np.random.choice(["blogs", "essays", "nyt-articles"], p=probs)
        tmp = df_main[(df_main["data"] == data)]
        model = np.random.choice(tmp["model"], p=tmp["prob"])

        stat = stats[f"{data}_{model}"]

        slct = stat.sample(n=batch_size)
        #stat.drop(slct.index, inplace=True)

        total_tokens += slct.sum()["num_tokens"]
        total_sentences += slct.sum()["num_sentences"]
        total_samples += batch_size


        # save data, model, slct.index to csv
        slct["data"] = data
        slct["model"] = model
        slct.reset_index(inplace=True)
        # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
        slct.to_csv(save_path, mode='a', header=not os.path.exists(save_path), index=False)

        cnt += 1
        if cnt % 1000 == 0:
            print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

    print(
        f"Final samples: {total_samples}, Final sentences: {total_sentences}, Final tokens: {total_tokens}"
    )


In [22]:
DATA_HUMAN_PATH = "../data/stats/data_human"
DATA_AI_PATH = "../data/stats/data_ai"
DATASET_IDX_PATH = "../data/datasets/test2_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [23]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})

In [24]:
MAX_TOKENS = 1e7
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 16

In [25]:
create_dataset_idx(
    max_tokens=MAX_TOKENS,
    batch_size=batch_size,
    stats=stats,
    df_main=df_main,
    col_c0="human",
    save_path=DATASET_IDX_PATH
)

total_tokens: 1867193, total_sentences: 96577, total_samples: 16000
total_tokens: 3463418, total_sentences: 173326, total_samples: 32000
total_tokens: 5268502, total_sentences: 265180, total_samples: 48000
total_tokens: 6809625, total_sentences: 340717, total_samples: 64000
total_tokens: 8534114, total_sentences: 422001, total_samples: 80000
Final samples: 94112, Final sentences: 493206, Final tokens: 10000161


In [26]:
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,1394,8,8,777,197,blogs,Falcon3-3B-Instruct
1,847,14,17,1186,364,blogs,Falcon3-3B-Instruct
2,19211,27,10,1205,344,blogs,Falcon3-3B-Instruct
3,1773,10,18,518,139,blogs,Falcon3-3B-Instruct
4,23495,4,20,223,54,blogs,Falcon3-3B-Instruct


In [27]:
df["is_human"] = np.where(df["model"] == "human", "human", "ai")

In [28]:
df.groupby("is_human").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data,model
is_human,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ai,470996738,554727,1796667,52491170,12307585,blogsblogsblogsblogsblogsblogsblogsblogsblogsb...,Falcon3-3B-InstructFalcon3-3B-InstructFalcon3-...
human,5035621795,569030,3204418,38721760,10458170,nyt-articlesnyt-articlesnyt-articlesnyt-articl...,humanhumanhumanhumanhumanhumanhumanhumanhumanh...


In [29]:
df.groupby(["data", "is_human"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,num_sentences,num_words,num_chars,num_tokens,model
data,is_human,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
blogs,ai,168388524,181304,480359,14032834,3706822,Falcon3-3B-InstructFalcon3-3B-InstructFalcon3-...
blogs,human,3914700825,202666,275540,13424880,3964243,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
essays,ai,6110246,184327,82931,9888205,2742713,Llama-3.1-8B-InstructLlama-3.1-8B-InstructLlam...
essays,human,4979156,176386,99466,9593609,2735457,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
nyt-articles,ai,296497968,189096,1233377,28570131,5858050,Meta-Llama-3.3-70B-Instruct-AWQ-INT4Meta-Llama...
nyt-articles,human,1115941814,189978,2829412,15703271,3758470,humanhumanhumanhumanhumanhumanhumanhumanhumanh...
