In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [14]:
df_main = pd.read_csv('../data/stats/data_stats_master.csv')
df_main["avg_sent_per_sample"] = df_main["num_sentences"] / df_main["num_samples"]
df_main

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens,avg_sent_per_sample
0,xsum,human,226394,4298208,5320921,416556574,105943034,18.985521
1,writingprompts,human,303140,13802625,4407474,721935184,209317891,45.53218
2,raid,human,138244,1808791,7760746,215280947,95664352,13.084047
3,tweets,human,640908,1048720,5909292,34207222,10630839,1.636303
4,reddit,human,655485,1817602,11207395,117002414,32581910,2.772912
5,nyt-comments,human,4223213,18713269,75762131,1418281599,367129360,4.43105
6,blogs,human,576774,8370715,11971512,560257945,165283569,14.512989
7,nyt-articles,human,15813,21318,316981,1759817,421260,1.348131
8,essays,human,2638,123010,67709,6702708,1910971,46.630023
9,blogs,Llama-3.2-1B-Instruct,384,6856,19799,613036,160203,17.854167


In [26]:
df_ai = df_main[df_main["model"] != "human"]
df_human = df_main[df_main["model"] == "human"]

In [27]:
df_ai["prob"] =  1 / df_ai["avg_sent_per_sample"] / (1 / df_ai["avg_sent_per_sample"]).sum()
df_human["prob"] = 1 / df_human["avg_sent_per_sample"] / (1 / df_human["avg_sent_per_sample"]).sum()

In [31]:
p1 = (df_human["avg_sent_per_sample"] * df_human["prob"]).sum()
p1

np.float64(4.127256449480798)

In [32]:
p2 = (df_ai["avg_sent_per_sample"] * df_ai["prob"]).sum()
p2

np.float64(4.89868399589739)

In [33]:
p_human = p2 / (p1 + p2)
p_human

np.float64(0.5427339151573732)

In [36]:
DATA_HUMAN_PATH = "../data/stats/data_human"
DATA_AI_PATH = "../data/stats/data_ai"
DATASET_IDX_PATH = "../data/datasets/test_idx.csv"
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [37]:
stats = dict({f"{path.split("/")[-1].split("_")[0]}_{path.split("/")[-1].split("_")[1]}": pd.read_csv(path) for path in paths})

In [43]:
MAX_TOKENS = 1e6
total_tokens = 0
total_sentences = 0
total_samples = 0
batch_size = 1

In [44]:
cnt = 0
while total_tokens < MAX_TOKENS:
    p = np.random.rand()
    if p < p_human:
        slct = df_human.sample(n=1, weights=df_human["prob"].values)
    else:
        slct = df_ai.sample(n=1, weights=df_ai["prob"].values)

    data, model = slct["data"].values[0], slct["model"].values[0]
    stat = stats[f"{data}_{model}"]

    # select batch_size random rows from stat and remove them
    slct = stat.sample(n=batch_size)
    #stat.drop(slct.index, inplace=True)

    total_tokens += slct.sum()["num_tokens"]
    total_sentences += slct.sum()["num_sentences"]
    total_samples += batch_size


    # save data, model, slct.index to csv
    slct["data"] = data
    slct["model"] = model
    slct.reset_index(inplace=True)
    # slct.drop(columns=["num_sentences", "num_words", "num_chars", "num_tokens"], inplace=True)
    slct.to_csv(DATASET_IDX_PATH, mode='a', header=not os.path.exists(DATASET_IDX_PATH), index=False)

    cnt += 1
    if cnt % 1000 == 0:
        print(f"total_tokens: {total_tokens}, total_sentences: {total_sentences}, total_samples: {total_samples}")

total_tokens: 94988, total_sentences: 4183, total_samples: 1000
total_tokens: 184221, total_sentences: 8390, total_samples: 2000
total_tokens: 278363, total_sentences: 12818, total_samples: 3000
total_tokens: 396388, total_sentences: 17753, total_samples: 4000
total_tokens: 492931, total_sentences: 22009, total_samples: 5000
total_tokens: 597229, total_sentences: 26920, total_samples: 6000
total_tokens: 682138, total_sentences: 30721, total_samples: 7000
total_tokens: 808641, total_sentences: 35853, total_samples: 8000
total_tokens: 908720, total_sentences: 40360, total_samples: 9000


In [45]:
df = pd.read_csv(DATASET_IDX_PATH)
df.head()

Unnamed: 0,index,num_sentences,num_words,num_chars,num_tokens,data,model
0,546485,1,11,43,13,tweets,human
1,18,1,24,122,45,xsum,Llama-3.2-1B-Instruct
2,6,1,11,54,11,tweets,Llama-3.2-1B-Instruct
3,4,2,5,120,34,tweets,Llama-3.2-1B-Instruct
4,4223,2,8,108,23,nyt-articles,human


In [46]:
df.groupby("model").sum()

Unnamed: 0_level_0,index,num_sentences,num_words,num_chars,num_tokens,data
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Llama-3.2-1B-Instruct,150721,20925,109330,2245076,519290,xsumtweetstweetsxsumnyt-commentsxsumnyt-commen...
human,2107856553,23467,95432,1675412,480716,tweetsnyt-articlesnyt-articlesnyt-commentsnyt-...


In [47]:
df.groupby(["model", "data"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,num_sentences,num_words,num_chars,num_tokens
model,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Llama-3.2-1B-Instruct,blogs,22035,2199,11831,227355,57895
Llama-3.2-1B-Instruct,essays,907,1925,1596,134353,35903
Llama-3.2-1B-Instruct,nyt-articles,84182,2258,15567,369606,77713
Llama-3.2-1B-Instruct,nyt-comments,6984,2406,15178,299829,64273
Llama-3.2-1B-Instruct,raid,2421,2180,5814,316334,61272
Llama-3.2-1B-Instruct,reddit,11525,2593,16816,212987,53010
Llama-3.2-1B-Instruct,tweets,14002,2424,19408,130063,36517
Llama-3.2-1B-Instruct,writingprompts,762,2305,1072,143694,39112
Llama-3.2-1B-Instruct,xsum,7903,2635,22048,410855,93595
human,blogs,50382974,2316,4612,163614,49361
