In [2]:
import os
import csv
import tiktoken
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
# Tokenizer shared by calc_stats for counting tokens. The trailing note names
# "cl100k_base" as the other encoding the author considered.
enc = tiktoken.get_encoding("o200k_base") #cl100k_base

In [4]:
def get_csv_paths(folder_path, recursive=False):
    """Collect the paths of all ``.csv`` files found under ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When true, descend into every subdirectory with
            ``os.walk``; otherwise only look at the entries directly inside
            ``folder_path``.

    Returns:
        A sorted list of paths whose file name ends with ``.csv``. Each path
        is the file name joined onto the directory it was found in.
        Directories are never returned, even if their name ends with ``.csv``.
    """
    if recursive:
        # os.walk already separates directories from the other entries,
        # so everything in ``files`` is a non-directory.
        candidates = (os.path.join(root, name)
                      for root, _, files in os.walk(folder_path)
                      for name in files)
    else:
        # os.listdir returns directories too; drop them so both modes agree.
        candidates = (os.path.join(folder_path, name)
                      for name in os.listdir(folder_path))
        candidates = (path for path in candidates if not os.path.isdir(path))

    # Directory listing order is arbitrary; sorting keeps downstream output
    # (one row per file) reproducible between runs and machines.
    return sorted(path for path in candidates if path.endswith('.csv'))

In [5]:
def calc_stats(texts):
    """Aggregate size statistics over a collection of text samples.

    Args:
        texts: Sized iterable of strings (``len`` is taken of it and it is
            iterated once).

    Returns:
        A tuple ``(num_samples, num_sentences, num_words, num_chars,
        num_tokens)`` where

        * ``num_samples`` is ``len(texts)``,
        * ``num_sentences`` is the total number of sentences returned by
          ``sent_tokenize``,
        * ``num_words`` is the total number of tokens returned by
          ``word_tokenize`` over those sentences,
        * ``num_chars`` is the summed length of those word tokens (text
          that is not part of a word token is not counted),
        * ``num_tokens`` is the total number of tokens produced by the
          module-level ``enc`` tokenizer.
    """
    num_samples = len(texts)
    num_sentences = 0
    num_words = 0
    num_chars = 0
    num_tokens = 0
    for text in tqdm(texts):
        # encode_ordinary treats special-token strings such as
        # "<|endoftext|>" as plain text; enc.encode() would raise ValueError
        # as soon as one sample happened to contain such a string.
        num_tokens += len(enc.encode_ordinary(text))
        sentences = sent_tokenize(text)
        num_sentences += len(sentences)
        for sentence in sentences:
            words = word_tokenize(sentence)
            num_words += len(words)
            num_chars += sum(len(word) for word in words)
    return num_samples, num_sentences, num_words, num_chars, num_tokens

In [6]:
# Input folder whose top-level CSV files are read (searched non-recursively).
DATA_HUMAN_PATH = "../data/data_human"
# Input folder whose CSV files are read from all subfolders (searched recursively).
DATA_AI_PATH = "../data/data_ai"
# Output CSV that receives one row of statistics per input file.
STATS_PATH = "../data/data_stats.csv"

In [7]:
# Every CSV to analyse: the human-data folder's top level first, followed by
# everything found recursively under the AI-data folder.
paths = [
    *get_csv_paths(DATA_HUMAN_PATH),
    *get_csv_paths(DATA_AI_PATH, recursive=True),
]

In [8]:
# Compute statistics for every input CSV and write them to STATS_PATH,
# one row per file, preceded by a header row.
with open(STATS_PATH, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["data", "model", "num_samples", "num_sentences", "num_words", "num_chars", "num_tokens"])

    for path in paths:
        df = pd.read_csv(path)
        texts = df["text"].values

        num_samples, num_sentences, num_words, num_chars, num_tokens = calc_stats(texts)

        # File names must look like "<data>_<model>.csv" with exactly one
        # underscore; anything else fails the two-way unpack below.
        # os.path.basename (rather than splitting on "/") also handles the
        # backslash-separated paths that os.path.join produces on Windows.
        file_name = os.path.basename(path).removesuffix(".csv")
        data_name, model = file_name.split("_")

        writer.writerow([data_name, model, num_samples, num_sentences, num_words, num_chars, num_tokens])
        # Processing a single file can take many minutes, so push each
        # finished row to disk right away instead of waiting for the end.
        file.flush()

100%|██████████| 226394/226394 [11:38<00:00, 323.96it/s]
100%|██████████| 303140/303140 [26:02<00:00, 193.96it/s]
100%|██████████| 138244/138244 [06:17<00:00, 366.54it/s]
100%|██████████| 640908/640908 [01:34<00:00, 6795.06it/s]
100%|██████████| 655485/655485 [03:59<00:00, 2738.57it/s]
100%|██████████| 4223213/4223213 [45:59<00:00, 1530.46it/s] 
100%|██████████| 576774/576774 [16:54<00:00, 568.74it/s] 
100%|██████████| 15813/15813 [00:02<00:00, 6377.01it/s]
100%|██████████| 2638/2638 [00:11<00:00, 233.23it/s]
100%|██████████| 384/384 [00:00<00:00, 443.82it/s]
100%|██████████| 384/384 [00:00<00:00, 1134.20it/s]
100%|██████████| 24/24 [00:00<00:00, 285.26it/s]
100%|██████████| 24/24 [00:00<00:00, 4104.85it/s]
100%|██████████| 24/24 [00:00<00:00, 1517.84it/s]
100%|██████████| 24/24 [00:00<00:00, 2699.91it/s]
100%|██████████| 24/24 [00:00<00:00, 1424.49it/s]
100%|██████████| 24/24 [00:00<00:00, 558.58it/s]
100%|██████████| 24/24 [00:00<00:00, 409.50it/s]
