In [1]:
import os
import csv
import tiktoken
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Shared tiktoken encoder; calc_stats uses it to count tokens per text.
# "cl100k_base" was noted here as an alternative encoding name.
enc = tiktoken.get_encoding("o200k_base")

In [3]:
def get_csv_paths(folder_path, recursive=False):
    """Collect the paths of the ``.csv`` files found under ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When True, descend into every subdirectory with
            ``os.walk``; when False, inspect only the direct children of
            ``folder_path``.

    Returns:
        A sorted list of paths, each built by joining the containing
        directory with the file name. Only names ending in ``'.csv'``
        (case-sensitive) are kept.

    Raises:
        OSError: In the non-recursive case, if ``folder_path`` cannot be
            listed (propagated from ``os.listdir``).
    """
    if recursive:
        # os.walk's third element lists non-directory entries only.
        file_paths = [os.path.join(root, name)
                      for root, _, names in os.walk(folder_path)
                      for name in names if name.endswith('.csv')]
    else:
        candidates = (os.path.join(folder_path, name)
                      for name in os.listdir(folder_path)
                      if name.endswith('.csv'))
        # os.listdir also returns directories; drop them so both branches
        # agree that only files are reported.
        file_paths = [candidate for candidate in candidates
                      if os.path.isfile(candidate)]

    # Neither os.walk nor os.listdir guarantees an order; sort so repeated
    # runs process the files in the same sequence.
    return sorted(file_paths)

In [4]:
def calc_stats(texts):
    """Compute per-text and aggregate size statistics for a set of texts.

    For every text the function counts:
      * sentences, as returned by ``sent_tokenize``;
      * words, as the tokens returned by ``word_tokenize`` summed over
        all sentences of the text;
      * characters, as the summed length of those word tokens (whitespace
        between tokens is therefore not counted);
      * tokens, as the length of ``enc.encode(text)``.

    Args:
        texts: A sized iterable of strings (``len(texts)`` is used for the
            sample count).

    Returns:
        A tuple ``(results, total_samples, total_sentences, total_words,
        total_chars, total_tokens)`` where ``results`` holds one
        ``[num_sentences, num_words, num_chars, num_tokens]`` list per text,
        in input order, and the remaining values are the totals.
    """
    results = []
    total_sentences, total_words, total_chars, total_tokens = 0, 0, 0, 0
    total_samples = len(texts)

    for text in tqdm(texts):
        num_tokens = len(enc.encode(text))
        sentences = sent_tokenize(text)

        # Accumulate words and characters across *all* sentences of the text.
        # Counting len(words) after the loop would only see the last sentence
        # (and would reuse a stale value when a text has no sentences).
        text_words, text_chars = 0, 0
        for sentence in sentences:
            words = word_tokenize(sentence)
            text_words += len(words)
            text_chars += sum(len(word) for word in words)

        total_sentences += len(sentences)
        total_words += text_words
        total_chars += text_chars
        total_tokens += num_tokens

        results.append([len(sentences), text_words, text_chars, num_tokens])
    return results, total_samples, total_sentences, total_words, total_chars, total_tokens

In [5]:
# Input folders (relative paths) that are scanned for .csv files.
DATA_HUMAN_PATH = "../data/data_human"
DATA_AI_PATH = "../data/data_ai"
# Output locations: one per-file stats CSV is written under STATS_PATH, and
# one summary row per input file is written to MASTER_STATS_PATH.
STATS_PATH = "../data/stats/"
MASTER_STATS_PATH = "../data/stats/data_stats_master.csv"

In [6]:
# Human data: only the top level of its folder is scanned; AI data: the
# folder is searched recursively, so files in nested subfolders are included.
paths = get_csv_paths(DATA_HUMAN_PATH) + get_csv_paths(DATA_AI_PATH, recursive=True)

In [None]:
# Make sure the folder for the master file exists before opening it.
master_dir = os.path.dirname(MASTER_STATS_PATH)
if master_dir:
    os.makedirs(master_dir, exist_ok=True)

# Start a fresh master file holding only the header. One summary row per
# dataset is appended below as soon as that dataset has been processed, so
# finished rows are already on disk if a later dataset fails.
with open(MASTER_STATS_PATH, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["data", "model", "num_samples", "num_sentences", "num_words", "num_chars", "num_tokens"])

for path in paths:
    # Split on the platform separator (after normalising) rather than a
    # literal "/", because the paths were built with os.path.join.
    path_parts = os.path.normpath(path).split(os.sep)
    file_name = path_parts[-1]
    stats_name = file_name.replace(".csv", "_stats.csv")

    if file_name.endswith("_human.csv"):
        # Human files: mirror the immediate parent folder under STATS_PATH.
        stats_path = os.path.join(STATS_PATH, path_parts[-2], stats_name)
    else:
        # Other files: mirror the two enclosing folder levels.
        stats_path = os.path.join(STATS_PATH, path_parts[-3], path_parts[-2], stats_name)

    # Create the output folder up front so a missing directory cannot fail
    # the write after the (potentially very long) statistics pass.
    os.makedirs(os.path.dirname(stats_path), exist_ok=True)

    df = pd.read_csv(path)
    texts = df["text"].values

    results, num_samples, num_sentences, num_words, num_chars, num_tokens = calc_stats(texts)

    # Per-text statistics, one row per input text.
    results = pd.DataFrame(results, columns=["num_sentences", "num_words", "num_chars", "num_tokens"])
    results.to_csv(stats_path, index=False)

    # File names must have the form "<data>_<model>.csv" with exactly one
    # underscore; any other shape raises ValueError here rather than
    # silently producing a mislabelled row.
    data_name, model = file_name.split("_")
    model = model.removesuffix(".csv")

    with open(MASTER_STATS_PATH, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([data_name, model, num_samples, num_sentences, num_words, num_chars, num_tokens])

100%|██████████| 226394/226394 [11:06<00:00, 339.45it/s]
100%|██████████| 303140/303140 [25:25<00:00, 198.69it/s]
100%|██████████| 138244/138244 [05:01<00:00, 458.51it/s]
100%|██████████| 640908/640908 [01:19<00:00, 8078.73it/s]
100%|██████████| 655485/655485 [03:15<00:00, 3351.28it/s]
100%|██████████| 4223213/4223213 [38:04<00:00, 1848.32it/s]
100%|██████████| 576774/576774 [16:28<00:00, 583.53it/s] 
100%|██████████| 15813/15813 [00:02<00:00, 6377.34it/s]
100%|██████████| 2638/2638 [00:11<00:00, 239.23it/s]
100%|██████████| 384/384 [00:00<00:00, 440.23it/s]
100%|██████████| 384/384 [00:00<00:00, 1121.78it/s]
100%|██████████| 24/24 [00:00<00:00, 309.37it/s]
100%|██████████| 24/24 [00:00<00:00, 4271.55it/s]
100%|██████████| 24/24 [00:00<00:00, 1730.53it/s]
100%|██████████| 24/24 [00:00<00:00, 3162.53it/s]
100%|██████████| 24/24 [00:00<00:00, 1447.19it/s]
100%|██████████| 24/24 [00:00<00:00, 588.73it/s]
100%|██████████| 24/24 [00:00<00:00, 408.58it/s]
