In [1]:
import os
import csv
import tiktoken
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
enc = tiktoken.get_encoding("o200k_base") #cl100k_base

In [3]:
def get_csv_paths(folder_path, recursive=False):
    """Collect the paths of entries ending in ``.csv`` under ``folder_path``.

    Args:
        folder_path: Directory to scan.
        recursive: When True, every subdirectory is visited with ``os.walk``;
            otherwise only the direct entries of ``folder_path`` are checked.

    Returns:
        A list of paths, each produced by ``os.path.join`` from the containing
        directory and the entry name. The order is whatever ``os.listdir`` /
        ``os.walk`` yields; no sorting is applied.
    """
    csv_paths = []

    if not recursive:
        # Top level only: keep the direct entries whose name ends in ".csv".
        for entry in os.listdir(folder_path):
            if entry.endswith('.csv'):
                csv_paths.append(os.path.join(folder_path, entry))
        return csv_paths

    # Recursive mode: visit folder_path and every directory below it.
    for current_dir, _subdirs, names in os.walk(folder_path):
        csv_paths.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.endswith('.csv')
        )
    return csv_paths

In [29]:
def calc_stats(texts):
    """Compute per-text and aggregate size statistics for a collection of texts.

    Every text is encoded with the notebook-level ``enc`` tokenizer, split into
    sentences with ``sent_tokenize``, and each sentence is split into words
    with ``word_tokenize``.

    Args:
        texts: Sized iterable of texts (``len(texts)`` is the sample count).

    Returns:
        A 6-tuple ``(results, total_samples, total_sentences, total_words,
        total_chars, total_tokens)``. ``results`` holds one
        ``[num_sentences, num_words, num_chars, num_tokens]`` row per text; the
        four totals are the column sums of those rows. Character counts are the
        summed lengths of the word tokens.
    """
    rows = []

    for text in tqdm(texts):
        token_count = len(enc.encode(text))
        sentences = sent_tokenize(text)

        word_count = 0
        char_count = 0
        for sentence in sentences:
            words = word_tokenize(sentence)
            word_count += len(words)
            char_count += sum(map(len, words))

        rows.append([len(sentences), word_count, char_count, token_count])

    # Column-wise sums of the per-text rows; all zeros when there were no texts.
    if rows:
        column_totals = [sum(column) for column in zip(*rows)]
    else:
        column_totals = [0, 0, 0, 0]

    return (rows, len(texts), *column_totals)

In [5]:
# Folder scanned (non-recursively) for the human-written CSV files.
DATA_HUMAN_PATH = "../data/data_human/"
# NOTE(review): not referenced in the visible cells; presumably the folder of
# AI-generated CSV files — confirm against the rest of the notebook.
DATA_AI_PATH = "../data/data_ai/"
# Root folder under which the per-file "<name>_stats.csv" files are written.
STATS_PATH = "../data/stats/"
# CSV file that receives one appended summary row per processed input file.
MASTER_STATS_PATH = "../data/stats/data_stats_master.csv"

In [6]:
# Non-recursive scan: only CSV files directly inside DATA_HUMAN_PATH are listed.
paths = get_csv_paths(DATA_HUMAN_PATH)
# NOTE(review): the list order comes from os.listdir and is not sorted, so the
# files that end up in the last three positions may differ between machines or
# runs — confirm before relying on this slice.
paths[-3:]

['../data/data_human/blogs_human.csv',
 '../data/data_human/nyt-articles_human.csv',
 '../data/data_human/essays_human.csv']

In [7]:
# Column names of the master stats file. The header row is written once, when
# the master file does not exist yet (or is empty); later runs only append.
MASTER_STATS_HEADER = ["data", "model", "num_samples", "num_sentences", "num_words", "num_chars", "num_tokens"]

# For each selected CSV: compute per-text statistics, save them next to the
# other stats files, and append one summary row to the master stats file.
# A failure on one file is reported and the loop continues with the next file.
for path in paths[-3:]:
    print(path)

    try:
        # Normalise before splitting so that paths produced by os.path.join /
        # os.walk are handled the same way on every platform.
        path_parts = os.path.normpath(path).split(os.sep)
        file_stem = os.path.splitext(path_parts[-1])[0]
        stats_name = file_stem + "_stats.csv"

        if file_stem.endswith("_human"):
            # Human files: <STATS_PATH>/<parent folder>/<name>_stats.csv
            stats_path = os.path.join(STATS_PATH, path_parts[-2], stats_name)
        else:
            # Other files keep two folder levels:
            # <STATS_PATH>/<grandparent>/<parent>/<name>_stats.csv
            stats_path = os.path.join(STATS_PATH, path_parts[-3], path_parts[-2], stats_name)

        # The model is the text after the LAST underscore (same convention as
        # the "_human" check above), so data names may contain underscores.
        # Parsed before the expensive computation so a badly named file fails fast.
        data_name, model = file_stem.rsplit("_", 1)

        df = pd.read_csv(path)
        texts = df["text"].values

        results, num_samples, num_sentences, num_words, num_chars, num_tokens = calc_stats(texts)

        # Create the target folder on first use; to_csv does not create it.
        os.makedirs(os.path.dirname(stats_path), exist_ok=True)
        results = pd.DataFrame(results, columns=["num_sentences", "num_words", "num_chars", "num_tokens"])
        results.to_csv(stats_path, index=False)

        master_dir = os.path.dirname(MASTER_STATS_PATH)
        if master_dir:
            os.makedirs(master_dir, exist_ok=True)
        needs_header = (
            not os.path.exists(MASTER_STATS_PATH)
            or os.path.getsize(MASTER_STATS_PATH) == 0
        )

        with open(MASTER_STATS_PATH, mode="a", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            if needs_header:
                writer.writerow(MASTER_STATS_HEADER)
            writer.writerow([data_name, model, num_samples, num_sentences, num_words, num_chars, num_tokens])
    except Exception as exc:
        # `Exception` (not a bare `except:`) so that interrupting the kernel
        # still stops the run; the cause is printed instead of being hidden.
        print(f"Error processing {path}: {type(exc).__name__}: {exc}")

../data/data_human/blogs_human.csv


100%|██████████| 576774/576774 [21:10<00:00, 454.02it/s] 


../data/data_human/nyt-articles_human.csv


100%|██████████| 15813/15813 [00:03<00:00, 4750.98it/s]


../data/data_human/essays_human.csv


100%|██████████| 2638/2638 [00:13<00:00, 191.18it/s]
