In [1]:
import os
import csv
import tiktoken
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
enc = tiktoken.get_encoding("o200k_base") #cl100k_base

In [3]:
def get_csv_paths(folder_path, recursive=False):
    if recursive:
        # Walk through all subdirectories
        file_paths = [os.path.join(root, file) 
                      for root, _, files in os.walk(folder_path) 
                      for file in files if file.endswith('.csv')]
    else:
        # Get files in the root folder only
        file_paths = [os.path.join(folder_path, file) 
                      for file in os.listdir(folder_path) 
                      if file.endswith('.csv')]
    
    return file_paths

In [None]:
def calc_stats(texts):
    results = []
    total_sentences, total_words, total_chars, total_tokens = 0, 0, 0, 0
    total_samples = len(texts)

    for text in tqdm(texts):
        text_chars = 0
        text_tokens = enc.encode(text)
        sentences = sent_tokenize(text)
        for sentence in sentences:
            words = word_tokenize(sentence)
            words_only = [token for token in words if token.isalpha()]
            text_chars += sum([len(word) for word in words])

        total_sentences += len(sentences)
        total_words += len(words_only)
        total_chars += text_chars
        total_tokens += len(text_tokens)

        results.append([len(sentences), len(words), text_chars, len(text_tokens)])
    return results, total_samples, total_sentences, total_words, total_chars, total_tokens

In [5]:
DATA_HUMAN_PATH = "../data/data_human/"
DATA_AI_PATH = "../data/data_ai/blogs/"
STATS_PATH = "../data/stats/"
MASTER_STATS_PATH = "../data/stats/data_stats_master.csv"

In [6]:
paths = get_csv_paths(DATA_AI_PATH, recursive=True)
paths

['../data/data_ai/blogs/blogs_Phi-3-small-128k-instruct.csv',
 '../data/data_ai/blogs/blogs_Llama-3.2-3B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Qwen2.5-14B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Ministral-8B-Instruct-2410.csv',
 '../data/data_ai/blogs/blogs_Phi-3-medium-128k-instruct.csv',
 '../data/data_ai/blogs/blogs_Qwen2.5-3B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Meta-Llama-3.1-70B-Instruct-AWQ-INT4.csv',
 '../data/data_ai/blogs/blogs_Falcon3-3B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Phi-3-mini-128k-instruct.csv',
 '../data/data_ai/blogs/blogs_Falcon3-7B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Qwen2-7B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Qwen2-72B-Instruct-AWQ.csv',
 '../data/data_ai/blogs/blogs_Qwen2.5-72B-Instruct-AWQ.csv',
 '../data/data_ai/blogs/blogs_Llama-3.1-8B-Instruct.csv',
 '../data/data_ai/blogs/blogs_Meta-Llama-3.3-70B-Instruct-AWQ-INT4.csv',
 '../data/data_ai/blogs/blogs_Phi-3.5-mini-instruct.csv',
 '../data/data_ai/blogs/blogs_ph

In [7]:
# with open(MASTER_STATS_PATH, mode="w", newline="", encoding="utf-8") as file:
#     writer = csv.writer(file)
#     writer.writerow(["data", "model", "num_samples", "num_sentences", "num_words", "num_chars", "num_tokens"])

for path in paths:
    print(path)
    if path.split("_")[-1] == "human.csv":
        stats_path = os.path.join(STATS_PATH, path.split("/")[-2], path.split("/")[-1].replace(".csv", "_stats.csv"))
    else:
        stats_path = os.path.join(STATS_PATH, path.split("/")[-3], path.split("/")[-2], path.split("/")[-1].replace(".csv", "_stats.csv"))

    try:
        df = pd.read_csv(path)
        texts = df["text"].values

        results, num_samples, num_sentences, num_words, num_chars, num_tokens = calc_stats(texts)

        results = pd.DataFrame(results, columns=["num_sentences", "num_words", "num_chars", "num_tokens"])
        results.to_csv(stats_path, index=False)

        data_name, model = path.split("/")[-1].split("_")
        model = model.removesuffix(".csv")

        # with open(MASTER_STATS_PATH, mode="a", newline="", encoding="utf-8") as file:
        #     writer = csv.writer(file)
        #     writer.writerow([data_name, model, num_samples, num_sentences, num_words, num_chars, num_tokens])
    except:
        pass

../data/data_ai/blogs/blogs_Phi-3-small-128k-instruct.csv


100%|██████████| 28836/28836 [02:11<00:00, 219.48it/s]


../data/data_ai/blogs/blogs_Llama-3.2-3B-Instruct.csv


100%|██████████| 28836/28836 [01:03<00:00, 455.77it/s]


../data/data_ai/blogs/blogs_Qwen2.5-14B-Instruct.csv


100%|██████████| 28836/28836 [00:40<00:00, 705.73it/s]


../data/data_ai/blogs/blogs_Ministral-8B-Instruct-2410.csv


100%|██████████| 28836/28836 [01:17<00:00, 371.80it/s]


../data/data_ai/blogs/blogs_Phi-3-medium-128k-instruct.csv


100%|██████████| 19300/19300 [01:26<00:00, 222.38it/s]


../data/data_ai/blogs/blogs_Qwen2.5-3B-Instruct.csv


100%|██████████| 28836/28836 [01:09<00:00, 414.88it/s]


../data/data_ai/blogs/blogs_Meta-Llama-3.1-70B-Instruct-AWQ-INT4.csv


100%|██████████| 28836/28836 [00:48<00:00, 594.24it/s]


../data/data_ai/blogs/blogs_Falcon3-3B-Instruct.csv


100%|██████████| 28836/28836 [01:35<00:00, 302.91it/s]


../data/data_ai/blogs/blogs_Phi-3-mini-128k-instruct.csv


100%|█████████▉| 19260/19300 [07:59<00:00, 40.14it/s] 


../data/data_ai/blogs/blogs_Falcon3-7B-Instruct.csv


100%|██████████| 28836/28836 [01:25<00:00, 337.95it/s]


../data/data_ai/blogs/blogs_Qwen2-7B-Instruct.csv


100%|██████████| 28836/28836 [01:06<00:00, 435.65it/s]


../data/data_ai/blogs/blogs_Qwen2-72B-Instruct-AWQ.csv


100%|██████████| 28836/28836 [01:18<00:00, 365.22it/s]


../data/data_ai/blogs/blogs_Qwen2.5-72B-Instruct-AWQ.csv


100%|██████████| 28836/28836 [01:07<00:00, 424.81it/s] 


../data/data_ai/blogs/blogs_Llama-3.1-8B-Instruct.csv


100%|██████████| 28836/28836 [01:04<00:00, 446.63it/s]


../data/data_ai/blogs/blogs_Meta-Llama-3.3-70B-Instruct-AWQ-INT4.csv


100%|██████████| 28836/28836 [00:49<00:00, 582.78it/s] 


../data/data_ai/blogs/blogs_Phi-3.5-mini-instruct.csv


  8%|▊         | 1864/23232 [00:12<02:28, 144.31it/s]


../data/data_ai/blogs/blogs_phi-4.csv


100%|██████████| 28836/28836 [01:03<00:00, 457.44it/s]


../data/data_ai/blogs/blogs_Qwen2.5-7B-Instruct.csv


100%|██████████| 28836/28836 [00:50<00:00, 568.81it/s]


../data/data_ai/blogs/blogs_Mistral-Nemo-Instruct-2407.csv


100%|██████████| 28836/28836 [00:53<00:00, 538.61it/s]


In [27]:
df = pd.read_csv("../data/data_ai/nyt_articles/nyt-articles_Meta-Llama-3.1-70B-Instruct-AWQ-INT4.csv")
texts = df["text"].values

In [28]:
err = []
for i, text in enumerate(texts):
    try:
        text_tokens = enc.encode(text)
    except:
        err.append(i)

In [29]:
err

[]

In [26]:
df.drop(err, inplace=True)
df.to_csv("../data/data_ai/nyt_articles/nyt-articles_Meta-Llama-3.1-70B-Instruct-AWQ-INT4.csv", index=False)

In [8]:
df = pd.read_csv(MASTER_STATS_PATH)
df

Unnamed: 0,data,model,num_samples,num_sentences,num_words,num_chars,num_tokens
0,blogs,Phi-3-small-128k-instruct,28836,677403,4577975,57362602,15187889
1,blogs,Llama-3.2-3B-Instruct,28836,447683,875182,42284112,10675655
2,blogs,Qwen2.5-14B-Instruct,28836,330384,501842,24062475,6315173
3,blogs,Ministral-8B-Instruct-2410,28836,742094,1352648,42635432,12117456
4,blogs,Qwen2.5-3B-Instruct,28836,486262,1051199,31826561,9011862
...,...,...,...,...,...,...,...
59,reddit,human,655485,1817602,11207395,117002414,32581910
60,nyt-comments,human,4223213,18713269,75762131,1418281599,367129360
61,blogs,human,576774,8370715,11971512,560257945,165283569
62,nyt-articles,human,15813,21318,316981,1759817,421260
