In [1]:
import ast
import os

import pandas as pd
import tiktoken

In [2]:
def get_csv_paths(folder_path, recursive=False):
    """Return the paths of the ``.csv`` files found in ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When True, descend into every subdirectory via ``os.walk``;
            when False (default), look only at the direct children.

    Returns:
        A list of paths, each built by joining the containing directory with
        the entry name. The order is whatever the OS directory listing gives.

    Raises:
        OSError: In non-recursive mode, if ``folder_path`` cannot be listed.
            (``os.walk`` ignores listing errors by default, so the recursive
            mode returns an empty list instead.)
    """
    if recursive:
        # The third element yielded by os.walk holds non-directory entries only.
        return [
            os.path.join(root, name)
            for root, _, names in os.walk(folder_path)
            for name in names
            if name.endswith('.csv')
        ]

    # os.listdir also returns directories; skip them so that a folder named
    # "something.csv" is not reported, matching the recursive branch.
    candidates = (
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
    )
    return [path for path in candidates if not os.path.isdir(path)]

In [3]:
# Dataset root folders, given relative to the notebook's working directory.
# Of the two, only DATA_AI_PATH is referenced by the cells visible here.
DATA_HUMAN_PATH = "../data/data_human/"
DATA_AI_PATH = "../data/data_ai/"

In [4]:
# Collect every .csv file under DATA_AI_PATH, including those in subfolders.
paths = get_csv_paths(DATA_AI_PATH, recursive=True)

In [5]:
# Tokenizer used to check whether each text can be encoded.
enc = tiktoken.get_encoding("o200k_base")  # other encoding noted by the author: cl100k_base

In [None]:
def fix_text(text: str) -> str:
    """Unwrap a text that was stored as the repr of a one-element list.

    A value such as ``"['some text']"`` is turned into ``'some text'``. Any
    other value is returned unchanged: plain prose, lists with zero or several
    elements, and non-string values such as NaN.

    The string is parsed with ``ast.literal_eval`` rather than ``eval``, so
    CSV content can never execute code, and text that is not a valid Python
    literal (i.e. ordinary prose) is left alone instead of raising.

    Args:
        text: The raw cell value from the ``text`` column.

    Returns:
        The single list element if ``text`` is the literal of a one-element
        list, otherwise ``text`` itself.
    """
    if not isinstance(text, str):
        return text

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: this is ordinary text, keep it as is.
        return text

    if isinstance(parsed, list) and len(parsed) == 1:
        return parsed[0]
    return text

In [None]:
# For each CSV: find the texts the tokenizer cannot encode, list them, and
# (after confirmation) drop those rows, unwrap list-wrapped texts and
# overwrite the file in place.
for path in paths:
    print(f"Processing {path}...")
    df = pd.read_csv(path)
    texts = df['text'].tolist()
    print(f"Number of texts: {len(texts)}")

    # [position, text] for every entry that enc.encode rejects
    # (e.g. NaN produced by empty cells).
    err = []
    for i, text in enumerate(texts):
        try:
            enc.encode(text)
        except Exception:
            # Exception, not a bare except: interrupting the kernel must stop
            # the scan rather than be recorded as a bad row.
            err.append([i, text])

    if err:
        print(f"{len(err)} Errors in {path}:")
        for i, text in err:
            print(f"Index: {i}, Text: {text}")

        # get user input
        user_input = input(f"Do you want to remove the errors in {path}? (y/n): ")
        if user_input.lower() == "y":
            # `i` is a position in `texts`; map it to the frame's index labels
            # before dropping so the right rows are removed.
            bad_labels = df.index[[i for i, _ in err]]
            df = df.drop(index=bad_labels)
            df['text'] = df['text'].apply(fix_text)
            df.to_csv(path, index=False)
        else:
            print(f"Errors in {path} were not removed.")

Processing ../data/data_ai/blogs/blogs_Phi-4-mini-instruct.csv...
Number of texts: 28836
Processing ../data/data_ai/blogs/blogs_Meta-Llama-3.3-70B-Instruct-AWQ-INT4.csv...
Number of texts: 28836
Processing ../data/data_ai/blogs/blogs_Phi-3.5-mini-instruct.csv...
Number of texts: 28836
6 Errors in ../data/data_ai/blogs/blogs_Phi-3.5-mini-instruct.csv:
Index: 1864, Text: nan
Index: 1886, Text: nan
Index: 1888, Text: nan
Index: 1894, Text: nan
Index: 19211, Text: nan
Index: 19224, Text: nan
Processing ../data/data_ai/blogs/blogs_phi-4.csv...
Number of texts: 28836
Processing ../data/data_ai/blogs/blogs_Qwen2.5-7B-Instruct.csv...
Number of texts: 28836
Processing ../data/data_ai/blogs/blogs_Mistral-Nemo-Instruct-2407.csv...
Number of texts: 28836
Processing ../data/data_ai/nyt_articles/nyt-articles_Qwen2-72B-Instruct-AWQ.csv...
Number of texts: 15813
Processing ../data/data_ai/nyt_articles/nyt-articles_phi-4.csv...
Number of texts: 15813
Processing ../data/data_ai/nyt_articles/nyt-articles