In [1]:
import os
import tiktoken
import pandas as pd
from tqdm import tqdm

In [2]:
def get_csv_paths(folder_path, recursive=False):
    """
    Collect paths of entries whose name ends with '.csv' under ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: When true, walk the whole directory tree below
            ``folder_path``; otherwise look only at its direct entries.

    Returns:
        A list of paths, each built by joining the containing directory with
        the matching name. Order follows ``os.walk`` / ``os.listdir``.
    """
    if not recursive:
        # Direct children of the folder only.
        return [
            os.path.join(folder_path, entry)
            for entry in os.listdir(folder_path)
            if entry.endswith('.csv')
        ]

    # Descend into every subdirectory.
    collected = []
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if name.endswith('.csv'):
                collected.append(os.path.join(current_dir, name))
    return collected

In [3]:
# Data folder locations, given relative to the parent of the working directory.
DATA_HUMAN_PATH = "../data/data_human/"
DATA_AI_PATH = "../data/data_ai/reddit/"

In [4]:
# Every '.csv' path under DATA_AI_PATH, subdirectories included (recursive=True).
paths = get_csv_paths(DATA_AI_PATH, recursive=True)

In [5]:
# tiktoken encoding object for the "o200k_base" vocabulary.
enc = tiktoken.get_encoding("o200k_base")  # alternative encoding name: "cl100k_base"

In [None]:
def fix_text(s: str) -> str:
    """
    Strip one leading and one trailing list/quote wrapper from a string.

    Leading wrappers recognised: '["', '[' or '"'.
    Trailing wrappers recognised: '"]', ']' or '"'.
    At most one wrapper is removed from each end, and the longest matching
    one wins, so '["hello"]' becomes 'hello' rather than '"hello"'.

    Args:
        s: The text to clean. Values that are not ``str`` (for example the
            float NaN pandas uses for empty cells) are returned unchanged.

    Returns:
        The text without the matched wrappers, or ``s`` itself when it is not
        a string.
    """
    if not isinstance(s, str):
        return s

    # Longest candidate first: '[' would otherwise always shadow '["'.
    for prefix in ('["', '[', '"'):
        if s.startswith(prefix):
            s = s[len(prefix):]
            break

    # Same ordering rule for the tail: '"]' must be tested before ']'.
    for suffix in ('"]', ']', '"'):
        if s.endswith(suffix):
            s = s[:-len(suffix)]
            break

    return s

In [None]:
# For every CSV in `paths`: try to tokenize each value of the 'text' column
# with `enc`, report the rows that fail, optionally drop them after asking the
# user, then clean the column with `fix_text` and overwrite the file in place.
for path in paths:
    print(f"Processing {path}...")
    df = pd.read_csv(path)
    texts = df['text'].tolist()
    print(f"Number of texts: {len(texts)}")

    # (row position, text) for every entry the tokenizer rejects.
    err = []
    for i, text in enumerate(tqdm(texts)):
        try:
            enc.encode(text)
        except Exception:
            # `Exception` instead of a bare `except:` so that interrupting the
            # kernel (KeyboardInterrupt) stops the loop rather than being
            # recorded as a bad row.
            err.append((i, text))

    if err:
        print(f"{len(err)} Errors in {path}:")
        for bad_pos, bad_text in err:
            print(f"Index: {bad_pos}, Text: {bad_text}")

        # get user input
        user_input = input(f"Do you want to remove the errors in {path}? (y/n): ")
        if user_input.lower() == "y":
            # Recorded values are positions in `texts`; translate them to
            # index labels before dropping.
            df = df.drop(index=df.index[[bad_pos for bad_pos, _ in err]])
        else:
            print(f"Errors in {path} were not removed.")

    # Only strings are cleaned; non-string cells that were kept (e.g. NaN)
    # are passed through untouched so the save below is still reached.
    df['text'] = df['text'].apply(
        lambda value: fix_text(value) if isinstance(value, str) else value
    )
    df.to_csv(path, index=False)

Processing ../data/data_ai/reddit/reddit_Qwen2.5-72B-Instruct-AWQ.csv...
Number of texts: 131096


100%|██████████| 131096/131096 [00:44<00:00, 2965.84it/s] 


Processing ../data/data_ai/reddit/reddit_Qwen2.5-3B-Instruct.csv...
Number of texts: 131096


100%|██████████| 131096/131096 [00:31<00:00, 4189.43it/s]


Processing ../data/data_ai/reddit/reddit_Phi-3-mini-128k-instruct.csv...
Number of texts: 131096


100%|██████████| 131096/131096 [01:38<00:00, 1330.35it/s]


Processing ../data/data_ai/reddit/reddit_Falcon3-3B-Instruct.csv...
Number of texts: 131096


100%|██████████| 131096/131096 [00:57<00:00, 2271.24it/s]


Processing ../data/data_ai/reddit/reddit_Phi-3-small-128k-instruct.csv...
Number of texts: 131096


 21%|██        | 27642/131096 [2:13:37<10:55:01,  2.63it/s]