In [26]:
import os
import tiktoken
import pandas as pd
from tqdm import tqdm

In [27]:
def get_csv_paths(folder_path, recursive=False):
    """Return the paths of the ``.csv`` files found in ``folder_path``.

    Args:
        folder_path: Directory to search.
        recursive: If True, descend into every subdirectory with ``os.walk``;
            otherwise only look at the entries directly inside ``folder_path``.

    Returns:
        A sorted list of paths (each one ``folder_path``/subfolder joined with
        the file name) whose name ends with ``'.csv'`` (case-sensitive).

    Raises:
        FileNotFoundError: In non-recursive mode, if ``folder_path`` does not
            exist (raised by ``os.listdir``). In recursive mode ``os.walk``
            silently yields nothing, so an empty list is returned instead.
    """
    if recursive:
        # os.walk reports directories separately from `files`, so a folder
        # named "*.csv" is never picked up here.
        file_paths = [os.path.join(root, file)
                      for root, _, files in os.walk(folder_path)
                      for file in files if file.endswith('.csv')]
    else:
        # os.listdir returns files AND directories; keep regular files only so
        # a directory named "*.csv" is not handed to a CSV reader later.
        candidates = (os.path.join(folder_path, file)
                      for file in os.listdir(folder_path)
                      if file.endswith('.csv'))
        file_paths = [path for path in candidates if os.path.isfile(path)]

    # Directory listing order is filesystem-dependent; sort for reproducible runs.
    return sorted(file_paths)

In [28]:
# Data folders, given relative to the notebook's working directory.
# NOTE(review): DATA_HUMAN_PATH is not referenced by any cell visible here — confirm it is still needed.
DATA_HUMAN_PATH = "../data/data_human/"
# Folder that is searched for CSV files by the cells below.
DATA_AI_PATH = "../data/data_ai/nyt_comments/"

In [29]:
# Collect every .csv under DATA_AI_PATH, including those in subfolders (recursive=True).
paths = get_csv_paths(DATA_AI_PATH, recursive=True)

In [30]:
# Tokenizer used by the processing loop to check that each text can be encoded.
enc = tiktoken.get_encoding("cl100k_base")

In [31]:
import re
def clean_newlines(text):
    """Collapse every run of three or more newlines in ``text`` down to two.

    Runs of one or two newlines are left untouched. Returns the new string.
    """
    excess_newlines = re.compile(r'\n{3,}')
    collapsed = excess_newlines.sub('\n\n', text)
    return collapsed

In [54]:
def clean_text(s: str) -> str:
    """
    Strip list/quote wrappers and normalise whitespace in a text value.

    Steps, in order:
      1. Remove one leading wrapper: '["', '[' or '"' (longest match wins).
      2. Remove one trailing wrapper: '"]', ']' or '"' (longest match wins).
      3. Delete every occurrence of two consecutive spaces.
      4. Strip leading/trailing whitespace.
      5. Collapse runs of three or more newlines into exactly two.

    Args:
        s: The text to clean. Non-string values (e.g. NaN from an empty CSV
            cell, or None) are returned unchanged.

    Returns:
        The cleaned string, or ``s`` itself when it is not a string.
    """
    # Pass non-strings through untouched instead of relying on an
    # AttributeError from the first str method call.
    if not isinstance(s, str):
        return s

    # Longest wrapper first: if '[' were tested before '["', the two-character
    # wrapper could never match and the quote would be left behind.
    for prefix in ('["', "[", '"'):
        if s.startswith(prefix):
            s = s[len(prefix) :]
            break

    for suffix in ('"]', "]", '"'):
        if s.endswith(suffix):
            s = s[: -len(suffix)]
            break

    # NOTE(review): this deletes double spaces outright (so "a  b" becomes
    # "ab") rather than collapsing them to one space — confirm that is intended.
    s = s.replace("  ", "")
    s = s.strip()
    s = re.sub(r'\n{3,}', '\n\n', s)

    return s

In [None]:
# For each CSV: clean the 'text' column, check that every text can be
# tokenized, optionally drop the rows that cannot, then write the result back
# to the same file (the input CSV is overwritten).
for path in paths:
    print(f"Processing {path}...")
    df = pd.read_csv(path)
    df['text'] = df['text'].apply(clean_text)
    texts = df['text'].tolist()
    print(f"Number of texts: {len(texts)}")

    # [row position, text] for every value the tokenizer rejects.
    err = []
    for i, text in enumerate(tqdm(texts)):
        try:
            enc.encode(text)
        except Exception:
            # Catch Exception rather than using a bare except so that a
            # kernel interrupt (KeyboardInterrupt) still stops the loop
            # instead of being recorded as a bad row.
            err.append([i, text])

    if err:
        print(f"{len(err)} Errors in {path}:")
        for i, text in err:
            print(f"Index: {i}, Text: {text}")

        # Ask before deleting anything; any answer other than "y" keeps the rows.
        user_input = input(f"Do you want to remove the errors in {path}? (y/n): ")
        if user_input.lower() == "y":
            # `i` is a position from enumerate(); translate it to index labels
            # so the drop stays correct regardless of the frame's index.
            df = df.drop(index=df.index[[i for i, _ in err]])
        else:
            print(f"Errors in {path} were not removed.")

    # Written even when no errors were found, so the cleaned text is persisted.
    df.to_csv(path, index=False)