In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import concurrent.futures

In [2]:
def remove_quotes_from_file(path):
    """Remove every single-quote character (') from the file at ``path``, in place.

    The file is processed as raw bytes instead of decoded text. In UTF-8 the
    byte 0x27 can only be the ASCII apostrophe (all bytes of multi-byte
    sequences are >= 0x80), so removing that byte gives the same result as a
    text-level replace for valid UTF-8 input. Unlike decoding with
    ``errors='replace'``, it leaves undecodable bytes and the original line
    endings exactly as they were instead of writing U+FFFD / normalised
    newlines back to disk.

    Args:
        path: Path of the file to clean.

    Returns:
        None. Progress and failures are reported with ``print``; ordinary
        exceptions (``Exception`` subclasses) are caught here and not
        re-raised.
    """
    try:
        # Indicate that processing of the file has started
        print(f"Processing file '{path}'...")

        # Read the untouched bytes; no decoding means no lossy replacement
        with open(path, 'rb') as file:
            raw = file.read()

        cleaned = raw.replace(b"'", b"")

        # Only rewrite the file when at least one quote was actually removed
        if cleaned != raw:
            with open(path, 'wb') as file:
                file.write(cleaned)

        # Indicate successful cleaning
        print(f"File '{path}' has been cleaned.")

    except Exception as e:
        # Best-effort: report the failure and return normally
        print(f"An error occurred: {e} for file {path}")

In [3]:
# Function to process files in parallel
def process_files_in_parallel(folder_path, max_workers=None):
    """Run ``remove_quotes_from_file`` on every ``.txt`` file in a folder via a thread pool.

    Only regular files directly inside ``folder_path`` whose names end in
    ``.txt`` are processed (no recursion; a directory that happens to be
    named ``*.txt`` is skipped). Files are submitted in sorted name order so
    repeated runs queue the work in the same sequence.

    Args:
        folder_path: Directory to scan for ``.txt`` files.
        max_workers: Optional upper bound on worker threads, forwarded to
            ``ThreadPoolExecutor``. ``None`` keeps the executor's default.

    Returns:
        None.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
        Exception: Any exception raised by ``remove_quotes_from_file`` in a
            worker thread is re-raised here instead of being discarded.
    """
    # Collect the .txt files in the folder (regular files only, stable order)
    file_paths = []
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".txt") and os.path.isfile(file_path):
            file_paths.append(file_path)

    # Indicate that parallel processing is starting
    print(f"Starting parallel processing for {len(file_paths)} files...")

    # Use ThreadPoolExecutor to process files in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved, so drain the iterator to avoid silently losing failures.
        for _ in executor.map(remove_quotes_from_file, file_paths):
            pass

    # Indicate that processing is complete
    print("Parallel processing complete.")


In [4]:
# Target the notebook's current working directory.
folder_path = "."
# WARNING: this strips single quotes from every .txt file found there and rewrites each file in place.
process_files_in_parallel(folder_path)  # this is to clean the files

Starting parallel processing for 101 files...
Processing file '.\page_1.txt'...
Processing file '.\page_10.txt'...
Processing file '.\page_100.txt'...
Processing file '.\page_101.txt'...
Processing file '.\page_11.txt'...
Processing file '.\page_12.txt'...
Processing file '.\page_13.txt'...
Processing file '.\page_14.txt'...
File '.\page_1.txt' has been cleaned.
Processing file '.\page_15.txt'...
Processing file '.\page_16.txt'...
Processing file '.\page_17.txt'...
Processing file '.\page_18.txt'...
Processing file '.\page_19.txt'...
Processing file '.\page_2.txt'...
Processing file '.\page_20.txt'...
Processing file '.\page_21.txt'...
Processing file '.\page_22.txt'...
File '.\page_11.txt' has been cleaned.File '.\page_15.txt' has been cleaned.
Processing file '.\page_23.txt'...
File '.\page_10.txt' has been cleaned.
Processing file '.\page_24.txt'...

Processing file '.\page_25.txt'...
File '.\page_16.txt' has been cleaned.
Processing file '.\page_26.txt'...
File '.\page_101.txt' has

In [5]:
# Old code (processes the files manually, one at a time, and works)
#for file_name in os.listdir("."):                                         
 #   if file_name.endswith(".txt"):
  #      file_number = file_name.split('_')[1].split('.')[0]
   #     remove_quotes_from_file(file_name)
    #    print(f"File '{file_name}' has been cleaned.")