# Step 1: Setup and Imports

In [None]:
# Import necessary libraries
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK data (nltk reports "already up-to-date" and skips the fetch
# when a package is already installed)
nltk.download('punkt')  # For tokenization
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')  # For stopwords

# Input folder with the raw text files, and the folder that receives the
# preprocessed copies (both relative to the current working directory)
directory = "text_files"
preprocessed_directory = "preprocessed_files"


[nltk_data] Downloading package punkt to /Users/kanishk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kanishk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 2: Define Preprocessing Functions

In [None]:
def to_lowercase(text):
    """Return a copy of ``text`` with every cased character lowercased."""
    lowered = text.lower()
    return lowered

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Matching is an exact string comparison against the list entries, and the
    relative order of the surviving tokens is preserved.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def remove_punctuation(tokens):
    """Keep only the tokens made up entirely of alphabetic characters.

    This drops punctuation tokens, and also any token containing digits or
    other non-letter characters (e.g. ``"30"``, ``"400w"``, ``"n't"``).
    """
    alphabetic_tokens = []
    for token in tokens:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

def remove_blank_tokens(tokens):
    """Discard tokens that are empty or consist solely of whitespace.

    Surviving tokens are returned unchanged (they are not stripped) and in
    their original order.
    """
    non_blank = []
    for token in tokens:
        if token.strip() == '':
            continue
        non_blank.append(token)
    return non_blank


# Step 3: Process and Save Preprocessed Files

In [None]:
# Preprocess up to MAX_FILES entries from the input directory and save each
# result under preprocessed_directory. Only the first PREVIEW_COUNT files are
# printed (original vs. preprocessed) so the notebook output stays readable.
MAX_FILES = 999
PREVIEW_COUNT = 5

# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists before the loop starts writing into it.
os.makedirs(preprocessed_directory, exist_ok=True)

sample_files = os.listdir(directory)[:MAX_FILES]
count = 0  # number of files actually processed so far

for filename in sample_files:
    original_filepath = os.path.join(directory, filename)

    # os.listdir also returns sub-directories; opening one would raise, so
    # skip anything that is not a regular file.
    if not os.path.isfile(original_filepath):
        continue

    count += 1
    show_preview = count <= PREVIEW_COUNT

    # Read the whole file, then release the handle before doing any work.
    with open(original_filepath, 'r', encoding='utf-8') as file:
        original_text = file.read()

    if show_preview:
        print(f"Original text in {filename}:\n{original_text}\n")

    # a. Lowercase the text
    text_lower = to_lowercase(original_text)

    # b. Perform tokenization
    tokens = tokenize_text(text_lower)

    # c. Remove stopwords
    tokens_no_stopwords = remove_stopwords(tokens)

    # d. Remove punctuation (keeps purely alphabetic tokens only)
    tokens_no_punctuation = remove_punctuation(tokens_no_stopwords)

    # e. Remove blank space tokens
    final_tokens = remove_blank_tokens(tokens_no_punctuation)
    preprocessed_text = ' '.join(final_tokens)

    # Print preprocessed text for comparison
    if show_preview:
        print(f"Preprocessed text in {filename}:\n{preprocessed_text}\n")
        print("-" * 40)

    # Define the path for the preprocessed file
    preprocessed_filepath = os.path.join(preprocessed_directory, f"preprocessed_{filename}")

    # Save the preprocessed text to the new file in the preprocessed_files folder
    with open(preprocessed_filepath, 'w', encoding='utf-8') as outfile:
        outfile.write(preprocessed_text)


Original text in file502.txt:
Kit is awesome. I play in my garage just for personal enjoyment not for performances or anything. Once you take the time to break down all the settings, your able to dial in pretty much any kit and sound. With the expansion options and the relatively inexpensive parts expanding is easy and fun.

After a few weeks of daily use for at least an hour a day it still looks and plays beautifully. Overall one of the best purchases I could have made.

Preprocessed text in file502.txt:
kit awesome play garage personal enjoyment performances anything take time break settings able dial pretty much kit sound expansion options relatively inexpensive parts expanding easy fun weeks daily use least hour day still looks plays beautifully overall one best purchases could made

----------------------------------------
Original text in file264.txt:
I just tested this fog fluid with a 1byone 400W fogger. Two 30 second bursts were sufficient to create enough fog layers for a moo