In [1]:
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
# Download required NLTK data

# Downloads a pre-trained model that helps split text into individual words and sentences
nltk.download('punkt')
nltk.download('punkt_tab')
# Downloads a list of common English words (like 'the', 'a', 'is') that usually don't carry significant meaning.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def text_cleaner(text):
    """
    Clean and tokenize text by:
    1. Removing punctuation
    2. Converting to lowercase
    3. Tokenizing into words
    """
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Tokenize into words
    tokens = word_tokenize(text)

    return tokens


In [17]:
def word_frequency_counter(text, remove_stopwords):
    """
    Count word frequencies in text, optionally removing stop words
    """
    # Clean and tokenize the text
    tokens = text_cleaner(text)

    # Remove stopwords if requested
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        #print(stop_words)
        tokens = [word for word in tokens if word not in stop_words]

    # Count word frequencies
    word_counts = Counter(tokens)

    return word_counts


In [18]:
def main():
    # Sample text - you can replace this with reading from a file
    sample_text = """
                  Deep into that darkness peering, long I stood there wondering, fearing,
                  Doubting, dreaming dreams no mortal ever dared to dream before;
                  But the silence was unbroken, and the stillness gave no token,
                  And the only word there spoken was the whispered word, ‘Lenore!’
                  """

    print("=== Text Cleaning and Tokenization ===")
    cleaned_tokens = text_cleaner(sample_text)
    print("Cleaned and tokenized text:")
    print(cleaned_tokens)  # Print first the tokens
    print("------------------------------------------\n")

    print("=== Word Frequency Count ===")
    print("With stopwords:")
    freq_with_stopwords = word_frequency_counter(sample_text, remove_stopwords=False)
    print(freq_with_stopwords.most_common(5))

    print("\nWithout stopwords:")
    freq_without_stopwords = word_frequency_counter(sample_text, remove_stopwords=True)
    print(freq_without_stopwords.most_common(5))

if __name__ == "__main__":
    main()

=== Text Cleaning and Tokenization ===
Cleaned and tokenized text:
['deep', 'into', 'that', 'darkness', 'peering', 'long', 'i', 'stood', 'there', 'wondering', 'fearing', 'doubting', 'dreaming', 'dreams', 'no', 'mortal', 'ever', 'dared', 'to', 'dream', 'before', 'but', 'the', 'silence', 'was', 'unbroken', 'and', 'the', 'stillness', 'gave', 'no', 'token', 'and', 'the', 'only', 'word', 'there', 'spoken', 'was', 'the', 'whispered', 'word', '‘', 'lenore', '’']
------------------------------------------

=== Word Frequency Count ===
With stopwords:
[('the', 4), ('there', 2), ('no', 2), ('was', 2), ('and', 2)]

Without stopwords:
[('word', 2), ('deep', 1), ('darkness', 1), ('peering', 1), ('long', 1)]
