In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alice29-text-dataset/alice29.txt


In [2]:
with open('/kaggle/input/alice29-text-dataset/alice29.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# NLTK

In [3]:
# Install nltk
!pip install nltk



In [4]:
import nltk
# Download required NLTK resources for tokenization and stopword handling
nltk.download('punkt_tab')  # Punkt tokenizer for splitting sentences
nltk.download('stopwords')  # Stopwords list for filtering common words

# Import libraries for tokenization, stopword handling, frequency analysis, and text processing
from nltk.tokenize import word_tokenize, sent_tokenize  # Functions for tokenizing words and sentences
from nltk.corpus import stopwords  # Stopwords to filter out common words
from collections import Counter  # To count word frequencies
import re  # Regular expressions for text cleaning
import time  # To measure execution time

# Start the timer to measure runtime
start_time = time.time()

# Function to clean the text data
def clean_text(text):
    text = text.lower()  # Convert all text to lowercase 
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation marks
    text = re.sub(r'\d+', '', text)  # Remove numerical digits
    return text

cleaned_text = clean_text(text)  # Apply the function to the input text

# Save the cleaned text to a file
output_cleaned_file = '/kaggle/working/cleaned_nltk.txt'  # File path for saving the cleaned text
with open(output_cleaned_file, 'w') as f:
    f.write(cleaned_text)  # Write the cleaned text to the file

# Tokenize the text into sentences
sentences = sent_tokenize(text)  # Split the text into individual sentences

# Save tokenized sentences to a file
output_words_sentences_file = '/kaggle/working/sentences_nltk.txt'  # File path for saving tokenized sentences
with open(output_words_sentences_file, 'w') as f:
    f.write("Tokenized Sentences:\n") 
    for sentence in sentences:
        f.write(sentence + "\n")  # Write each sentence to a new line

# Tokenize the cleaned text into words
words = word_tokenize(cleaned_text)  # Split the cleaned text into individual words

# Remove stopwords from the tokenized words
stop_words = set(stopwords.words('english'))  # Load the list of stopwords
filtered_words = [word for word in words if word not in stop_words]  # Filter out stopwords

# Save the tokenized and filtered words to a file
output_words_file = '/kaggle/working/words_nltk.txt'  # File path for saving tokenized words
with open(output_words_file, 'w') as f:
    f.write('\n'.join(filtered_words))  # Write each word to a new line

# Perform frequency analysis to find the top 10 most common words
word_freq = Counter(filtered_words)  # Count occurrences of each word
top_10_words = word_freq.most_common(10)  # Get the top 10 most frequent words

# Convert the top 10 words and their frequencies to a DataFrame
top_words_df = pd.DataFrame(top_10_words, columns=["Word", "Frequency"])

# Save the DataFrame as a CSV file for tabular output
output_top_words_file = '/kaggle/working/top10words_nltk.csv'  # File path for saving the top 10 words
top_words_df.to_csv(output_top_words_file, index=False)  # Save DataFrame to a CSV file without the index

# Measure the total runtime for the process
end_time = time.time()  # Record the end time
nltk_runtime = end_time - start_time  # Calculate the elapsed time

# Save the runtime to a file
output_runtime_file = '/kaggle/working/time_compares_nltk.txt'  # File path for saving runtime data
with open(output_runtime_file, 'w') as f:
    f.write(f"NLTK Runtime: {nltk_runtime:.4f} seconds\n")  # Write the runtime to the file

# Print output details for verification
print("Cleaned text saved to 'cleaned_nltk.txt'")  
print("Tokenized sentences saved to 'sentences_nltk.txt'")  
print("Tokenized words saved to 'words_nltk.txt'")  
print("Top 10 words saved to 'top10words_nltk.txt'") 
print("Runtime saved to 'time_compares_nltk.txt'") 
print(f"NLTK Runtime: {nltk_runtime:.4f} seconds")  # Print the total runtime
print("Top 10 most common words:")  # Print the header for the top 10 words
# Print the DataFrame as a table in the console
print(top_words_df)



[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Cleaned text saved to 'cleaned_nltk.txt'
Tokenized sentences saved to 'sentences_nltk.txt'
Tokenized words saved to 'words_nltk.txt'
Top 10 words saved to 'top10words_nltk.txt'
Runtime saved to 'time_compares_nltk.txt'
NLTK Runtime: 0.1698 seconds
Top 10 most common words:
      Word  Frequency
0     said        462
1    alice        385
2   little        128
3      one        101
4     know         86
5     like         85
6    would         83
7     went         83
8    could         77
9  thought         74


# textBlob

In [5]:
# Install TextBlob
!pip install textblob

Collecting nltk>=3.8 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.9.1


In [6]:
# Import necessary libraries
from textblob import TextBlob  # For text processing, tokenization, and analysis
from collections import Counter  # To perform word frequency analysis
import re  # For text cleaning using regular expressions
import time  # To measure execution time

# Start the timer to measure runtime
start_time = time.time()

# Clean the text data
def clean_text(text):
    text = text.lower()  # Convert the text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation marks
    text = re.sub(r'\d+', '', text)  # Remove numeric digits
    return text

cleaned_text = clean_text(text)  # Apply the function to the input text

# Save the cleaned text to a file
output_cleaned_file = '/kaggle/working/cleaned_textBlob.txt'  # File path to save the cleaned text
with open(output_cleaned_file, 'w') as f:
    f.write(cleaned_text)  # Write the cleaned text to the file

# Tokenize sentences and words using TextBlob
blob = TextBlob(text)  # Create a TextBlob object from the input text

# Sentence tokenization
sentences = blob.sentences  # Tokenize the text into sentences

# Save tokenized sentences to a file
output_words_sentences_file = '/kaggle/working/sentences_textBlob.txt'  # File path for saving tokenized sentences
with open(output_words_sentences_file, 'w') as f:
    f.write("Tokenized Sentences:\n")  
    for sentence in sentences:
        f.write(str(sentence) + "\n")  # Write each sentence on a new line

# Tokenize the cleaned text into words
blob = TextBlob(cleaned_text)  # Create a TextBlob object from the cleaned text
words = blob.words  # Extract words from the cleaned text

# Remove stopwords manually
stop_words = set(["the", "and", "is", "in", "to", "of", "a", "that", "it", "on", "with", "for", "as", "this", "was", "by"])  # Define a custom list of stopwords
filtered_words = [word for word in words if word not in stop_words]  # Remove stopwords from the tokenized words

# Save tokenized words to a file
output_words_file = '/kaggle/working/words_textBlob.txt'  # File path for saving tokenized words
with open(output_words_file, 'w') as f:
    f.write('\n'.join(filtered_words))  # Write each word to a new line

# Perform frequency analysis for the top 10 words
word_freq = Counter(filtered_words)  # Count occurrences of each word
top_10_words = word_freq.most_common(10)  # Extract the top 10 most frequent words

# Convert the top 10 words and their frequencies to a DataFrame
top_words_df = pd.DataFrame(top_10_words, columns=["Word", "Frequency"])

# Save the DataFrame as a CSV file for tabular output
output_top_words_file = '/kaggle/working/top10words_textBlob.csv'  # File path for saving the top 10 words
top_words_df.to_csv(output_top_words_file, index=False)  # Save DataFrame to a CSV file without the index

# Stop the timer to measure runtime
end_time = time.time()  # Record the end time
textblob_runtime = end_time - start_time  # Calculate the total runtime

# Save runtime to a file
output_runtime_file = '/kaggle/working/time_compares_textBlob.txt'  # File path for saving the runtime
with open(output_runtime_file, 'w') as f:
    f.write(f"TextBlob Runtime: {textblob_runtime:.4f} seconds\n")  # Write the runtime in seconds

# Print output details for verification
print("Cleaned text saved to 'cleaned_textBlob.txt'")  
print("Tokenized words saved to 'words_textBlob.txt'")  
print("Tokenized sentences and words saved to 'words_sentences_textBlob.txt'")  
print("Top 10 words saved to 'top10words_textBlob.txt'")  
print("Runtime saved to 'time_compares_textBlob.txt'") 
print(f"TextBlob Runtime: {textblob_runtime:.4f} seconds")  # Print the total runtime
print("Top 10 most common words:")  # Print the header for the top 10 words
# Print the DataFrame as a table in the console
print(top_words_df)

Cleaned text saved to 'cleaned_textBlob.txt'
Tokenized words saved to 'words_textBlob.txt'
Tokenized sentences and words saved to 'words_sentences_textBlob.txt'
Top 10 words saved to 'top10words_textBlob.txt'
Runtime saved to 'time_compares_textBlob.txt'
TextBlob Runtime: 0.3492 seconds
Top 10 most common words:
    Word  Frequency
0    she        536
1   said        462
2      i        401
3  alice        385
4    you        362
5    her        247
6     at        209
7    all        180
8    had        178
9    but        166


# spaCy

In [7]:
# Install spaCy and download the English model
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.46.0 requires dill<

In [8]:
# Import necessary libraries
import spacy  # spaCy for advanced NLP tasks including tokenization
from collections import Counter  # To perform word frequency analysis
import re  # For text cleaning using regular expressions
import time  # To measure execution time

# Load spaCy model
nlp = spacy.load("en_core_web_sm")  # Load the English language model for spaCy

# Clean the text data
def clean_text(text):
    text = text.lower()  # Convert the text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation marks
    text = re.sub(r'\d+', '', text)  # Remove numerical digits
    return text

cleaned_text = clean_text(text)  # Apply the cleaning function to the input text

# Save the cleaned text to a file
output_cleaned_file = '/kaggle/working/cleaned_spacy.txt'  # File path for saving the cleaned text
with open(output_cleaned_file, 'w') as f:
    f.write(cleaned_text)  # Write the cleaned text to the file

# Start the timer to measure runtime for spaCy
start_time = time.time()

# Tokenize sentences using spaCy
doc = nlp(text)  # Process the input text with spaCy

# Sentence tokenization
sentences = [sent.text for sent in doc.sents]  # Extract sentences from the processed text

# Save tokenized sentences to a file
output_words_sentences_file = '/kaggle/working/sentences_spacy.txt'  # File path for saving tokenized sentences
with open(output_words_sentences_file, 'w') as f:
    f.write("Tokenized Sentences:\n") 
    for sentence in sentences:
        f.write(sentence + "\n")  # Write each sentence on a new line

# Tokenize the cleaned text into words
doc = nlp(cleaned_text)  # Process the cleaned text with spaCy
words = [token.text for token in doc if token.is_alpha]  # Extract only alphabetic tokens (ignore numbers and symbols)

# Save tokenized words to a file
output_words_file = '/kaggle/working/words_spacy.txt'  # File path for saving tokenized words
with open(output_words_file, 'w') as f:
    f.write('\n'.join(words))  # Write each word to a new line

# Perform frequency analysis for the top 10 words
word_freq = Counter(words)  # Count occurrences of each word
top_10_words = word_freq.most_common(10)  # Extract the top 10 most frequent words

# Convert the top 10 words and their frequencies to a DataFrame
top_words_df = pd.DataFrame(top_10_words, columns=["Word", "Frequency"])

# Save the DataFrame as a CSV file for tabular output
output_top_words_file = '/kaggle/working/top10words_spaCy.csv'  # File path for saving the top 10 words
top_words_df.to_csv(output_top_words_file, index=False)  # Save DataFrame to a CSV file without the index

# Stop the timer to measure runtime
end_time = time.time()  # Record the end time
spacy_runtime = end_time - start_time  # Calculate the total runtime

# Save runtime to a file
output_runtime_file = '/kaggle/working/time_compares_spacy.txt'  # File path for saving the runtime
with open(output_runtime_file, 'w') as f:
    f.write(f"spaCy Runtime: {spacy_runtime:.4f} seconds\n")  # Write the runtime in seconds

# Print output details for verification
print("Step 1: Cleaned text saved to 'cleaned_spacy.txt'") 
print("Step 2: Tokenized words saved to 'words_spacy.txt'")  
print("Step 3: Tokenized sentences and words saved to 'words_sentences_spacy.txt'")  
print("Step 4: Top 10 words saved to 'top10words_spacy.txt'")  
print("Step 5: Runtime saved to 'time_compares_spacy.txt'")  
print(f"spaCy Runtime: {spacy_runtime:.4f} seconds")  # Print the total runtime
print("Top 10 most common words:")  # Print the header for the top 10 words
# Print the DataFrame as a table in the console
print(top_words_df)


Step 1: Cleaned text saved to 'cleaned_spacy.txt'
Step 2: Tokenized words saved to 'words_spacy.txt'
Step 3: Tokenized sentences and words saved to 'words_sentences_spacy.txt'
Step 4: Top 10 words saved to 'top10words_spacy.txt'
Step 5: Runtime saved to 'time_compares_spacy.txt'
spaCy Runtime: 8.6274 seconds
Top 10 most common words:
   Word  Frequency
0   the       1630
1   and        844
2    to        721
3     a        627
4   she        543
5    it        534
6    of        507
7     i        503
8  said        462
9   you        407


# Compare Runtime

In [9]:
import pandas as pd  # Import pandas to create a comparison table

# Example runtime data for NLTK, TextBlob, and spaCy (replace with actual values)
runtime_data = {
    "Framework": ["NLTK", "TextBlob", "spaCy"],
    "Runtime (seconds)": [0.1513, 0.4294, 8.5534]  # Replace these values with actual runtimes
}

# Convert the data to a pandas DataFrame
runtime_comparison_df = pd.DataFrame(runtime_data)

# Save the runtime comparison table as a CSV file
output_runtime_comparison_file = "/kaggle/working/runtime_comparison.csv"
runtime_comparison_df.to_csv(output_runtime_comparison_file, index=False)

# Display the DataFrame for user reference
runtime_comparison_df


Unnamed: 0,Framework,Runtime (seconds)
0,NLTK,0.1513
1,TextBlob,0.4294
2,spaCy,8.5534
