In [8]:
import pandas as pd
import time
import re
import os
from collections import Counter

In [9]:
# Clean all text
def clean(text):
    """Normalise raw text so it can be split into comparable word tokens.

    Steps, in order:
      1. lowercase everything;
      2. delete every character that is neither a letter/digit nor
         whitespace (the underscore is deleted as well);
      3. collapse each run of whitespace (spaces, tabs, newlines) into a
         single space and trim both ends.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string; empty when ``text`` holds nothing but
        punctuation and/or whitespace.
    """
    text = text.lower()
    # `\w` also matches "_", so `[^\w\s]` alone leaves underscores in place
    # (e.g. "_very_" would stay distinct from "very"); drop them explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # `\s+` already covers newlines and tabs, so no separate replace is needed
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Nltk

In [10]:
import nltk
# Download the NLTK resources used below; this must run before the
# stop-word corpus is read on the last line of this cell.
# NOTE(review): 'punkt_tab' is presumably the data behind nltk.sent_tokenize /
# nltk.word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stop words held as a set; the framework functions in this notebook
# filter their word lists by membership in it.
stop_words = set(nltk.corpus.stopwords.words('english'))

def Nltk(text):
    """Process ``text`` with NLTK's sentence and word tokenizers.

    Args:
        text: the raw input string.

    Returns:
        A 4-tuple ``(cleaned_text, cleaned_sentences, filtered_words,
        top_words)``:
          * cleaned_text      - ``clean(text)``;
          * cleaned_sentences - each NLTK sentence passed through ``clean``;
          * filtered_words    - word tokens of ``cleaned_text`` that are not
                                stop words and not blank;
          * top_words         - the 10 most common ``(word, count)`` pairs.
    """
    # Sentences are split on the raw text, then each one is cleaned
    cleaned_sentences = list(map(clean, nltk.sent_tokenize(text)))

    # Words are tokenized from the cleaned version of the whole text
    cleaned_text = clean(text)

    filtered_words = []
    for token in nltk.word_tokenize(cleaned_text):
        # skip stop words and blank tokens
        if token in stop_words or token.strip() == '':
            continue
        filtered_words.append(token)

    top_words = Counter(filtered_words).most_common(10)
    return cleaned_text, cleaned_sentences, filtered_words, top_words

[nltk_data] Downloading package punkt_tab to /home/mix/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# TextBlob

In [19]:
from textblob import TextBlob

def Textblob(text):
    """Process ``text`` with TextBlob's sentence and word tokenizers.

    Args:
        text: the raw input string.

    Returns:
        A 4-tuple ``(cleaned_text, cleaned_sentences, filtered_words,
        top_words)``:
          * cleaned_text      - ``clean(text)``;
          * cleaned_sentences - each TextBlob sentence, converted to ``str``
                                and passed through ``clean``;
          * filtered_words    - words of the cleaned blob that are not stop
                                words and not blank;
          * top_words         - the 10 most common ``(word, count)`` pairs.
    """
    # Build the sentence blob from the raw text so the sentence-ending
    # punctuation is still there when the text is split into sentences.
    raw_blob = TextBlob(text)
    cleaned_sentences = [clean(str(sentence)) for sentence in raw_blob.sentences]

    # A second blob, built from the cleaned text, supplies the word tokens
    cleaned_text = clean(text)
    word_blob = TextBlob(cleaned_text)

    # Keep only tokens that are neither stop words nor blank
    filtered_words = [
        word
        for word in word_blob.words
        if not (word in stop_words or word.strip() == '')
    ]

    top_words = Counter(filtered_words).most_common(10)
    return cleaned_text, cleaned_sentences, filtered_words, top_words

# Spacy

In [20]:
import spacy
# Load the `en_core_web_sm` pipeline once at module level; Spacy() calls this
# shared object for both the raw and the cleaned text.
spacy_nlp = spacy.load('en_core_web_sm')

def Spacy(text):
    """Process ``text`` with the module-level spaCy pipeline ``spacy_nlp``.

    Args:
        text: the raw input string.

    Returns:
        A 4-tuple ``(cleaned_text, cleaned_sentences, filtered_words,
        top_words)``:
          * cleaned_text      - ``clean(text)``;
          * cleaned_sentences - each spaCy sentence, stripped and passed
                                through ``clean``;
          * filtered_words    - stripped token texts of the cleaned document
                                whose lowercase form is not a stop word;
          * top_words         - the 10 most common ``(word, count)`` pairs.
    """
    # First pass over the raw text: punctuation is still present here, which
    # is what the sentence split relies on.
    raw_doc = spacy_nlp(text)
    cleaned_sentences = [clean(span.text.strip()) for span in raw_doc.sents]

    # Second pass over the cleaned text supplies the word tokens
    cleaned_text = clean(text)
    cleaned_doc = spacy_nlp(cleaned_text)

    filtered_words = []
    for token in cleaned_doc:
        if token.text.lower() in stop_words:
            continue
        filtered_words.append(token.text.strip())

    top_words = Counter(filtered_words).most_common(10)
    return cleaned_text, cleaned_sentences, filtered_words, top_words

In [21]:
def save_reports(framework_name, clean_text, sentences, words, top_words, elapsed,
                 output_dir="output"):
    """Write the report files for one framework run.

    Files written inside ``output_dir`` (created if missing):
      * ``cleaned_<framework_name>.txt``   - the cleaned text;
      * ``words_<framework_name>.txt``     - the sentences, then the words,
        one per line, each list under a ``----Title (count)----`` header;
      * ``top10words_<framework_name>.txt``- ``top_words`` as a Word/Count
        table;
      * ``time_compares.txt``              - CSV with one
        ``Framework,Time(s)`` row per call; a row is appended when the file
        already exists.

    Args:
        framework_name: label used in the file names and the timing row.
        clean_text: cleaned text to store verbatim.
        sentences: iterable of sentences (each is written via ``str``).
        words: sequence of word strings.
        top_words: sequence of ``(word, count)`` pairs.
        elapsed: run time in seconds; stored rounded to 6 decimals.
        output_dir: target directory; defaults to ``"output"``.
    """
    # exist_ok avoids the separate exists() check
    os.makedirs(output_dir, exist_ok=True)

    # Save cleaned text
    cleaned_path = os.path.join(output_dir, f"cleaned_{framework_name}.txt")
    with open(cleaned_path, "w", encoding="utf-8") as f:
        f.write(clean_text)

    # Save tokenized sentences and words together with their counts
    words_path = os.path.join(output_dir, f"words_{framework_name}.txt")
    with open(words_path, "w", encoding="utf-8") as f:
        f.write(f"----Tokenized Sentences ({len(sentences)})----\n")
        f.write("\n".join(f"{s}" for s in sentences))
        # Blank line first, then a header framed like the sentences header.
        # Previously "----" was glued onto the last sentence and the words
        # header had no leading dashes.
        f.write(f"\n\n----Tokenized Words ({len(words)})----\n")
        f.write("\n".join(words))

    # Convert the (word, count) tuples to a DataFrame and save it as text
    df_top = pd.DataFrame(top_words, columns=["Word", "Count"])
    top_path = os.path.join(output_dir, f"top10words_{framework_name}.txt")
    with open(top_path, "w", encoding="utf-8") as f:
        f.write(df_top.to_string(index=False))

    time_file = os.path.join(output_dir, "time_compares.txt")

    # One-row frame for the current run
    df_new = pd.DataFrame(
        [{"Framework": framework_name, "Time(s)": round(elapsed, 6)}]
    )

    # Append to the rows of earlier runs if the timing file is already there
    if os.path.exists(time_file):
        df_existing = pd.read_csv(time_file)
        df_final = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_final = df_new

    # Save without the row index
    df_final.to_csv(time_file, index=False)

In [23]:
# (callable, report name) pairs; the loop below runs them in this order
frameworks = [
    (Textblob, "TextBlob"), (Nltk, "Nltk"), (Spacy, "Spacy")
]

# Delete the timing file from a previous run: save_reports appends to it when
# it exists, so old rows would otherwise be mixed with the new results.
time_file_path = "output/time_compares.txt"
if os.path.exists(time_file_path):
    os.remove(time_file_path)

input_file = "alice29.txt"
with open(input_file, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Run every framework on the same raw text and time each run
for framework_func, name in frameworks:
    # perf_counter() is the clock intended for measuring short intervals:
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()

    # run the framework
    cleaned_text, sentences, final_words_str, top = framework_func(raw_text)

    end_time = time.perf_counter()
    # elapsed run time in seconds
    elapsed = end_time - start_time

    # write the report files for this framework
    save_reports(name, cleaned_text, sentences, final_words_str, top, elapsed)