In [None]:
import re
import unicodedata
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
    * Lowercase everything
    * Normalize unicode characters
    * Replace anything that is not a letter, number, whitespace, or a single quote with an empty string (i.e., remove it).

In [None]:
def basic_clean(text):
    """
    Return a lowercased, ASCII-only, punctuation-stripped copy of ``text``.

    Steps, applied in this order:
    1. Lowercase the string.
    2. Decompose unicode characters (NFKD) and drop every code point that
       cannot be encoded as ASCII (for example, combining accent marks).
    3. Delete every character that is not a lowercase letter, a digit,
       a single quote, or whitespace.

    Parameters:
    - text: The string to clean.

    Returns:
    The cleaned string.
    """
    lowered = text.lower()

    # NFKD splits accented characters into a base letter plus combining
    # marks; the ASCII round-trip then discards whatever is not plain ASCII.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # Keep only lowercase letters, digits, single quotes and whitespace.
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [None]:
def tokenize(text):
    """
    Tokenize words in the input string.

    This function splits the input text into individual tokens using
    NLTK's ToktokTokenizer.

    Parameters:
    - text: The input text to be tokenized.

    Returns:
    A list of tokenized words.
    """
    tokenizer = ToktokTokenizer()
    # Return the tokens; previously they were computed and then discarded,
    # so the function always returned None.
    return tokenizer.tokenize(text)

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [None]:
def stem(text):
    """
    Return ``text`` with every token replaced by its Porter stem.

    The text is split into tokens with ``nltk.word_tokenize``, each token is
    stemmed with NLTK's ``PorterStemmer``, and the stems are joined back
    together with single spaces.

    Parameters:
    - text: The input text to apply stemming to.

    Returns:
    A single string of space-separated stems.
    """
    porter = PorterStemmer()
    stems = []
    for token in nltk.word_tokenize(text):
        stems.append(porter.stem(token))
    return ' '.join(stems)

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(text):
    """
    Return ``text`` with every token replaced by its WordNet lemma.

    The text is split into tokens with ``nltk.word_tokenize``, each token is
    passed through NLTK's ``WordNetLemmatizer`` (called with its default
    arguments), and the results are joined back together with single spaces.

    Parameters:
    - text: The input text to apply lemmatization to.

    Returns:
    A single string of space-separated lemmas.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    return ' '.join(wordnet.lemmatize(token) for token in tokens)

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    """
    Remove stopwords from the input text.

    The text is tokenized with ``nltk.word_tokenize``; every token whose
    lowercased form appears in the stopword set is dropped, and the
    remaining tokens are joined with single spaces. Matching is
    case-insensitive for both the text and the caller-supplied word lists.

    Parameters:
    - text (str): The input text to remove stopwords from.
    - extra_words (list): Additional words to treat as stopwords.
    - exclude_words (list): Words that must not be removed, even if they
      are in the default stopword list.

    Returns:
    str: Text with stopwords removed.
    """
    # Start from NLTK's English stopword list.
    stopword_list = set(stopwords.words('english'))

    # Tokens are lowercased before the membership test below, so the
    # caller-supplied words must be lowercased as well; otherwise an entry
    # such as 'The' could never match and would be silently ignored.
    if extra_words:
        stopword_list.update(word.lower() for word in extra_words)

    if exclude_words:
        stopword_list.difference_update(word.lower() for word in exclude_words)

    # Tokenize the input text into individual words.
    words = nltk.word_tokenize(text)

    # Keep only the tokens that are not stopwords (case-insensitive).
    filtered_words = [word for word in words if word.lower() not in stopword_list]

    # Rebuild a single string from the surviving tokens.
    return ' '.join(filtered_words)

6. Use your data from the acquire exercises to produce a dataframe of the news articles. Name the dataframe news_df.

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

8. For each dataframe, produce the following columns:
    * title to hold the title
    * original to hold the original article/post content
    * clean to hold the normalized and tokenized original with the stopwords removed.
    * stemmed to hold the stemmed version of the cleaned data.
    * lemmatized to hold the lemmatized version of the cleaned data.

9. Ask yourself:
    * If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?