In [5]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire as a
import prepare as p

In [49]:
example = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything

Normalize unicode characters

Replace anything that is not a letter, number, whitespace or a single quote.

In [50]:
# lowercase everything
example = example.lower()

In [51]:
# Normalize unicode characters
example = unicodedata.normalize('NFKD', example)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')

example


"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [52]:
# remove anything that is not a through z, a number, a single quote, or whitespace
example = re.sub(r"[^a-z0-9'\s]", '', example)
print(example)


paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity


In [53]:
def basic_clean(string):
    # lowercase everything
    string = string.lower()
    
    # Normalize unicode characters
    string = unicodedata.normalize('NFKD', string)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    
    # remove anything that is not a through z, a number, a single quote, or whitespace
    return re.sub(r"[^a-z0-9'\s]", '', string)


In [54]:
# check import function
example = p.basic_clean(example)
example

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [48]:
example

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [29]:
def tokenize(string):
    tokenizer = nltk.tokenize.ToktokTokenizer()

    return tokenizer.tokenize(string, return_str=True)


In [55]:
tokenize(example)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [56]:
def stem(string):
    #create porter stemmer
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    stemmed = ' '.join(stems)
    return stemmed

In [57]:
stem(example)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

In [58]:
#create porter stemmer
ps = nltk.porter.PorterStemmer()

In [59]:
#test stemmer
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [60]:
ps.stem('mouse'), ps.stem('mice')

('mous', 'mice')

In [34]:
#use stemmer 
ps.stem(example)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [37]:
#use stemmer - apply stem to each word in our string
stems = [ps.stem(word) for word in example.split()]
stems[:10]

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who']

In [38]:
#join words back together
article_stemmed = ' '.join(stems)
article_stemmed

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [63]:
def lemmatize(string):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    lemma = ' '.join(lemmas)
    return lemma

In [112]:
example = lemmatize(example)

In [41]:
# download the first time
# nltk.download('all')

In [42]:
#create the lemmatizer
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [43]:
#test lemmatizer
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [44]:
#use lemmatize - apply stem to each word in our string
# wnl.lemmatize(article)
lemmas = [wnl.lemmatize(word) for word in example.split()]
lemmas[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [45]:
#join words back together
article_lemma = ' '.join(lemmas)
article_lemma

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

4. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [65]:
#import stopwords list
from nltk.corpus import stopwords

In [66]:
#only need to do once
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kcamarillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [152]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text, extra_words=None, exclude_words=None):
    # Get the default English stopwords list from NLTK
    stopwords_list = set(stopwords.words('english'))

    # Add extra words to the stopwords list
    if extra_words:
        stopwords_list.update(extra_words)

    # Remove exclude words from the stopwords list
    if exclude_words:
        stopwords_list.difference_update(set(exclude_words))

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords from the list of words
    filtered_words = [word for word in words if word.lower() not in stopwords_list]

    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)

    return filtered_text


In [164]:
remove_stopwords(example, extra_words=["'", "contributed"])

"paul erdos george polya influential hungarian mathematician lot field erdos 's name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [67]:
#save stopwords
stopwords_ls = stopwords.words('english')
stopwords_ls[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [68]:
stopwords_ls.sort()

In [69]:
stopwords_ls[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [70]:
#set a list to remove some stopwords
extra = ['all','about','after']
extra

['all', 'about', 'after']

In [72]:
set(stopwords_ls) - set(extra)

{'a',
 'above',
 'again',
 'against',
 'ain',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',


In [74]:
#split words in lemmatized article
words = article_lemma.split()
words[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [75]:
#remove stopwords from list of words
[word for word in words if word not in stopwords_ls]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot',
 'field',
 "erdos's",
 'name',
 'contains',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']

In [76]:
stopwords_ls.append("'")

In [166]:
filtered = [word for word in words if word not in stopwords_ls]
filtered[:10]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot',
 'field']

In [78]:
#show how many words we removed
len(words) - len(filtered)

16

In [79]:
#join words back together
parsed_article =' '.join(filtered) 
parsed_article

"paul erdos george polya influential hungarian mathematician contributed lot field erdos's name contains hungarian letter 'o' 'o' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [6]:
news_df = a.get_news_articles()


In [7]:
# turn into df
news_df = pd.DataFrame(news_df)

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.`

In [8]:
codeup_df = pd.DataFrame(a.get_blog_articles())

In [9]:
# rename columns
codeup_df = codeup_df.rename(columns={"content": "original"})

In [11]:
import prepare as p
p.basic_clean([news_df["content"]])

AttributeError: 'list' object has no attribute 'lower'

8. For each dataframe, produce the following columns:

title to hold the title
original to hold the original article/post content
clean to hold the normalized and tokenized original with the stopwords removed.
stemmed to hold the stemmed version of the cleaned data.
lemmatized to hold the lemmatized version of the cleaned data.

9. Ask yourself:

If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?