In [26]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as ac

# Exercises

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(article):
    '''
    This function takes in a article in string format.
    
    Turns all letters into lowercase.
    
    Normalizes the unicode characters using the NFKD method,
    while ignoring any unknow characters.
    
    Will replace anything that is NOT letters, numbers, whitespace or single quote.
    
    This funtion will return a basic cleaned article in string format
    '''
    
    # Lowercase 
    article = article.lower()
    
    # Normalization
    article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    # Replace
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    
    return article

In [3]:
article = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to\
the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often\
incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [4]:
article = basic_clean(article)
article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(article):
    '''
    This function takes in an article as a string.
    Creates a tokenizer using nltk.
    Uses the tokenize on the artical and returns the article in string fromat.
    '''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    tokenizer.tokenize(article, return_str = True)
    
    return article

In [6]:
article = tokenize(article)
article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(article):
    '''
    This function takes in an article as a string.
    Creates a porter stemmer.
    Applies the stemmer to each word in the article/string.
    Joins the stems into a single string called article_stemmed.
    Returns article_stemmed with all stemed characters. 
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Apply the stemmer to each word in our string.
    stems = [ps.stem(word) for word in article.split()]
    
    # Join stems
    article_stemmed = ' '.join(stems)
    
    return article_stemmed

In [8]:
article_stemmed = stem(article)
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot toth field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is oftenincorrectli written as erdo or erdo either by mistak or out of typograph necess"

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(article):
    '''
    This function takes in an article as a string.
    Downloads 'wordnet' from nltk.
    Creates a lemmatizer.
    Applies the lemmatizer to each word in the article/string.
    Joins the lemmmas into a single string called article_lemmatized.
    Returns artical_lemmatized with all lemmatized characters. 
    '''
    # Download the first time.
    nltk.download('wordnet')
    
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Apply the lemmatize to each word in our string.
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    
    # Join lemmas
    article_lemmatized = ' '.join(lemmas)
    
    return article_lemmatized

In [10]:
article_lemmatized = lemmatize(article)
article_lemmatized

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


"paul erdos and george polya are influential hungarian mathematician who contributed a lot tothe field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is oftenincorrectly written a erdos or erdos either by mistake or out of typographical necessity"

## 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [11]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [12]:
extra_words = 'my'
exclude_words = 'are'

In [13]:
remove_stopwords(article)


"paul erdos george polya influential hungarian mathematicians contributed lot tothe field erdos's name contains hungarian letter 'o' 'o' double acute accent oftenincorrectly written erdos erdos either mistake typographical necessity"

## 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
from requests import get
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [15]:
def get_article(article, category):
    """
    This function takes in a category and artical as a string. 
    Category must be an available category in inshorts, article is the link.
    Returns a single inshort article.
    """
    # Attribute selector
    title = article.select("[itemprop='headline']")[0].text
    
    # article body
    content = article.select("[itemprop='articleBody']")[0].text
    
    output = {}
    output["title"] = title
    output["content"] = content
    output["category"] = category
    
    return output

In [59]:
#Test my function on the business page
business_test = ac.scrape_one_page('business')
topics = ['business', 'sports', 'technology', 'entertainment']

news_df = ac.get_news_articles(topics)

news_df = pd.DataFrame(ac.get_news_articles(topics))

In [61]:
news_df

Unnamed: 0,category,title,content
0,business,Ratan Tata must check whether his donations re...,"Wrestler Vinesh Phogat on Wednesday said, ""I r..."
1,business,I was laid off from LinkedIn before even start...,"Lea Schuhmacher, a woman who signed the contra..."
2,business,"India's high streets for shopping ranked, Beng...",Bengaluru's MG Road ranked first in the list o...
3,business,PepsiCo bottler Varun Beverages enters ₹1 lakh...,"Varun Beverages, PepsiCo's India franchise bot..."
4,business,All eyes on India as it becomes a 'plus 1' to ...,Vedanta Resources Chairman Anil Agarwal said a...
...,...,...,...
95,entertainment,Papa used to tell friends that I'll be a polic...,Actress Sonakshi Sinha revealed that when she ...
96,entertainment,Your so called whatevers not getting opening: ...,Actress Richa Chadha took to Instagram and urg...
97,entertainment,'Kantara' actor Rishab Shetty casts vote in Ka...,Actor-filmmaker Rishab Shetty on Wednesday arr...
98,entertainment,"Didn’t tell, thought let work go as it is: Bin...","Veteran actress Bindu, who appeared in several..."


## 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [36]:
codeup_df = pd.DataFrame(ac.get_blog_articles('blog_posts.json'))

In [37]:
codeup_df

Unnamed: 0,title,link,date_published,content
0,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/events/black-excellence-in-...,"Mar 28, 2023",\nWomen in tech: Panelist Spotlight – Magdalen...
1,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/events/black-excellence-in-...,"Mar 20, 2023",\nWomen in tech: Panelist Spotlight – Rachel R...
2,Women in Tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/events/black-excellence-in-...,"Mar 13, 2023",\nWomen in tech: Panelist Spotlight – Sarah Me...
3,Women in Tech: Panelist Spotlight – Madeleine ...,https://codeup.com/events/black-excellence-in-...,"Mar 6, 2023",\nWomen in tech: Panelist Spotlight – Madelein...
4,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/events/black-excellence-in-...,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...
5,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/events/black-excellence-in-...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...


## 8. For each dataframe, produce the following columns:

* title to hold the title
* original to hold the original article/post content
* clean to hold the normalized and tokenized original with the stopwords removed.
* stemmed to hold the stemmed version of the cleaned data.
* lemmatized to hold the lemmatized version of the cleaned data.


In [50]:
codeup_df['content'].apply(basic_clean).apply(tokenize).apply(lemmatize).apply(remove_stopwords)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    woman tech panelist spotlight magdalena rahn c...
1    woman tech panelist spotlight rachel robbinsma...
2    woman tech panelist spotlight sarah mellor cod...
3    woman tech panelist spotlight madeleine capper...
4    black excellence tech panelist spotlight wilma...
5    black excellence tech panelist spotlight steph...
Name: content, dtype: object

In [51]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    #original text from content column
    df['original'] = df['content']
    
    #chain together clean, tokenize, remove stopwords
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    #chain clean, tokenize, stem, remove stopwords
    df['stemmed'] = df['clean'].apply(stem)
    
    #clean clean, tokenize, lemmatize, remove stopwords
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    return df[['title', 'original', 'clean', 'stemmed', 'lemmatized']]

In [64]:
prep_codeup = prep_article_data(codeup_df, 'content', extra_words =[], exclude_words=[])

#take a look
prep_codeup.head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Women in tech: Panelist Spotlight – Magdalena ...,\nWomen in tech: Panelist Spotlight – Magdalen...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
1,Women in tech: Panelist Spotlight – Rachel Rob...,\nWomen in tech: Panelist Spotlight – Rachel R...,women tech panelist spotlight rachel robbinsma...,women tech panelist spotlight rachel robbinsma...,woman tech panelist spotlight rachel robbinsma...
2,Women in Tech: Panelist Spotlight – Sarah Mellor,\nWomen in tech: Panelist Spotlight – Sarah Me...,women tech panelist spotlight sarah mellor cod...,women tech panelist spotlight sarah mellor cod...,woman tech panelist spotlight sarah mellor cod...
3,Women in Tech: Panelist Spotlight – Madeleine ...,\nWomen in tech: Panelist Spotlight – Madelein...,women tech panelist spotlight madeleine capper...,women tech panelist spotlight madelein capper ...,woman tech panelist spotlight madeleine capper...
4,Black Excellence in Tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,black excellence tech panelist spotlight wilma...,black excel tech panelist spotlight wilmari de...,black excellence tech panelist spotlight wilma...


In [66]:
prep_codeup.iloc[1]


title         Women in tech: Panelist Spotlight – Rachel Rob...
original      \nWomen in tech: Panelist Spotlight – Rachel R...
clean         women tech panelist spotlight rachel robbinsma...
stemmed       women tech panelist spotlight rachel robbinsma...
lemmatized    woman tech panelist spotlight rachel robbinsma...
Name: 1, dtype: object

In [62]:
prep_news = prep_article_data(news_df, 'content', extra_words =[], exclude_words=[])

#take a look
prep_news.head(5)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/garrettarnett/nltk_data...
[nltk_data]   Package wordnet i

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Ratan Tata must check whether his donations re...,"Wrestler Vinesh Phogat on Wednesday said, ""I r...",wrestler vinesh phogat wednesday said request ...,wrestler vinesh phogat wednesday said request ...,wrestler vinesh phogat wednesday said request ...
1,I was laid off from LinkedIn before even start...,"Lea Schuhmacher, a woman who signed the contra...",lea schuhmacher woman signed contract fulltime...,lea schuhmach woman sign contract fulltim job ...,lea schuhmacher woman signed contract fulltime...
2,"India's high streets for shopping ranked, Beng...",Bengaluru's MG Road ranked first in the list o...,bengaluru's mg road ranked first list india's ...,bengaluru' mg road rank first list india' top ...,bengaluru's mg road ranked first list india's ...
3,PepsiCo bottler Varun Beverages enters ₹1 lakh...,"Varun Beverages, PepsiCo's India franchise bot...",varun beverages pepsico's india franchise bott...,varun beverag pepsico' india franchis bottler ...,varun beverage pepsico's india franchise bottl...
4,All eyes on India as it becomes a 'plus 1' to ...,Vedanta Resources Chairman Anil Agarwal said a...,vedanta resources chairman anil agarwal said e...,vedanta resourc chairman anil agarw said eye i...,vedanta resource chairman anil agarwal said ey...


In [63]:
prep_news.iloc[1]


title         I was laid off from LinkedIn before even start...
original      Lea Schuhmacher, a woman who signed the contra...
clean         lea schuhmacher woman signed contract fulltime...
stemmed       lea schuhmach woman sign contract fulltim job ...
lemmatized    lea schuhmacher woman signed contract fulltime...
Name: 1, dtype: object

In [67]:
prep_codeup.iloc[4]

title         Black Excellence in Tech: Panelist Spotlight –...
original      \nBlack excellence in tech: Panelist Spotlight...
clean         black excellence tech panelist spotlight wilma...
stemmed       black excel tech panelist spotlight wilmari de...
lemmatized    black excellence tech panelist spotlight wilma...
Name: 4, dtype: object

## 9. Ask yourself:

* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * lemmatize is slower, so smaller is ok to take longer.
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * stem because dataset is larger and stemming is faster
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    * stemming because it is faster