# Exercises

In [1]:
from acquire import get_blog_articles, get_news_article
import unicodedata
import re
import json

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

[nltk_data] Downloading package wordnet to /Users/jay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string_cleaning):
    '''
    basic clean will take in a string and after making all characters lower case and making sure only ASCII characters 
    will return the cleaned string
    '''
    string_cleaning = string_cleaning.lower()
    string_cleaning = re.sub(r"[^a-z0-9'\s]", '', string_cleaning)
    return string_cleaning

In [3]:
basic_clean("If there were 40 people, and 38 apples, then 2 didn't. Alternatively, let's say there were 313 people at a dinner party, and 98 salads, so 215 didn't. One more, if there were 33 cats, and 28 mice, how many didn't?")

"if there were 40 people and 38 apples then 2 didn't alternatively let's say there were 313 people at a dinner party and 98 salads so 215 didn't one more if there were 33 cats and 28 mice how many didn't"

## Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string_cleaning):
    '''
    tokenize will take in a string and return the string tokenized
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(string_cleaning, return_str=True)

In [5]:
tokenize("if there were 40 people and 38 apples then 2 didn't alternatively let's say there were 313 people at a dinner party and 98 salads so 215 didn't one more if there were 33 cats and 28 mice how many didn't")

"if there were 40 people and 38 apples then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salads so 215 didn ' t one more if there were 33 cats and 28 mice how many didn ' t"

## Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(string_cleaning):
    '''
    stem will take in a string and return the string after stemming
    '''
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string_cleaning.split()]
    spring_cleaning_stemmed = ' '.join(stems)
    return spring_cleaning_stemmed

In [7]:
stem("if there were 40 people and 38 apples then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salads so 215 didn ' t one more if there were 33 cats and 28 mice how many didn ' t")

"if there were 40 peopl and 38 appl then 2 didn ' t altern let ' s say there were 313 peopl at a dinner parti and 98 salad so 215 didn ' t one more if there were 33 cat and 28 mice how mani didn ' t"

## Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
def lemmatize(string_cleaning):
    '''
    lemmatize will take in a string and return the string after lemmatizing it
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string_cleaning.split()]
    spring_cleaning_lemmatized = ' '.join(lemmas)
    return spring_cleaning_lemmatized

In [9]:
lemmatize("if there were 40 people and 38 apples then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salads so 215 didn ' t one more if there were 33 cats and 28 mice how many didn ' t")

"if there were 40 people and 38 apple then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salad so 215 didn ' t one more if there were 33 cat and 28 mouse how many didn ' t"

## Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [10]:
extra_words = ['dinner', 'salad']

In [11]:
stopword_list = stopwords.words('english')
stopword_list = stopword_list + [extra_words]
stopword_list[-1]

['dinner', 'salad']

In [12]:
def remove_stopwords(string_cleaning, extra_words='', exclude_words=''):
    '''
    remove_stopwords will take in a string, any extra words to be removed, and and words to keep, and return the string with stop words removed
    '''
    stopword_list = stopwords.words('english')
    stopword_list = stopword_list.remove(exclude_words)
    stopword_list = [stopword_list] + [extra_words]
    words = string_cleaning.split()
    filtered_words = [w for w in words if w not in stopword_list]
    string_without_stopwords = ' '.join(filtered_words)
    return string_without_stopwords

In [13]:
remove_stopwords("if there were 40 people and 38 apple then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salad so 215 didn ' t one more if there were 33 cat and 28 mouse how many didn ' t", ['dinner', 'salad'], 'if')

"if there were 40 people and 38 apple then 2 didn ' t alternatively let ' s say there were 313 people at a dinner party and 98 salad so 215 didn ' t one more if there were 33 cat and 28 mouse how many didn ' t"

## Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
news_df = get_news_article()
news_df

Unnamed: 0,title,body,category
0,Victoria's Secret ex-CEO cuts Harvard ties for...,Victoria's Secret ex-CEO Leslie Wexner's found...,business
1,HDFC Bank's Vigil Aunty ad gets criticism for ...,HDFC Bank's latest advertisement featuring Vig...,business
2,IMEC big opportunity for investors to partner ...,PM Narendra Modi at the Global Maritime India ...,business
3,"ICICI Bank fined ₹12 crore, Kotak ₹3.95 crore ...",The Reserve Bank of India (RBI) has imposed a ...,business
4,35 lakh weddings in 23 days to generate record...,Traders' body Confederation of All India Trade...,business
5,"Mahadev app key accused arrives from Dubai, he...","Mrugank Mishra, a key accused in Mahadev betti...",business
6,Bankman-Fried's trial delay request over Adder...,A US court denied FTX Founder Sam Bankman-Frie...,business
7,Indian wheat prices hit 8-month high,Prices of wheat in India have reached an eight...,business
8,Dabur gets ₹321 crore GST demand notice,Dabur India has received a notice to pay Goods...,business
9,'Kill list' of LinkedIn staff being fired was ...,A list with names of about 500 employees was l...,business


## Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [15]:
codeup_df = get_blog_articles()
codeup_df

Unnamed: 0,title,article
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


## For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

### News

In [16]:
# let's store the dataframe here for easier access
news_df = get_news_article()
news_df

Unnamed: 0,title,body,category
0,Victoria's Secret ex-CEO cuts Harvard ties for...,Victoria's Secret ex-CEO Leslie Wexner's found...,business
1,HDFC Bank's Vigil Aunty ad gets criticism for ...,HDFC Bank's latest advertisement featuring Vig...,business
2,IMEC big opportunity for investors to partner ...,PM Narendra Modi at the Global Maritime India ...,business
3,"ICICI Bank fined ₹12 crore, Kotak ₹3.95 crore ...",The Reserve Bank of India (RBI) has imposed a ...,business
4,35 lakh weddings in 23 days to generate record...,Traders' body Confederation of All India Trade...,business
5,"Mahadev app key accused arrives from Dubai, he...","Mrugank Mishra, a key accused in Mahadev betti...",business
6,Bankman-Fried's trial delay request over Adder...,A US court denied FTX Founder Sam Bankman-Frie...,business
7,Indian wheat prices hit 8-month high,Prices of wheat in India have reached an eight...,business
8,Dabur gets ₹321 crore GST demand notice,Dabur India has received a notice to pay Goods...,business
9,'Kill list' of LinkedIn staff being fired was ...,A list with names of about 500 employees was l...,business


In [17]:
news_df = news_df.rename(columns = {'body': 'original'})

In [18]:
basic_clean(news_df.original[0])

"victoria's secret exceo leslie wexner's foundation announced it's cutting its financial and programmatic ties with harvard we are stunned and sickened at the dismal failure of harvard's leadership to take a clearstand against the barbaric murders of innocent israeli civilians the wexner foundation said harvard's leaders were tiptoeing over hamas' attacks against israel it added"

In [19]:
# test of code for cleaning column
remove_stopwords(tokenize(basic_clean(news_df.original[0])),extra_words = None, exclude_words='not')

"victoria ' s secret exceo leslie wexner ' s foundation announced it ' s cutting its financial and programmatic ties with harvard we are stunned and sickened at the dismal failure of harvard ' s leadership to take a clearstand against the barbaric murders of innocent israeli civilians the wexner foundation said harvard ' s leaders were tiptoeing over hamas ' attacks against israel it added"

In [20]:
# create new column
i = 0
for line in news_df.original:
    news_df['clean'] = remove_stopwords(tokenize(basic_clean(news_df.original[i])),extra_words = None, exclude_words='not')
    i += 1
# for loop for clean column to iterate through articles
i = 0
for line in news_df.original:
    news_df['clean'][i] = remove_stopwords(tokenize(basic_clean(news_df.original[i])),extra_words = None, exclude_words='not')
    i += 1

In [22]:
#create the new column first
i = 0
for line in news_df.original:
    news_df['stemmed'] = remove_stopwords(stem(tokenize(basic_clean(news_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1
# fill the new column with stemmed info per index entry    
i = 0
for line in news_df.original:
    news_df['stemmed'][i] = remove_stopwords(stem(tokenize(basic_clean(news_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1

In [24]:
#create the new column first
i = 0
for line in news_df.original:
    news_df['lemmatized'] = remove_stopwords(lemmatize(tokenize(basic_clean(news_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1
# fill the new column with stemmed info per index entry 
i = 0
for line in news_df.original:
    news_df['lemmatized'][i] = remove_stopwords(lemmatize(tokenize(basic_clean(news_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1

#### Done.

In [None]:
news_df

### Codeup

In [26]:
codeup_df = get_blog_articles()
codeup_df

Unnamed: 0,title,article
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


In [27]:
codeup_df = codeup_df.rename(columns = {'article': 'original'})

In [28]:
# create new column for 'cleaned' data
i = 0
for line in codeup_df.original:
    codeup_df['clean'] = remove_stopwords(tokenize(basic_clean(codeup_df.original[i])),extra_words = None, exclude_words='not')
    i += 1
# for loop for clean column to iterate through articles
i = 0
for line in codeup_df.original:
    codeup_df['clean'][i] = remove_stopwords(tokenize(basic_clean(codeup_df.original[i])),extra_words = None, exclude_words='not')
    i += 1

In [29]:
#create the new column for stemmed data
i = 0
for line in codeup_df.original:
    codeup_df['stemmed'] = remove_stopwords(stem(tokenize(basic_clean(codeup_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1
# fill the new column with stemmed info per index entry    
i = 0
for line in codeup_df.original:
    codeup_df['stemmed'][i] = remove_stopwords(stem(tokenize(basic_clean(codeup_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1

In [30]:
#create the new column for lemmatized data
i = 0
for line in codeup_df.original:
    codeup_df['lemmatized'] = remove_stopwords(lemmatize(tokenize(basic_clean(codeup_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1
# fill the new column with stemmed info per index entry 
i = 0
for line in codeup_df.original:
    codeup_df['lemmatized'][i] = remove_stopwords(lemmatize(tokenize(basic_clean(codeup_df.original[i]))),extra_words = None, exclude_words='not')
    i += 1

#### Done.

In [31]:
# testing if for loops worked
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may is traditionally known as asian american a...,may is tradit known as asian american and paci...,may is traditionally known a asian american an...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena rah...,women in tech panelist spotlight magdalena rah...,woman in tech panelist spotlight magdalena rah...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbin...,women in tech panelist spotlight rachel robbin...,woman in tech panelist spotlight rachel robbin...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women in tech panelist spotlight sarah mellor ...,women in tech panelist spotlight sarah mellor ...,woman in tech panelist spotlight sarah mellor ...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women in tech panelist spotlight madeleine cap...,women in tech panelist spotlight madelein capp...,woman in tech panelist spotlight madeleine cap...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence in tech panelist spotlight wi...,black excel in tech panelist spotlight wilmari...,black excellence in tech panelist spotlight wi...


## Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
  - Definitely lemmatized text here, with such a small file, the run time shouldn't be significant enough to even notice.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
  - I still believe this is within a size margin to lemmatize, once we start getting into GB further consideration may be needed.
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
  - this is tough, although I personally prefer to lemmatize the data, stemmed would be significantly faster from such a large corpus. There would likely have to be more emphasis before attempting to extract the data so no return pulls are needed. 

# Personal Bonus

#### Testing functions to ensure reciprocity

In [8]:
from acquire import get_blog_articles, get_news_article
from prepare import basic_clean, create_clean, create_lemmatize, create_stemmed, remove_stopwords, tokenize, stem, lemmatize
import pandas as pd

## News

In [17]:
news_df = get_news_article()
news_df = news_df.rename(columns = {'body': 'original'})
news_df['clean'] = create_clean(news_df, extra_words = None, exclude_words = 'not')['clean']
news_df['stemmed'] = create_stemmed(news_df, extra_words = None, exclude_words = 'not')['stemmed']
news_df['lemmatized'] = create_lemmatize(news_df, extra_words = None, exclude_words = 'not')['lemmatized']
news_df

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,Victoria's Secret ex-CEO cuts Harvard ties for...,Victoria's Secret ex-CEO Leslie Wexner's found...,business,victoria ' s secret exceo leslie wexner ' s fo...,victoria ' s secret exceo lesli wexner ' s fou...,victoria ' s secret exceo leslie wexner ' s fo...
1,HDFC Bank's Vigil Aunty ad gets criticism for ...,HDFC Bank's latest advertisement featuring Vig...,business,hdfc bank ' s latest advertisement featuring v...,hdfc bank ' s latest advertis featur vigil aun...,hdfc bank ' s latest advertisement featuring v...
2,IMEC big opportunity for investors to partner ...,PM Narendra Modi at the Global Maritime India ...,business,pm narendra modi at the global maritime india ...,pm narendra modi at the global maritim india s...,pm narendra modi at the global maritime india ...
3,"ICICI Bank fined ₹12 crore, Kotak ₹3.95 crore ...",The Reserve Bank of India (RBI) has imposed a ...,business,the reserve bank of india rbi has imposed a pe...,the reserv bank of india rbi ha impos a penalt...,the reserve bank of india rbi ha imposed a pen...
4,35 lakh weddings in 23 days to generate record...,Traders' body Confederation of All India Trade...,business,traders ' body confederation of all india trad...,trader ' bodi confeder of all india trader cai...,trader ' body confederation of all india trade...
5,"Mahadev app key accused arrives from Dubai, he...","Mrugank Mishra, a key accused in Mahadev betti...",business,mrugank mishra a key accused in mahadev bettin...,mrugank mishra a key accus in mahadev bet app ...,mrugank mishra a key accused in mahadev bettin...
6,Bankman-Fried's trial delay request over Adder...,A US court denied FTX Founder Sam Bankman-Frie...,business,a us court denied ftx founder sam bankmanfried...,a us court deni ftx founder sam bankmanfri ' s...,a u court denied ftx founder sam bankmanfried ...
7,Indian wheat prices hit 8-month high,Prices of wheat in India have reached an eight...,business,prices of wheat in india have reached an eight...,price of wheat in india have reach an eightmon...,price of wheat in india have reached an eightm...
8,Dabur gets ₹321 crore GST demand notice,Dabur India has received a notice to pay Goods...,business,dabur india has received a notice to pay goods...,dabur india ha receiv a notic to pay good and ...,dabur india ha received a notice to pay good a...
9,'Kill list' of LinkedIn staff being fired was ...,A list with names of about 500 employees was l...,business,a list with names of about 500 employees was l...,a list with name of about 500 employe wa leak ...,a list with name of about 500 employee wa leak...


## Codeup

In [20]:
codeup_df = get_blog_articles()
codeup_df = codeup_df.rename(columns = {'article': 'original'})
codeup_df['clean'] = create_clean(codeup_df, extra_words = None, exclude_words = 'not')['clean']
codeup_df['stemmed'] = create_stemmed(codeup_df, extra_words = None, exclude_words = 'not')['stemmed']
codeup_df['lemmatized'] = create_lemmatize(codeup_df, extra_words = None, exclude_words = 'not')['lemmatized']
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may is traditionally known as asian american a...,may is tradit known as asian american and paci...,may is traditionally known a asian american an...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena rah...,women in tech panelist spotlight magdalena rah...,woman in tech panelist spotlight magdalena rah...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbin...,women in tech panelist spotlight rachel robbin...,woman in tech panelist spotlight rachel robbin...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women in tech panelist spotlight sarah mellor ...,women in tech panelist spotlight sarah mellor ...,woman in tech panelist spotlight sarah mellor ...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women in tech panelist spotlight madeleine cap...,women in tech panelist spotlight madelein capp...,woman in tech panelist spotlight madeleine cap...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence in tech panelist spotlight wi...,black excel in tech panelist spotlight wilmari...,black excellence in tech panelist spotlight wi...
