In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire



# 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it.
- ### Lowercase everything
- ### Normalize unicode characters
- ### Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
test_string = 'In the broadest dêfinition, a time series is any data set where the values are measured at different points in time. Many time series are uniformly spaced $$$ at a specific frequency, for example, hourly weather measurements, daily counts of web site visits, or monthly sales totals. Time series can also be irregularly spaced and sporadic, for example, timestamped data in a computer system’s event log or a history of 911 emergency calls. Pandas time series tools apply equally well to either type of time series.'

Convert all to lowercase:

In [3]:
string = test_string.lower()
string

'in the broadest dêfinition, a time series is any data set where the values are measured at different points in time. many time series are uniformly spaced $$$ at a specific frequency, for example, hourly weather measurements, daily counts of web site visits, or monthly sales totals. time series can also be irregularly spaced and sporadic, for example, timestamped data in a computer system’s event log or a history of 911 emergency calls. pandas time series tools apply equally well to either type of time series.'

Remove Non-Ascii Characters:

In [4]:
string = unicodedata.normalize('NFKD', test_string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
string

'In the broadest definition, a time series is any data set where the values are measured at different points in time. Many time series are uniformly spaced $$$ at a specific frequency, for example, hourly weather measurements, daily counts of web site visits, or monthly sales totals. Time series can also be irregularly spaced and sporadic, for example, timestamped data in a computer systems event log or a history of 911 emergency calls. Pandas time series tools apply equally well to either type of time series.'

Remove special characters:

In [5]:
string = re.sub(r"[^a-z0-9'\s]", '', test_string)
string

'n the broadest dfinition a time series is any data set where the values are measured at different points in time any time series are uniformly spaced  at a specific frequency for example hourly weather measurements daily counts of web site visits or monthly sales totals ime series can also be irregularly spaced and sporadic for example timestamped data in a computer systems event log or a history of 911 emergency calls andas time series tools apply equally well to either type of time series'

##### Test function:

In [6]:
from prepare import basic_clean

In [7]:
string = basic_clean(test_string)
string

'in the broadest definition a time series is any data set where the values are measured at different points in time many time series are uniformly spaced  at a specific frequency for example hourly weather measurements daily counts of web site visits or monthly sales totals time series can also be irregularly spaced and sporadic for example timestamped data in a computer systems event log or a history of 911 emergency calls pandas time series tools apply equally well to either type of time series'

# 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
tokenizer = ToktokTokenizer()

string = tokenizer.tokenize(test_string, return_str=True)
string

'In the broadest dêfinition , a time series is any data set where the values are measured at different points in time. Many time series are uniformly spaced $ $ $ at a specific frequency , for example , hourly weather measurements , daily counts of web site visits , or monthly sales totals. Time series can also be irregularly spaced and sporadic , for example , timestamped data in a computer system ’ s event log or a history of 911 emergency calls. Pandas time series tools apply equally well to either type of time series .'

##### Test Function:

In [9]:
from prepare import tokenize

In [10]:
string = tokenize(test_string)
string

'In the broadest dêfinition , a time series is any data set where the values are measured at different points in time. Many time series are uniformly spaced $ $ $ at a specific frequency , for example , hourly weather measurements , daily counts of web site visits , or monthly sales totals. Time series can also be irregularly spaced and sporadic , for example , timestamped data in a computer system ’ s event log or a history of 911 emergency calls. Pandas time series tools apply equally well to either type of time series .'

# 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [11]:
ps = nltk.porter.PorterStemmer()
stems = [ps.stem(word) for word in test_string.split()]
stems[:5]

['in', 'the', 'broadest', 'dêfinition,', 'a']

##### Test Function:

In [12]:
from prepare import stem

In [13]:
stems = stem(test_string)
stems[:5]

'in th'

# 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [14]:
wnl = nltk.stem.WordNetLemmatizer()
lemmas = [wnl.lemmatize(word) for word in test_string.split()]
lemmas[:5]

['In', 'the', 'broadest', 'dêfinition,', 'a']

##### Test Function:

In [15]:
from prepare import lemmatize

In [16]:
lemmas = lemmatize(test_string)
lemmas[:5]

'In th'

# 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
-This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [17]:
stopword_list = stopwords.words('english')
# Remove stopwords to remove
exclude_words = ['are']
for word in exclude_words:
    stopword_list.remove(word)
# Adds additional stopwords
extra_words = ['data']
for word in extra_words:
    stopword_list.append(word)
words = test_string.split()
filtered_words = [w for w in words if w not in stopword_list]
print('Removed {} stopwords'.format(len(words) - len(filtered_words)))

stopwordless_string = ' '.join(filtered_words)
print('-----')
stopwordless_string

Removed 26 stopwords
-----


'In broadest dêfinition, time series set values are measured different points time. Many time series are uniformly spaced $$$ specific frequency, example, hourly weather measurements, daily counts web site visits, monthly sales totals. Time series also irregularly spaced sporadic, example, timestamped computer system’s event log history 911 emergency calls. Pandas time series tools apply equally well either type time series.'

##### Test Function:

In [18]:
from prepare import remove_stopwords

In [19]:
exclude_words = ['the']
extra_words = ['data','hourly']

string = remove_stopwords(test_string,extra_words,exclude_words)
string

Removed 27 stopwords


'In the broadest dêfinition, time series set the values measured different points time. Many time series uniformly spaced $$$ specific frequency, example, weather measurements, daily counts web site visits, monthly sales totals. Time series also irregularly spaced sporadic, example, timestamped computer system’s event log history 911 emergency calls. Pandas time series tools apply equally well either type time series.'

##### Altogether:

In [20]:
from prepare import full_clean

In [21]:
cleaned_string = full_clean(test_string, extra_words, exclude_words)
cleaned_string

Removed 28 stopwords


'the broadest definition time series set the value measured different point time many time series uniformly spaced specific frequency example weather measurement daily count web site visit monthly sale total time series also irregularly spaced sporadic example timestamped computer system event log history 911 emergency call panda time series tool apply equally well either type time series'

# 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df. 

In [38]:
articles = acquire.get_news_articles()

In [39]:
news_df = pd.DataFrame(articles)
news_df

Unnamed: 0,title,content,category
0,Rupee hits 80 per US dollar for the first time...,The Indian rupee touched 80 per US dollar for ...,Business
1,ED arrests ex-Mumbai Police chief Sanjay Pande...,The Enforcement Directorate (ED) on Tuesday ar...,Business
2,Gautam Adani overtakes Bill Gates to become wo...,Gautam Adani has overtaken Bill Gates to becom...,Business
3,Who are now the world's 10 richest people as A...,Gautam Adani has overtaken Bill Gates to becom...,Business
4,List of items exempt from GST when sold loose ...,Amid criticism over pre-packaged and pre-label...,Business
...,...,...,...
95,I salute Sushmita Sen for living life on her o...,Filmmaker Mahesh Bhatt defended Sushmita Sen a...,Entertainment
96,Joe Russo arrives in Mumbai ahead of 'The Gray...,Filmmaker Joe Russo has arrived in Mumbai ahea...,Entertainment
97,Told them to get lost: Mallika Sherawat on bei...,Actress Mallika Sherawat revealed she was offe...,Entertainment
98,"Jackie Chan opened doors for me in H'wood, he'...","Talking about Jackie Chan, Mallika Sherawat sa...",Entertainment


# 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [24]:
blogs = acquire.acquire_codeup_blog()

In [25]:
codeup_df = pd.DataFrame(blogs)
codeup_df

Unnamed: 0,Title,Published,Content
0,Project Quest Info Session: IT Jumpstart on Ma...,"May 11, 2022",Join our grant partner Project Quest as they d...
1,Codeup Dallas: How to Succeed at a Coding Boot...,"May 23, 2022",This event is the perfect opportunity for peop...
2,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 14, 2022",Have you been considering a career in Cloud Ad...
3,5 Reasons To Attend Our New Cloud Administrati...,"May 17, 2022",Come Work In The Cloud\nWhen your Monday rolls...
4,In-Person Workshop: Learn to Code – Python on ...,"Jun 20, 2022","According to LinkedIn, the “#1 Most Promising ..."


# 8. For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

### News:

In [73]:
extra_words = []
exclude_words = []
df_holder = []

for rows in news_df.index:
    row = {}
    title = news_df.iloc[rows][0]
    content = news_df.iloc[rows][1]
    
    row['title'] = title
    row['content'] = content
    
    news_df_cleaned = basic_clean(content)
    news_df_cleaned = tokenize(news_df_cleaned)
    news_df_cleaned = remove_stopwords(news_df_cleaned)
    
    row['clean'] = news_df_cleaned
    
    news_df_stem = full_clean(content,extra_words,exclude_words,stem_or_lemma = 'stem')
    row['stemmed'] = news_df_stem

    news_df_lemma = full_clean(content,extra_words,exclude_words)
    row['lemmatized'] = news_df_lemma
    
    df_holder.append(row)
pd.DataFrame(df_holder)

Removed 20 stopwords
Removed 19 stopwords
Removed 19 stopwords
Removed 16 stopwords
Removed 15 stopwords
Removed 15 stopwords
Removed 20 stopwords
Removed 17 stopwords
Removed 18 stopwords
Removed 17 stopwords
Removed 16 stopwords
Removed 16 stopwords
Removed 21 stopwords
Removed 20 stopwords
Removed 19 stopwords
Removed 27 stopwords
Removed 25 stopwords
Removed 25 stopwords
Removed 22 stopwords
Removed 18 stopwords
Removed 20 stopwords
Removed 24 stopwords
Removed 21 stopwords
Removed 22 stopwords
Removed 15 stopwords
Removed 14 stopwords
Removed 14 stopwords
Removed 19 stopwords
Removed 18 stopwords
Removed 18 stopwords
Removed 25 stopwords
Removed 24 stopwords
Removed 24 stopwords
Removed 22 stopwords
Removed 20 stopwords
Removed 21 stopwords
Removed 22 stopwords
Removed 21 stopwords
Removed 21 stopwords
Removed 23 stopwords
Removed 21 stopwords
Removed 21 stopwords
Removed 22 stopwords
Removed 20 stopwords
Removed 21 stopwords
Removed 20 stopwords
Removed 20 stopwords
Removed 20 st

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Rupee hits 80 per US dollar for the first time...,The Indian rupee touched 80 per US dollar for ...,indian rupee touched 80 per us dollar first ti...,indian rupe touch 80 per us dollar first time ...,indian rupee touched 80 per u dollar first tim...
1,ED arrests ex-Mumbai Police chief Sanjay Pande...,The Enforcement Directorate (ED) on Tuesday ar...,enforcement directorate ed tuesday arrested fo...,enforc director ed tuesday arrest former mumba...,enforcement directorate ed tuesday arrested fo...
2,Gautam Adani overtakes Bill Gates to become wo...,Gautam Adani has overtaken Bill Gates to becom...,gautam adani overtaken bill gates become world...,gautam adani ha overtaken bill gate becom worl...,gautam adani ha overtaken bill gate become wor...
3,Who are now the world's 10 richest people as A...,Gautam Adani has overtaken Bill Gates to becom...,gautam adani overtaken bill gates become world...,gautam adani ha overtaken bill gate becom worl...,gautam adani ha overtaken bill gate become wor...
4,List of items exempt from GST when sold loose ...,Amid criticism over pre-packaged and pre-label...,amid criticism prepackaged prelabelled food it...,amid critic prepackag prelabel food item get c...,amid criticism prepackaged prelabelled food it...
...,...,...,...,...,...
95,I salute Sushmita Sen for living life on her o...,Filmmaker Mahesh Bhatt defended Sushmita Sen a...,filmmaker mahesh bhatt defended sushmita sen t...,filmmak mahesh bhatt defend sushmita sen wa tr...,filmmaker mahesh bhatt defended sushmita sen w...
96,Joe Russo arrives in Mumbai ahead of 'The Gray...,Filmmaker Joe Russo has arrived in Mumbai ahea...,filmmaker joe russo arrived mumbai ahead premi...,filmmak joe russo ha arriv mumbai ahead premie...,filmmaker joe russo ha arrived mumbai ahead pr...
97,Told them to get lost: Mallika Sherawat on bei...,Actress Mallika Sherawat revealed she was offe...,actress mallika sherawat revealed offered tesh...,actress mallika sherawat reveal wa offer teshe...,actress mallika sherawat revealed wa offered t...
98,"Jackie Chan opened doors for me in H'wood, he'...","Talking about Jackie Chan, Mallika Sherawat sa...",talking jackie chan mallika sherawat said jack...,talk jacki chan mallika sherawat said jacki ch...,talking jackie chan mallika sherawat said jack...


### Blogs:

In [79]:
extra_words = []
exclude_words = []
df_holder = []

for rows in codeup_df.index:
    row = {}
    title = codeup_df.iloc[rows][0]
    content = codeup_df.iloc[rows][2]
    
    row['title'] = title
    row['content'] = content
    
    news_df_cleaned = basic_clean(content)
    news_df_cleaned = tokenize(news_df_cleaned)
    news_df_cleaned = remove_stopwords(news_df_cleaned)
    
    row['clean'] = news_df_cleaned
    
    news_df_stem = full_clean(content,extra_words,exclude_words,stem_or_lemma = 'stem')
    row['stemmed'] = news_df_stem

    news_df_lemma = full_clean(content,extra_words,exclude_words)
    row['lemmatized'] = news_df_lemma
    
    df_holder.append(row)
pd.DataFrame(df_holder)

Removed 27 stopwords
Removed 25 stopwords
Removed 27 stopwords
Removed 48 stopwords
Removed 45 stopwords
Removed 48 stopwords
Removed 142 stopwords
Removed 135 stopwords
Removed 141 stopwords
Removed 530 stopwords
Removed 513 stopwords
Removed 528 stopwords
Removed 65 stopwords
Removed 63 stopwords
Removed 64 stopwords


Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Project Quest Info Session: IT Jumpstart on Ma...,Join our grant partner Project Quest as they d...,join grant partner project quest discuss lates...,join grant partner project quest discuss lates...,join grant partner project quest discus latest...
1,Codeup Dallas: How to Succeed at a Coding Boot...,This event is the perfect opportunity for peop...,event perfect opportunity people wondering exp...,thi event perfect opportun peopl wonder expect...,event perfect opportunity people wondering exp...
2,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...,considering career cloud administration idea j...,consid career cloud administr idea job titl po...,considering career cloud administration idea j...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...,come work cloud monday rolls around start get ...,come work cloud monday roll around start get s...,come work cloud monday roll around start get s...
4,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc one m...,according linkedin 1 promising job data scienc...


# 9. Ask yourself:

### If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?

- Lemmatized, almost definitely

### If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?

- Probably lemmatize, maybe stem it

### If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

- Almost certainly stem