# In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.




In [1]:
import pandas as pd

import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup

# 1.Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.


In [2]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
new = original.lower()
new

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [4]:
new = unicodedata.normalize('NFKD' , new)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
new = re.sub(r'[^a-z0-9\'\s]', ' ', new)
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field  erdos's name contains the hungarian letter 'o'  'o' with double acute accent   but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
def basic_clean(original):
    '''
    Input: original text 
    lowercase everything,
    normalize everything, 
    removes anything thats not letter, number, whitespace or single quote
    Output: Cleaned text
    '''
    basic_cleaned= original.lower()#lowercase everything
    basic_cleaned = unicodedata.normalize('NFKD', basic_cleaned)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
 # normalize unicode character
    basic_cleaned = re.sub(r'[^a-z0-9\'\s]', ' ', new)

    return basic_cleaned
    

In [7]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [8]:
basic_cleaned = basic_clean(original)

In [9]:
basic_cleaned

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field  erdos's name contains the hungarian letter 'o'  'o' with double acute accent   but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [10]:
tokenize = nltk.tokenize.ToktokTokenizer()# created tokenizer

In [11]:
original = tokenize.tokenize(original, return_str = True)

In [12]:
def tokenize(basic_cleaned):
    '''
    Input: basic_cleaned text string
    Actions:
    creates the tokenizer
    uses the tokenizer
    Output: clean_tokenize text string
    '''
    #create the tokenizer
    tokenize = nltk.tokenize.ToktokTokenizer()
    #use the tokenizer
    clean_tokenize = tokenize.tokenize(basic_cleaned, return_str=True)
    
    return clean_tokenize

In [13]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős ' s name contains the Hungarian letter ' ő ' ( ' o ' with double acute accent ) , but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [14]:
clean_tokenize = tokenize(basic_cleaned)


In [15]:
clean_tokenize


"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [16]:
stemmer = nltk.porter.PorterStemmer()#create porter stemmer

In [17]:
stems = [stemmer.stem(word) for word in original.split()]    #use stemmer - apply stem to each word in our string


In [18]:
article_stemmed = ' '.join(stems)

In [19]:
def stem(clean_tokenize):
    '''
    Inputs: clean_tokenize
    Actions: creates and uses stemmer for each word
    Output: clean_tokenize_stem
    '''
    stemmer = nltk.porter.PorterStemmer()
    stems = [stemmer.stem(word) for word in clean_tokenize.split()]
    clean_tokenize_stem = ' '.join(stems)

    return clean_tokenize_stem
    

In [20]:
clean_tokenize_stem = stem(clean_tokenize)

In [21]:
clean_tokenize_stem

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

# 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [22]:
wnl = nltk.stem.WordNetLemmatizer()


In [23]:
lemmas = [wnl.lemmatize(word) for word in original.split()]


In [24]:
article_lemma = ' '.join(lemmas)

In [25]:
def lemmatize(clean_tokenize):
    '''
    Input: clean_tokenize
    Actions: cretaes lemmatizer and applies to each word
    Outputs: clean_tokenize_lemma
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in clean_tokenize.split()]
    clean_tokenize_lemma = ' '.join(lemmas)

    return clean_tokenize_lemma
    

    

In [26]:
clean_tokenize_lemma = lemmatize(clean_tokenize)
clean_tokenize_lemma

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

# 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.



In [27]:
stopwords_ls = stopwords.words('english')

In [28]:
words = article_lemma.split()

In [29]:
stopwords_ls.append(' ')

In [30]:
filtered = [word for word in words if word not in stopwords_ls]


In [31]:
parsed_aricles = ' '.join(filtered)

In [32]:
def remove_stopwords(lemma_or_stem, extra_words=[], exclude_words=[]):
    '''
    Input:text string or .apply(remove_stopwords) to entire data frame
    Action: removes standard stop words
    Output: parsed_article
    '''
    # save stopwords
    stopwords_ls = stopwords.words('english')
    # removing any stopwords in exclude list
    stopwords_ls = set(stopwords_ls) - set(exclude_words)
    # adding any stopwords in extra list
    stopwords_ls = stopwords_ls.union(set(extra_words))
    
    # split words in article
    words = lemma_or_stem.split()
    # remove stopwords from list of words
    filtered = [word for word in words if word not in stopwords_ls]
    # join words back together
    parsed_article = ' '.join(filtered)
    
    return parsed_article

In [33]:
parsed_article = remove_stopwords(clean_tokenize_lemma)
parsed_article

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

# 6.Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.



In [34]:
import acquire as a

In [35]:
a.acquire_info()


get_blog_articles2()
scrape_one_page(topic)
get_news_articles(topic_list)


In [36]:
topic_list = ['business', 'sports', 'technology', 'entertainment']


In [37]:
final_list = a.get_news_articles(topic_list)


In [38]:
news_df = pd.DataFrame(final_list)
news_df


Unnamed: 0,category,title,content
0,business,Jio Financial Services' Q2 profit jumps 101% Q...,Jio Financial Services posted a net profit of ...
1,business,LinkedIn lays off 668 employees,Microsoft-owned LinkedIn on Monday said it wou...
2,business,6-month notice period for pilots not unreasona...,Akasa Air CEO Vinay Dube said a notice period ...
3,business,Some working overtime to harm us: Adani Group ...,Adani Group has reacted amid allegations of TM...
4,business,Oil prices steady above $90 as investors asses...,"Brent oil prices steadied above $90 (over ₹7,4..."
5,business,SC rejects telcos' plea to see licence fee as ...,The Supreme Court on Monday rejected a request...
6,business,SpiceJet stock dip amid 'Gangwal not intereste...,SpiceJet's shares tanked 11% on Monday after a...
7,business,"HDFC Bank's Q2 profit jumps 50% to ₹15,976 crore",HDFC Bank on Monday reported a net profit of o...
8,business,What is the TCS bribes-for-jobs scandal?,The bribes-for-jobs scandal at Tata Consultanc...
9,business,"BioNTech warns of write-off of up to ₹7,888 cr...",Germany's BioNTech flagged write-downs of up t...


# 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.



# 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [43]:
news_df['clean_norm_token'] = news_df['content'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)


In [44]:
news_df['stemmed'] = news_df.clean_norm_token.apply(stem)


In [45]:
news_df['lemmatized'] = news_df.clean_norm_token.apply(lemmatize)


In [46]:
news_df.head()


Unnamed: 0,category,title,content,clean_norm_token,stemmed,lemmatized
0,business,Jio Financial Services' Q2 profit jumps 101% Q...,Jio Financial Services posted a net profit of ...,paul erdos george polya influential hungarian ...,paul erdo georg polya influenti hungarian math...,paul erdos george polya influential hungarian ...
1,business,LinkedIn lays off 668 employees,Microsoft-owned LinkedIn on Monday said it wou...,paul erdos george polya influential hungarian ...,paul erdo georg polya influenti hungarian math...,paul erdos george polya influential hungarian ...
2,business,6-month notice period for pilots not unreasona...,Akasa Air CEO Vinay Dube said a notice period ...,paul erdos george polya influential hungarian ...,paul erdo georg polya influenti hungarian math...,paul erdos george polya influential hungarian ...
3,business,Some working overtime to harm us: Adani Group ...,Adani Group has reacted amid allegations of TM...,paul erdos george polya influential hungarian ...,paul erdo georg polya influenti hungarian math...,paul erdos george polya influential hungarian ...
4,business,Oil prices steady above $90 as investors asses...,"Brent oil prices steadied above $90 (over ₹7,4...",paul erdos george polya influential hungarian ...,paul erdo georg polya influenti hungarian math...,paul erdos george polya influential hungarian ...


# 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - lemmatize.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - lemmatize.
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
   - stemmed.

