# In this exercise we will define some functions to prepare textual data. These functions should apply equally well to both the Codeup blog articles and the news articles that were previously acquired.




In [1]:
import pandas as pd

import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords


# 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote with a space.


In [2]:
# Sample sentence used throughout the walkthrough; it mixes accented
# characters, punctuation and capital letters so every cleaning step has
# something to act on.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [3]:
# Walkthrough step 1: lowercase the sample sentence.
new = original.lower()
new

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [4]:
# Walkthrough step 2: NFKD-normalize, then round-trip through ASCII with
# errors ignored so anything that cannot be encoded as ASCII is dropped
# (the output below shows accented letters reduced to their base letter).
new = unicodedata.normalize('NFKD' , new)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
# Walkthrough step 3: replace every character that is not a-z, 0-9, a single
# quote or whitespace with a space (so punctuation leaves extra spaces behind).
new = re.sub(r'[^a-z0-9\'\s]', ' ', new)
new

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field  erdos's name contains the hungarian letter 'o'  'o' with double acute accent   but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
def basic_clean(original):
    '''
    Apply basic cleaning to a text string.

    Input: original text (str)
    Actions:
    lowercases everything,
    normalizes unicode characters (anything that cannot be encoded as ASCII
    after NFKD normalization is dropped),
    replaces anything that is not a lowercase letter, number, whitespace or
    single quote with a space
    Output: cleaned text (str)
    '''
    # lowercase everything
    basic_cleaned = original.lower()
    # normalize unicode characters: NFKD splits an accented letter into its
    # base letter plus combining marks, and encoding to ASCII with
    # errors='ignore' then discards the marks
    basic_cleaned = unicodedata.normalize('NFKD', basic_cleaned)\
        .encode('ascii', 'ignore')\
        .decode('utf-8')
    # run the substitution on the value computed above so the result depends
    # only on the argument, never on notebook-level variables
    basic_cleaned = re.sub(r'[^a-z0-9\'\s]', ' ', basic_cleaned)

    return basic_cleaned
    

In [7]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [9]:
# Run basic_clean on the sample sentence; the result feeds the tokenize step.
basic_cleaned = basic_clean(original)

In [10]:
basic_cleaned

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field  erdos's name contains the hungarian letter 'o'  'o' with double acute accent   but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [12]:
# NOTE(review): the name `tokenize` is rebound by the `def tokenize(...)` cell
# further down. Once that cell has run, `tokenize.tokenize(...)` no longer
# refers to this tokenizer instance, so the next cell cannot be re-run.
tokenize = nltk.tokenize.ToktokTokenizer()# created tokenizer

In [16]:
# NOTE(review): this overwrites `original` with its tokenized form, so every
# later cell that reads `original` (including the stemming walkthrough) sees
# the tokenized text rather than the raw sentence.
original = tokenize.tokenize(original, return_str = True)

In [17]:
def tokenize(basic_cleaned):
    '''
    Tokenize a cleaned text string with NLTK's ToktokTokenizer.

    Input: basic_cleaned text string
    Output: the tokenized text, returned as one string rather than a list
    (return_str=True)
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(basic_cleaned, return_str=True)

In [19]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős ' s name contains the Hungarian letter ' ő ' ( ' o ' with double acute accent ) , but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [18]:
# Tokenize the cleaned sample sentence; the result feeds the stem step.
clean_tokenize = tokenize(basic_cleaned)


In [20]:
clean_tokenize


"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [21]:
# Porter stemmer instance used by the walkthrough cells that follow.
stemmer = nltk.porter.PorterStemmer()#create porter stemmer

In [24]:
# NOTE(review): `original` was overwritten earlier with the tokenized (but not
# lowercased or cleaned) sentence, so this walkthrough stems different text
# from what stem() receives below (clean_tokenize) - confirm this is intended.
stems = [stemmer.stem(word) for word in original.split()]    #use stemmer - apply stem to each word in our string


In [25]:
# Join the stemmed words back into a single space-separated string.
article_stemmed = ' '.join(stems)

In [27]:
def stem(clean_tokenize):
    '''
    Reduce every word of a tokenized text string to its Porter stem.

    Input: clean_tokenize text string
    Output: text string of the stemmed words joined by single spaces
    '''
    porter = nltk.porter.PorterStemmer()
    # split() with no argument breaks the text on any run of whitespace
    return ' '.join(porter.stem(token) for token in clean_tokenize.split())
    

In [28]:
# Stem the cleaned, tokenized sample sentence.
clean_tokenize_stem = stem(clean_tokenize)

In [29]:
clean_tokenize_stem

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

# 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.