In [56]:
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
import json 

import acquire

import warnings
warnings.filterwarnings('ignore')

## 1. function that takes a string and applies some basic text cleaning
   - lowercase everything
   - normalize unicode
   - replace anything not a letter, number, whitespace, or single quote

In [57]:
def basic_clean(string):
    '''Lowercase a string, reduce it to plain ASCII, and strip punctuation.

    Steps, in order:
      1. lowercase everything
      2. NFKD-normalize, then encode to ASCII ignoring errors, so accented
         letters lose their accents and any remaining non-ASCII characters
         are dropped
      3. delete every character that is not a lowercase letter, a digit,
         a single quote, or whitespace

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace and single quotes are kept unchanged.
    '''
    # make everything lowercase
    string = string.lower()

    # normalize: decompose accented characters, then drop whatever is not ASCII
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')

    # clunky character removal; the pattern is a raw string so that \s reaches
    # the regex engine instead of being an invalid string escape
    string = re.sub(r"[^a-z0-9'\s]", '', string)

    return string

In [7]:
# Demo: accented names and punctuation should come back as plain lowercase ASCII text.
basic_clean("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 2. function named tokenize. It should take in a string and tokenize all the words in the string.

In [53]:
def tokenize(string, charms = True):
    '''Tokenize a string with NLTK's ToktokTokenizer.

    charms is forwarded to the tokenizer as its return_str argument:
    left at True the tokens come back as one string, set to False
    they come back as a list of token strings.'''

    # build the tokenizer and hand the text straight to it
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str = charms)

In [54]:
# Demo with the default charms=True: the tokens come back inside a single string.
tokenize("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős ' s name contains the Hungarian letter ' ő ' ( ' o ' with double acute accent ) , but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [55]:
# Demo with charms=False (passed positionally): the result is a list of tokens.
# The sample sentence is cut short here, ending after "Erdos or ".
tokenize("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or ", False)

['Paul',
 'Erdős',
 'and',
 'George',
 'Pólya',
 'were',
 'influential',
 'Hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field.',
 'Erdős',
 "'",
 's',
 'name',
 'contains',
 'the',
 'Hungarian',
 'letter',
 "'",
 'ő',
 "'",
 '(',
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 ')',
 ',',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'Erdos',
 'or']

## 3. function named stem. It should accept some text and return the text after applying stemming to all the words.

In [16]:
def stem(string):
    '''
    Split the text on whitespace, run every piece through the
    Porter stemmer, and return the stems joined by single spaces. '''

    stemmer = nltk.porter.PorterStemmer()

    # stem word by word, then glue the results back into one string
    return ' '.join(stemmer.stem(token) for token in string.split())

In [17]:
# Demo on raw (uncleaned) text: punctuation stays attached to the words that get stemmed.
stem("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

"paul erdő and georg pólya were influenti hungarian mathematician who contribut a lot to the field. erdős' name contain the hungarian letter 'ő' ('o' with doubl acut accent), but is often incorrectli written as erdo or erdö either by mistak or out of typograph necess"

In [14]:
def stem_tall(string):
    '''
    Split the text on whitespace and return a list holding the
    Porter stem of each piece, in the original order. '''

    stemmer = nltk.porter.PorterStemmer()

    # one stem per whitespace-separated word
    return list(map(stemmer.stem, string.split()))

In [58]:
# Demo: same sample sentence as the other cells, except that "Erdőss" is written
# without the apostrophe in this input.
stem_tall("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdőss name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

['paul',
 'erdő',
 'and',
 'georg',
 'pólya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field.',
 'erdőss',
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 "'ő'",
 "('o'",
 'with',
 'doubl',
 'acut',
 'accent),',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdö',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

## 4. function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [22]:
def lemmatize(string):
    '''
    This function takes a string and
    returns a string of words lemmatized.

    The text is split on whitespace, each piece is passed to
    WordNetLemmatizer.lemmatize with no part-of-speech argument,
    and the lemmas are joined back together with single spaces.
    '''
    wnl = nltk.stem.WordNetLemmatizer()

    # lemmatize word by word; punctuation attached to a word is passed along with it
    lemmas = [wnl.lemmatize(word) for word in string.split()]

    return ' '.join(lemmas)

In [23]:
def lemmatize_tall(string):
    '''
    Split the text on whitespace and return a list holding the
    WordNet lemma of each piece, in the original order. '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # one lemma per whitespace-separated word
    return list(map(lemmatizer.lemmatize, string.split()))

In [24]:
# Demo on raw (uncleaned) text; the result is a single string.
lemmatize("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

"Paul Erdős and George Pólya were influential Hungarian mathematician who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written a Erdos or Erdös either by mistake or out of typographical necessity"

In [25]:
# Same sample sentence, with the lemmas returned as a list instead of one string.
lemmatize_tall("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

['Paul',
 'Erdős',
 'and',
 'George',
 'Pólya',
 'were',
 'influential',
 'Hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field.',
 "Erdős's",
 'name',
 'contains',
 'the',
 'Hungarian',
 'letter',
 "'ő'",
 "('o'",
 'with',
 'double',
 'acute',
 'accent),',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'Erdos',
 'or',
 'Erdös',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

## 5. function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
## *get assistance for parameters*

In [33]:
def remove_stopwords(text, extra_words = None, exclude_words = None):
    ''' This function takes in text, removes the standard
    english stopwords from it and returns a list of strings.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to keep even though they are in the stopword list.
        If a word is in both extra_words and exclude_words, it is kept.

    Returns
    -------
    list of str
        The words of text that are not stopwords, in their original order.
        Matching is exact, so case and attached punctuation matter.
    '''

    # for english stopwords; a set gives constant-time membership tests
    sw = set(stopwords.words('english'))

    # apply the caller's adjustments; None means "no adjustment"
    sw.update(extra_words or ())
    sw.difference_update(exclude_words or ())

    return [word for word in text.split() if word not in sw]

In [34]:
# Demo with the default arguments on raw text; the result is a list of the remaining words.
remove_stopwords("Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity")

['Paul',
 'Erdős',
 'George',
 'Pólya',
 'influential',
 'Hungarian',
 'mathematicians',
 'contributed',
 'lot',
 'field.',
 "Erdős's",
 'name',
 'contains',
 'Hungarian',
 'letter',
 "'ő'",
 "('o'",
 'double',
 'acute',
 'accent),',
 'often',
 'incorrectly',
 'written',
 'Erdos',
 'Erdös',
 'either',
 'mistake',
 'typographical',
 'necessity']

## 6. news_df using acquire file

In [30]:
# Get the news articles from the acquire module imported at the top, then display the result.
news_df = acquire.get_news_articles()
news_df

Unnamed: 0,title,content,category
0,"[Drunk man smoked in toilet, another peed on w...",[A drunk passenger smoked in the toilet on a P...,business
1,"[Kohli shares pic with Anushka, daughter walki...",[Virat Kohli took to social media to share a p...,sports
2,[Musk enters Guinness World Records for the la...,[Elon Musk has officially broken the world rec...,technology
3,[Old pics of Shah Rukh Khan and Angelina Jolie...,[Old pictures of actors Shah Rukh Khan and Ang...,entertainment


## 7. codeup_df using acquire file

In [32]:
# Get the blog articles from the acquire module imported at the top, then display the result.
codeup_df = acquire.get_blog_articles()
codeup_df



  codeup_soup = BeautifulSoup(url_response.text)


Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,\nAre you feeling unfulfilled in your work but...
1,Hiring Tech Talent Around the Holidays,\nAre you a hiring manager having trouble fill...
2,Cloud Administration Program New Funding Options,\nFinding resources to fund your educational g...
3,Why Dallas is a Great Location for IT Professi...,"\nWhen breaking into a new career, it is impor..."
4,Codeup is ranked #1 Best in DFW 2022,\nWe are excited to announce that Codeup ranke...
5,Codeup’s Scholarship Offerings,\nIn honor of November being National Scholars...


## 8. adding columns for each df
### *big honkin function alert*

In [None]:
def df_modifier(df):
    '''Keep the untouched article text in a new 'original' column.

    The frame is modified in place and the same frame is also returned,
    so both `df_modifier(df)` and `df = df_modifier(df)` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'content' column.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, now with an 'original' column.
    '''
    # NOTE(review): the section heading promises several added columns;
    # only 'original' is produced so far.
    # bracket access, so the lookup always targets the 'content' column
    df['original'] = df['content']

    return df

In [59]:
#df['some_column'] = df['old_col'].apply(some_function)

## 9. Ask yourself
 * 493KB: lemmatize
 * 25MB: lemmatize
 * 200TB: *(TODO: not answered yet)*