In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire


In [2]:
df = acquire.news_get_data()

## Exercises

The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:
    - Lowercase everything
    - Normalize unicode characters
    - Replace anything that is not a letter, number, whitespace or a single quote.


In [3]:
string = df.loc[0,'content']
string

"Adani Transmission has entered the club of India's top 10 most valuable companies, surpassing LIC and HDFC in market value. The Gautam Adani-led company has become India's eighth most valued company, with a market capitalisation of ₹4.42 lakh crore, after its shares hit all-time high on Tuesday. Notably, Adani is the world's third richest person with a $137 billion fortune.  "

In [42]:


def basic_string_clean(string: str, strip=True, lower=True, normalize=True, drop_special=True, drop_punctuation=True) -> str:
    """Returns the same string with the following alterations by default:
    - convert all chars to lowercase
    - maps charcters to fit within ASCII character set (converts accented chars to unaccented counterparts)
    - drops anything that didn't get mapped
    - removes special characters and punctuation
    TODO: Hyphen strategy argument?
    """
    import re
    if strip:
        string = string.strip()
    if lower:
        string = string.lower()
    if normalize:
        string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    if drop_special:
        string = re.sub(r"[\n-]", ' ', string) # Hyphen strategy for now
        regex = r"[^\w\d\s\.\?\!\:\,\']|_"
        string = re.sub(regex, '', string)
    if drop_punctuation:
        regex = r"[\.\?\!\:\,]"
        string = re.sub(regex, '', string)

    return string

basic_string_clean(string)


'codeups dallas campus has a new location for more than two years codeup has been operating out of our original building on commerce st in dallas but recently codeup has moved and tripled our campus space our new address is 900 jackson street suite 400 dallas tx 75202  codeup has been a leading coding boot camp in texas for more than eight years with active campuses in san antonio and dallas offering accelerated programs in data science cloud administration and full stack web development dallas tech scene with dallas tech scene booming it is imperative that the appropriate talent is available we help produce top talent at a fast pace to keep up with the demand for tech jobs in the community codeup dallas offerings currently codeup offers a 20 week full stack web development program at our dallas location with hopes to add our other programs in the future dallas campus area located in the heart of downtown dallas our new location is surrounded by great restaurants museums monuments and 

### 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.
    


#### Using toktok

In [5]:
cleaned = basic_string_clean(string)
nopunc = basic_string_clean(string, drop_punctuation=True)

In [6]:
toktok = nltk.tokenize.ToktokTokenizer()

toktok.tokenize(cleaned, return_str=True)

"adani transmission has entered the club of india ' s top 10 most valuable companies , surpassing lic and hdfc in market value. the gautam adani led company has become india ' s eighth most valued company , with a market capitalisation of 4.42 lakh crore , after its shares hit all time high on tuesday. notably , adani is the world ' s third richest person with a 137 billion fortune ."

In [7]:
def word_tokenize(string:str) -> list:
    from nltk.tokenize.toktok import ToktokTokenizer
    toktok = ToktokTokenizer()
    return toktok.tokenize(string)

In [8]:
' '.join(word_tokenize(nopunc))

"adani transmission has entered the club of india ' s top 10 most valuable companies surpassing lic and hdfc in market value the gautam adani led company has become india ' s eighth most valued company with a market capitalisation of 442 lakh crore after its shares hit all time high on tuesday notably adani is the world ' s third richest person with a 137 billion fortune"

#### Using punkt

In [9]:
nltk.sent_tokenize(cleaned)

["adani transmission has entered the club of india's top 10 most valuable companies, surpassing lic and hdfc in market value.",
 "the gautam adani led company has become india's eighth most valued company, with a market capitalisation of 4.42 lakh crore, after its shares hit all time high on tuesday.",
 "notably, adani is the world's third richest person with a 137 billion fortune."]

In [10]:
nltk.sent_tokenize(nopunc)

["adani transmission has entered the club of india's top 10 most valuable companies surpassing lic and hdfc in market value the gautam adani led company has become india's eighth most valued company with a market capitalisation of 442 lakh crore after its shares hit all time high on tuesday notably adani is the world's third richest person with a 137 billion fortune"]

In [11]:
text = '''
... Punkt knows that the periods in Mr. Smith and Johann S. Bach
... do not mark sentence boundaries.  And sometimes sentences
... can start with non-capitalized words.  i is a good variable
... name.
... '''

nltk.sent_tokenize( basic_string_clean( text ) )

['punkt knows that the periods in mr. smith and johann s. bach do not mark sentence boundaries.',
 'and sometimes sentences can start with non capitalized words.',
 'i is a good variable name.']

### 3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.
    


In [12]:
tokens = word_tokenize(nopunc)
' '.join(tokens)

"adani transmission has entered the club of india ' s top 10 most valuable companies surpassing lic and hdfc in market value the gautam adani led company has become india ' s eighth most valued company with a market capitalisation of 442 lakh crore after its shares hit all time high on tuesday notably adani is the world ' s third richest person with a 137 billion fortune"

In [13]:
stemmer = nltk.stem.SnowballStemmer('english')
t1 = [stemmer.stem(tok) for tok in tokens]
' '.join(t1)

"adani transmiss has enter the club of india ' s top 10 most valuabl compani surpass lic and hdfc in market valu the gautam adani led compani has becom india ' s eighth most valu compani with a market capitalis of 442 lakh crore after it share hit all time high on tuesday notabl adani is the world ' s third richest person with a 137 billion fortun"

In [14]:
stemmer = nltk.stem.LancasterStemmer()
t2 = [stemmer.stem(tok) for tok in tokens]
' '.join(t2)

"adan transmit has ent the club of ind ' s top 10 most valu company surpass lic and hdfc in market valu the gautam adan led company has becom ind ' s eigh most valu company with a market capit of 442 lakh cror aft it shar hit al tim high on tuesday not adan is the world ' s third richest person with a 137 bil fortun"

In [15]:
stemmer = nltk.stem.PorterStemmer()
t3 = [stemmer.stem(tok) for tok in tokens]
' '.join(t3)

"adani transmiss ha enter the club of india ' s top 10 most valuabl compani surpass lic and hdfc in market valu the gautam adani led compani ha becom india ' s eighth most valu compani with a market capitalis of 442 lakh crore after it share hit all time high on tuesday notabl adani is the world ' s third richest person with a 137 billion fortun"

In [16]:
pd.options.display.max_columns = None
pd.DataFrame({  'orig':tokens,
                'snowball':t1,
                'porter':t3,
                'lancaster':t2,
                }).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
orig,adani,transmission,has,entered,the,club,of,india,',s,top,10,most,valuable,companies,surpassing,lic,and,hdfc,in,market,value,the,gautam,adani,led,company,has,become,india,',s,eighth,most,valued,company,with,a,market,capitalisation,of,442,lakh,crore,after,its,shares,hit,all,time,high,on,tuesday,notably,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortune
snowball,adani,transmiss,has,enter,the,club,of,india,',s,top,10,most,valuabl,compani,surpass,lic,and,hdfc,in,market,valu,the,gautam,adani,led,compani,has,becom,india,',s,eighth,most,valu,compani,with,a,market,capitalis,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notabl,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortun
porter,adani,transmiss,ha,enter,the,club,of,india,',s,top,10,most,valuabl,compani,surpass,lic,and,hdfc,in,market,valu,the,gautam,adani,led,compani,ha,becom,india,',s,eighth,most,valu,compani,with,a,market,capitalis,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notabl,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortun
lancaster,adan,transmit,has,ent,the,club,of,ind,',s,top,10,most,valu,company,surpass,lic,and,hdfc,in,market,valu,the,gautam,adan,led,company,has,becom,ind,',s,eigh,most,valu,company,with,a,market,capit,of,442,lakh,cror,aft,it,shar,hit,al,tim,high,on,tuesday,not,adan,is,the,world,',s,third,richest,person,with,a,137,bil,fortun


Lancaster seems overtly aggressive while Snowball and Porter seem exactly the same.

Researching the differences I've discovered that Snowball is often called Porter2, so I'll stick with snowball.

In [17]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

def stem(word, stemmer: object= stemmer):
    return stemmer.stem(word)

stem('transmission')

'transmiss'

### 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.
    


In [18]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Crux\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Crux\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
stemmer = nltk.stem.WordNetLemmatizer()
t4 = [stemmer.lemmatize(tok) for tok in tokens]
' '.join(t4)

"adani transmission ha entered the club of india ' s top 10 most valuable company surpassing lic and hdfc in market value the gautam adani led company ha become india ' s eighth most valued company with a market capitalisation of 442 lakh crore after it share hit all time high on tuesday notably adani is the world ' s third richest person with a 137 billion fortune"

In [20]:
pd.options.display.max_columns = None
xf = pd.DataFrame({  'orig':tokens,
                'wordnet':t4,
                'snowball':t1,
                }).T

xf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
orig,adani,transmission,has,entered,the,club,of,india,',s,top,10,most,valuable,companies,surpassing,lic,and,hdfc,in,market,value,the,gautam,adani,led,company,has,become,india,',s,eighth,most,valued,company,with,a,market,capitalisation,of,442,lakh,crore,after,its,shares,hit,all,time,high,on,tuesday,notably,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortune
wordnet,adani,transmission,ha,entered,the,club,of,india,',s,top,10,most,valuable,company,surpassing,lic,and,hdfc,in,market,value,the,gautam,adani,led,company,ha,become,india,',s,eighth,most,valued,company,with,a,market,capitalisation,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notably,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortune
snowball,adani,transmiss,has,enter,the,club,of,india,',s,top,10,most,valuabl,compani,surpass,lic,and,hdfc,in,market,valu,the,gautam,adani,led,compani,has,becom,india,',s,eighth,most,valu,compani,with,a,market,capitalis,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notabl,adani,is,the,world,',s,third,richest,person,with,a,137,billion,fortun


In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize(word, lemmatizer: object= lemmatizer):
    return lemmatizer.lemmatize(word)

lemmatize('companies')

'company'

### 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.
    
    This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we _don't_ want to remove.
    


In [22]:
stopwords = nltk.corpus.stopwords.words('english')

In [23]:
def remove_stopwords(tokens:list[str], extra_words:list[str] = [], exclude_words:list[str] = []) -> list[str]:
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords += extra_words
    stopwords = [word for word in stopwords if not word in exclude_words]
    return [word for word in tokens if not word in stopwords]

In [24]:
out = remove_stopwords(tokens, exclude_words=['has'], extra_words=['adani'])
temp = pd.DataFrame({'sans_stopwords':out}).T

In [25]:
xf = pd.concat([xf, temp])
xf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
orig,adani,transmission,has,entered,the,club,of,india,',s,top,10,most,valuable,companies,surpassing,lic,and,hdfc,in,market,value,the,gautam,adani,led,company,has,become,india,',s,eighth,most,valued,company,with,a,market,capitalisation,of,442,lakh,crore,after,its,shares,hit,all,time,high,on,tuesday,notably,adani,is,the,world,',s,third,richest,person,with,a,137.0,billion,fortune
wordnet,adani,transmission,ha,entered,the,club,of,india,',s,top,10,most,valuable,company,surpassing,lic,and,hdfc,in,market,value,the,gautam,adani,led,company,ha,become,india,',s,eighth,most,valued,company,with,a,market,capitalisation,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notably,adani,is,the,world,',s,third,richest,person,with,a,137.0,billion,fortune
snowball,adani,transmiss,has,enter,the,club,of,india,',s,top,10,most,valuabl,compani,surpass,lic,and,hdfc,in,market,valu,the,gautam,adani,led,compani,has,becom,india,',s,eighth,most,valu,compani,with,a,market,capitalis,of,442,lakh,crore,after,it,share,hit,all,time,high,on,tuesday,notabl,adani,is,the,world,',s,third,richest,person,with,a,137.0,billion,fortun
sans_stopwords,transmission,has,entered,club,india,',top,10,valuable,companies,surpassing,lic,hdfc,market,value,gautam,led,company,has,become,india,',eighth,valued,company,market,capitalisation,442,lakh,crore,shares,hit,time,high,tuesday,notably,world,',third,richest,person,137,billion,fortune,,,,,,,,,,,,,,,,,,,,,,,,


### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.
    


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.
    


In [26]:
import acquire

news_df = acquire.news_get_data()
codeup_df = acquire.blog_get_data()

In [27]:
display(news_df.head())
display(codeup_df.head())

Unnamed: 0,title,author,published,category,content
0,Adani Transmission becomes India's 8th most va...,Hiral Goyal,2022-08-30T08:23:58.000Z,business,Adani Transmission has entered the club of Ind...
1,Musk cites whistleblower's claims in new notic...,Ridham Gambhir,2022-08-30T12:02:31.000Z,business,Tesla CEO Elon Musk's legal team has filed ano...
2,No plan to rebrand Zomato app to Eternal: CEO ...,Hiral Goyal,2022-08-30T06:39:28.000Z,business,Zomato CEO Deepinder Goyal clarified in an exc...
3,"Cancelling AC, first-class confirmed train tic...",Ridham Gambhir,2022-08-30T11:23:37.000Z,business,The Finance Ministry stated that cancellation ...
4,China arrests over 230 people tied to its larg...,Hiral Goyal,2022-08-30T04:50:24.000Z,business,China has announced that 234 people who are su...


Unnamed: 0,title,published,category,content
0,Is a Career in Tech Recession-Proof?,"Aug 12, 2022",Cloud Administration,"\nGiven the current economic climate, many eco..."
1,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup News,\nCodeup had a blast at the San Antonio Superh...
2,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",Featured,\nIf you’re considering a career in web develo...
3,Codeup’s New Dallas Campus,"Jul 25, 2022",Codeup News,\nCodeup’s Dallas campus has a new location! F...
4,Codeup TV Commercial,"Jul 20, 2022",Codeup News,\nCodeup has officially made its TV debut! Our...


### 8. For each dataframe, produce the following columns:
    
    - `title` to hold the title
    - `original` to hold the original article/post content
    - `clean` to hold the normalized and tokenized original with the stopwords removed.
    - `stemmed` to hold the stemmed version of the cleaned data.
    - `lemmatized` to hold the lemmatized version of the cleaned data.


In [28]:
news_df['clean'] = news_df.content.apply(basic_string_clean, drop_punctuation=True).apply(word_tokenize)

In [29]:
news_df.head()

Unnamed: 0,title,author,published,category,content,clean
0,Adani Transmission becomes India's 8th most va...,Hiral Goyal,2022-08-30T08:23:58.000Z,business,Adani Transmission has entered the club of Ind...,"[adani, transmission, has, entered, the, club,..."
1,Musk cites whistleblower's claims in new notic...,Ridham Gambhir,2022-08-30T12:02:31.000Z,business,Tesla CEO Elon Musk's legal team has filed ano...,"[tesla, ceo, elon, musk, ', s, legal, team, ha..."
2,No plan to rebrand Zomato app to Eternal: CEO ...,Hiral Goyal,2022-08-30T06:39:28.000Z,business,Zomato CEO Deepinder Goyal clarified in an exc...,"[zomato, ceo, deepinder, goyal, clarified, in,..."
3,"Cancelling AC, first-class confirmed train tic...",Ridham Gambhir,2022-08-30T11:23:37.000Z,business,The Finance Ministry stated that cancellation ...,"[the, finance, ministry, stated, that, cancell..."
4,China arrests over 230 people tied to its larg...,Hiral Goyal,2022-08-30T04:50:24.000Z,business,China has announced that 234 people who are su...,"[china, has, announced, that, 234, people, who..."


In [30]:
news_df['stemmed'] = news_df.clean.apply(lambda x: [stem(word) for word in x])


In [31]:
news_df['lemmatized'] = news_df.clean.apply(lambda x: [lemmatize(word) for word in x])


In [32]:
news_df.head()

Unnamed: 0,title,author,published,category,content,clean,stemmed,lemmatized
0,Adani Transmission becomes India's 8th most va...,Hiral Goyal,2022-08-30T08:23:58.000Z,business,Adani Transmission has entered the club of Ind...,"[adani, transmission, has, entered, the, club,...","[adani, transmiss, has, enter, the, club, of, ...","[adani, transmission, ha, entered, the, club, ..."
1,Musk cites whistleblower's claims in new notic...,Ridham Gambhir,2022-08-30T12:02:31.000Z,business,Tesla CEO Elon Musk's legal team has filed ano...,"[tesla, ceo, elon, musk, ', s, legal, team, ha...","[tesla, ceo, elon, musk, ', s, legal, team, ha...","[tesla, ceo, elon, musk, ', s, legal, team, ha..."
2,No plan to rebrand Zomato app to Eternal: CEO ...,Hiral Goyal,2022-08-30T06:39:28.000Z,business,Zomato CEO Deepinder Goyal clarified in an exc...,"[zomato, ceo, deepinder, goyal, clarified, in,...","[zomato, ceo, deepind, goyal, clarifi, in, an,...","[zomato, ceo, deepinder, goyal, clarified, in,..."
3,"Cancelling AC, first-class confirmed train tic...",Ridham Gambhir,2022-08-30T11:23:37.000Z,business,The Finance Ministry stated that cancellation ...,"[the, finance, ministry, stated, that, cancell...","[the, financ, ministri, state, that, cancel, o...","[the, finance, ministry, stated, that, cancell..."
4,China arrests over 230 people tied to its larg...,Hiral Goyal,2022-08-30T04:50:24.000Z,business,China has announced that 234 people who are su...,"[china, has, announced, that, 234, people, who...","[china, has, announc, that, 234, peopl, who, a...","[china, ha, announced, that, 234, people, who,..."


In [33]:
def make_nlp_cols(series):
    clean = series.apply(basic_string_clean, drop_punctuation=True).apply(word_tokenize).apply(remove_stopwords)
    stemmed = clean.apply(lambda x: [stem(word) for word in x])
    lemmatized = clean.apply(lambda x: [lemmatize(word) for word in x])
    out = pd.concat([clean, stemmed, lemmatized], axis=1)
    out.columns = ['cleaned','stemmed','lemmatized']
    return out

In [34]:
temp = make_nlp_cols(codeup_df.content)

In [35]:
codeup_df = pd.concat([codeup_df, temp], axis=1)

In [36]:
codeup_df.head()

Unnamed: 0,title,published,category,content,cleaned,stemmed,lemmatized
0,Is a Career in Tech Recession-Proof?,"Aug 12, 2022",Cloud Administration,"\nGiven the current economic climate, many eco...","[given, current, economic, climate, many, econ...","[given, current, econom, climat, mani, economi...","[given, current, economic, climate, many, econ..."
1,Codeup X Superhero Car Show & Comic Con,"Aug 10, 2022",Codeup News,\nCodeup had a blast at the San Antonio Superh...,"[codeup, blast, san, antonio, superhero, car, ...","[codeup, blast, san, antonio, superhero, car, ...","[codeup, blast, san, antonio, superhero, car, ..."
2,What Jobs Can You Get After a Coding Bootcamp?...,"Aug 2, 2022",Featured,\nIf you’re considering a career in web develo...,"[youre, considering, career, web, development,...","[your, consid, career, web, develop, dont, kno...","[youre, considering, career, web, development,..."
3,Codeup’s New Dallas Campus,"Jul 25, 2022",Codeup News,\nCodeup’s Dallas campus has a new location! F...,"[codeups, dallas, campus, new, location, two, ...","[codeup, dalla, campus, new, locat, two, year,...","[codeups, dallas, campus, new, location, two, ..."
4,Codeup TV Commercial,"Jul 20, 2022",Codeup News,\nCodeup has officially made its TV debut! Our...,"[codeup, officially, made, tv, debut, communit...","[codeup, offici, made, tv, debut, communiti, s...","[codeup, officially, made, tv, debut, communit..."


In [None]:
"""TODO: debug why apostrophes are dropped from codeup text"""

In [38]:
string = codeup_df.loc[3,'content']

In [68]:
def basic_string_clean(string: str, strip=True, lower=True, normalize=True, drop_special=True, drop_punctuation=True) -> str:
    """Returns the same string with the following alterations by default:
    - convert all chars to lowercase
    - maps charcters to fit within ASCII character set (converts accented chars to unaccented counterparts)
    - drops anything that didn't get mapped
    - removes special characters and punctuation
    TODO: Hyphen strategy argument?
    """
    import re
    if strip:
        string = string.strip()
    if lower:
        string = string.lower()
    if normalize:
        # Handle curly quotes
        charmap = { 0x201c : u'"',
                    0x201d : u'"',
                    0x2018 : u"'",
                    0x2019 : u"'" }
        string = string.translate(charmap)
        # Encode/Decode
        string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    if drop_special:
        string = re.sub(r"[\n-]", ' ', string) # Hyphen strategy for now
        regex = r"[^\w\s\.\?\!\:\,\']|_"
        string = re.sub(regex, '', string)
    if drop_punctuation:
        regex = r"[\.\?\!\:\,]"
        string = re.sub(regex, '', string)

    return string


In [93]:
charmap = { 0x201c : u'"',
            0x201d : u'"',
            0x2018 : u"'",
            0x2019 : u"'" }
print(charmap)

{8220: '"', 8221: '"', 8216: "'", 8217: "'"}


In [97]:
re.match(r'\w', '_')

<re.Match object; span=(0, 1), match='_'>

In [71]:
basic_string_clean(string, strip=True, lower=True, normalize=True, drop_special=True, drop_punctuation=True)


"codeup's dallas campus has a new location for more than two years codeup has been operating out of our original building on commerce st in dallas but recently codeup has moved and tripled our campus space our new address is 900 jackson street suite 400 dallas tx 75202  codeup has been a leading coding boot camp in texas for more than eight years with active campuses in san antonio and dallas offering accelerated programs in data science cloud administration and full stack web development dallas tech scene with dallas' tech scene booming it is imperative that the appropriate talent is available we help produce top talent at a fast pace to keep up with the demand for tech jobs in the community codeup dallas offerings currently codeup offers a 20 week full stack web development program at our dallas location with hopes to add our other programs in the future dallas campus area located in the heart of downtown dallas our new location is surrounded by great restaurants museums monuments an

In [98]:
basic_string_clean(string, strip=True, lower=True, normalize=False, drop_special=False, drop_punctuation=False)

'codeup’s dallas campus has a new location! for more than two years, codeup has been operating out of our original building on commerce st. in dallas, but recently codeup has moved, and tripled our campus space! our new address is 900 jackson street suite #400 dallas, tx 75202\n\ncodeup has been a leading coding boot camp in texas for more than eight years, with active campuses in san antonio and dallas offering accelerated programs in data science, cloud administration, and full-stack web development.\ndallas tech scene\nwith dallas’ tech scene booming, it is imperative that the appropriate talent is available! we help produce top talent at a fast pace to keep up with the demand for tech jobs in the community.\ncodeup dallas offerings\ncurrently, codeup offers a 20-week full-stack web development program at our dallas location with hopes to add our other programs in the future.\ndallas campus area\nlocated in the heart of downtown dallas, our new location is surrounded by great restau

In [72]:
basic_string_clean(string)

"codeup's dallas campus has a new location for more than two years codeup has been operating out of our original building on commerce st in dallas but recently codeup has moved and tripled our campus space our new address is 900 jackson street suite 400 dallas tx 75202  codeup has been a leading coding boot camp in texas for more than eight years with active campuses in san antonio and dallas offering accelerated programs in data science cloud administration and full stack web development dallas tech scene with dallas' tech scene booming it is imperative that the appropriate talent is available we help produce top talent at a fast pace to keep up with the demand for tech jobs in the community codeup dallas offerings currently codeup offers a 20 week full stack web development program at our dallas location with hopes to add our other programs in the future dallas campus area located in the heart of downtown dallas our new location is surrounded by great restaurants museums monuments an

In [67]:

charmap = { 0x201c : u'"',
            0x201d : u'"',
            0x2018 : u"'",
            0x2019 : u"'" }
print (string.translate(charmap) )


Codeup's Dallas campus has a new location! For more than two years, Codeup has been operating out of our original building on Commerce st. in Dallas, but recently Codeup has moved, and tripled our campus space! Our new address is 900 Jackson Street Suite #400 Dallas, Tx 75202

Codeup has been a leading coding boot camp in Texas for more than eight years, with active campuses in San Antonio and Dallas offering accelerated programs in Data Science, Cloud Administration, and Full-Stack Web Development.
Dallas Tech Scene
With Dallas' tech scene booming, it is imperative that the appropriate talent is available! We help produce top talent at a fast pace to keep up with the demand for tech jobs in the community.
Codeup Dallas Offerings
Currently, Codeup offers a 20-week Full-Stack Web Development program at our Dallas location with hopes to add our other programs in the future.
Dallas Campus Area
Located in the heart of Downtown Dallas, our new location is surrounded by great restaurants, m

### 9. Ask yourself:
    
    - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

# Refactoring to return strings instead of lists

In [86]:
toktok = nltk.tokenize.ToktokTokenizer()
snowball = nltk.stem.SnowballStemmer('english')
wordnet = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

def lemmatize(sentence, lemmatizer:object = wordnet) -> str:
    words = sentence.split(' ')
    out = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(out)


def stem(sentence, stemmer:object = snowball) -> str:
    words = sentence.split(' ')
    out = [stemmer.stem(word) for word in words]
    return ' '.join(out)


def word_tokenize(string:str, tokenizer:object = toktok) -> str:
    tokens =  tokenizer.tokenize(string)
    return ' '.join(tokens)

def remove_stopwords(string:str, extra_words:list[str] = [], exclude_words:list[str] = []) -> str:
    tokens = string.split(' ')
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords += extra_words
    stopwords = [word for word in stopwords if not word in exclude_words]
    out = [word for word in tokens if not word in stopwords]
    return ' '.join(out)

In [87]:
def make_nlp_cols(series: pd.Series) -> pd.Series:
    clean = series.apply(basic_string_clean, drop_punctuation=True)\
        .apply(word_tokenize)\
        .apply(remove_stopwords)
    stemmed = clean.apply(stem)
    lemmatized = clean.apply(lemmatize)
    out = pd.concat([clean, stemmed, lemmatized], axis=1)
    out.columns = ['cleaned','stemmed','lemmatized']
    return out

In [80]:
make_nlp_cols(codeup_df.content)

Unnamed: 0,cleaned,stemmed,lemmatized
0,given current economic climate many economists...,given current econom climat mani economist con...,given current economic climate many economist ...
1,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...,codeup blast san antonio superhero car show co...
2,' considering career web development ' know ex...,' consid career web develop ' know expect cont...,' considering career web development ' know ex...
3,codeup ' dallas campus new location two years ...,codeup ' dalla campus new locat two year codeu...,codeup ' dallas campus new location two year c...
4,codeup officially made tv debut community stud...,codeup offici made tv debut communiti student ...,codeup officially made tv debut community stud...
...,...,...,...
279,simple answer computer programming talent peop...,simpl answer comput program talent peopl take ...,simple answer computer programming talent peop...
280,developing codeup interviewed dozens people at...,develop codeup interview dozen peopl attempt l...,developing codeup interviewed dozen people att...
281,tedx san antonio presentation fall nick longo ...,tedx san antonio present fall nick longo co fo...,tedx san antonio presentation fall nick longo ...
282,hot topic trending special treatment ladies te...,hot topic trend special treatment ladi tech in...,hot topic trending special treatment lady tech...
