### Data Preparation - NLP

In [1]:
# unicode, regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords (more on this soon)
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# pandas dataframe manipulation, acquire script, time formatting
import pandas as pd
import acquire
from time import strftime

# shh, down in front
import warnings
warnings.filterwarnings('ignore')

In [2]:
blog_1 = acquire.get_blog_posts('https://codeup.com/workshops/from-bootcamp-to-bootcamp-a-military-appreciation-panel/')
blog_2 = acquire.get_blog_posts('https://codeup.com/featured/our-acquisition-of-the-rackspace-cloud-academy-one-year-later/')
blog_3 = acquire.get_blog_posts('https://codeup.com/workshops/virtual/learn-to-code-html-css-on-4-30/')
blog_4 = acquire.get_blog_posts('https://codeup.com/workshops/virtual/learn-to-code-python-workshop-on-4-16/')
blog_5 = acquire.get_blog_posts('https://codeup.com/codeup-news/coming-soon-cloud-administration/')
codeup_df = pd.DataFrame([blog_1, blog_2, blog_3, blog_4, blog_5])
codeup_df



Unnamed: 0,title,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ..."
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...


In [3]:
business_articles = acquire.get_news_articles('https://inshorts.com/en/read/business', 'business')
sports_articles = acquire.get_news_articles('https://inshorts.com/en/read/sports', 'sports')
tech_articles = acquire.get_news_articles('https://inshorts.com/en/read/technology', 'technology')
entertainment_articles = acquire.get_news_articles('https://inshorts.com/en/read/entertainment', 'entertainment')
news_df = pd.concat([business_articles, sports_articles, tech_articles, entertainment_articles], axis=0).reset_index(drop=True)
news_df

Unnamed: 0,title,content,category
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business
1,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",business
2,"Office as we know it, is over: Airbnb CEO on l...",After Airbnb allowed its employees to work rem...,business
3,Microsoft to help cover US employees' travel c...,Microsoft has said that it will cover travel c...,business
4,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,business
...,...,...,...
95,We're running shops in mall he built: Ranveer ...,"Actor Ranveer Singh called Shah Rukh Khan a ""p...",entertainment
96,Helen to make acting comeback after 10 years w...,Actress Helen will make her comeback to acting...,entertainment
97,"Ran out of money during lockdown, in theatre y...",Actress Sarika said that she ran out of money ...,entertainment
98,Didn't attend a wedding in 8yrs of 'Kahaani Gh...,Actress Sakshi Tanwar has revealed in an inter...,entertainment


In [4]:
def basic_clean(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text


In [5]:
codeup_df['clean'] = codeup_df.content.apply(basic_clean)

In [6]:
def tokenize(text):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    text = tokenizer.tokenize(text, return_str=True)
    return text

In [7]:
tokenized_df = codeup_df.clean.apply(tokenize)

In [8]:
def stem(text):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    text = ' '.join(stems)
    return text

In [9]:
codeup_df['stemmed'] = tokenized_df.apply(stem)

In [10]:
def lemmatize(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    text = ' '.join(lemmas)
    return text


In [11]:
codeup_df['lemmatized'] = tokenized_df.apply(lemmatize)

In [12]:
codeup_df.head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",in honor of military appreciation month join u...,in honor of militari appreci month join us for...,in honor of military appreciation month join u...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css are the design building blocks of al...,html css are the design build block of all the...,html cs are the design building block of all t...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising job...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,were launching a new program out of san antoni...,were launch a new program out of san antonio w...,were launching a new program out of san antoni...


---

In [13]:
news_df['clean'] = news_df.content.apply(basic_clean)
tokenized_df = news_df.clean.apply(tokenize)
news_df['stemmed'] = tokenized_df.apply(stem)
news_df['lemmatized'] = tokenized_df.apply(lemmatize)
news_df.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business,the indian rupee weakened further on monday to...,the indian rupe weaken further on monday to cl...,the indian rupee weakened further on monday to...
1,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",business,lic's ipo india's biggest ipo which opened on ...,lic ' s ipo india ' s biggest ipo which open o...,lic ' s ipo india ' s biggest ipo which opened...
2,"Office as we know it, is over: Airbnb CEO on l...",After Airbnb allowed its employees to work rem...,business,after airbnb allowed its employees to work rem...,after airbnb allow it employe to work remot fo...,after airbnb allowed it employee to work remot...
3,Microsoft to help cover US employees' travel c...,Microsoft has said that it will cover travel c...,business,microsoft has said that it will cover travel c...,microsoft ha said that it will cover travel co...,microsoft ha said that it will cover travel co...
4,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,business,elon musk's 44 billion offer to buy twitter co...,elon musk ' s 44 billion offer to buy twitter ...,elon musk ' s 44 billion offer to buy twitter ...


---

In [37]:
def remove_stopwords(text, extra_words= [], exclude_words= []):
    stopword_list = stopwords.words('english')
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    words = text.split()
    filtered_words = [w for w in words if w not in stopword_list]
    text_without_stopwords = ' '.join(filtered_words)
    return text_without_stopwords
    

In [21]:
codeup_df['stemmed_stopwords_removed'] = codeup_df.stemmed.apply(remove_stopwords)

In [23]:
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized,stemmed_stopwords_removed
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",in honor of military appreciation month join u...,in honor of militari appreci month join us for...,in honor of military appreciation month join u...,honor militari appreci month join us discuss c...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,year ago april 16th 2021 announc acquisit rack...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css are the design building blocks of al...,html css are the design build block of all the...,html cs are the design building block of all t...,html css design build block websit interact da...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising job...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...,accord linkedin 1 promis job data scienc codeu...
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,were launching a new program out of san antoni...,were launch a new program out of san antonio w...,were launching a new program out of san antoni...,launch new program san antonio acquisit racksp...


In [38]:
codeup_df['lem_stopwords_removed'] = codeup_df.lemmatized.apply(remove_stopwords)

In [39]:
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized,stemmed_stopwords_removed,lem_stopwords_removed
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",in honor of military appreciation month join u...,in honor of militari appreci month join us for...,in honor of military appreciation month join u...,honor militari appreci month join us discuss c...,honor military appreciation month join u discu...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,year ago april 16th 2021 announc acquisit rack...,year ago april 16th 2021 announced acquisition...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css are the design building blocks of al...,html css are the design build block of all the...,html cs are the design building block of all t...,html css design build block websit interact da...,html cs design building block website interact...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising job...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...,accord linkedin 1 promis job data scienc codeu...,according linkedin 1 promising job data scienc...
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,were launching a new program out of san antoni...,were launch a new program out of san antonio w...,were launching a new program out of san antoni...,launch new program san antonio acquisit racksp...,launching new program san antonio acquisition ...
