### Data Preparation - NLP

In [1]:
# unicode, regex, json for text digestion
import unicodedata
import re
import json

# nltk: natural language toolkit -> tokenization, stopwords (more on this soon)
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# pandas dataframe manipulation, acquire script, time formatting
import pandas as pd
import acquire
from time import strftime

# shh, down in front
import warnings
warnings.filterwarnings('ignore')

In [2]:
blog_1 = acquire.get_blog_posts('https://codeup.com/workshops/from-bootcamp-to-bootcamp-a-military-appreciation-panel/')
blog_2 = acquire.get_blog_posts('https://codeup.com/featured/our-acquisition-of-the-rackspace-cloud-academy-one-year-later/')
blog_3 = acquire.get_blog_posts('https://codeup.com/workshops/virtual/learn-to-code-html-css-on-4-30/')
blog_4 = acquire.get_blog_posts('https://codeup.com/workshops/virtual/learn-to-code-python-workshop-on-4-16/')
blog_5 = acquire.get_blog_posts('https://codeup.com/codeup-news/coming-soon-cloud-administration/')
codeup_df = pd.DataFrame([blog_1, blog_2, blog_3, blog_4, blog_5])
codeup_df



Unnamed: 0,title,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ..."
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...


In [3]:
business_articles = acquire.get_news_articles('https://inshorts.com/en/read/business', 'business')
sports_articles = acquire.get_news_articles('https://inshorts.com/en/read/sports', 'sports')
tech_articles = acquire.get_news_articles('https://inshorts.com/en/read/technology', 'technology')
entertainment_articles = acquire.get_news_articles('https://inshorts.com/en/read/entertainment', 'entertainment')
news_df = pd.concat([business_articles, sports_articles, tech_articles, entertainment_articles], axis=0).reset_index(drop=True)
news_df

Unnamed: 0,title,content,category
0,Rupee hits all-time low of 77.42 against US do...,The Indian rupee fell to an all-time low of 77...,business
1,Bitcoin falls to the lowest level since Januar...,"Bitcoin fell on Monday to as low as $33,266 in...",business
2,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business
3,Made best possible decision: IndiGo on barring...,IndiGo's CEO Ronojoy Dutta said the airline ma...,business
4,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",business
...,...,...,...
94,Akshay only praised Kashmir Files as his film ...,Filmmaker Vivek Agnihotri in an interview has ...,entertainment
95,"Rajkummar, Janhvi's 'Mr & Mrs Mahi' goes on fl...",Rajkummar Rao and Janhvi Kapoor-starrer 'Mr & ...,entertainment
96,Took me a while to break it: Isha Koppikar on ...,"Isha Koppikar, who has been a part of several ...",entertainment
97,Kids don't care: Kunal on Taimur being trolled...,Actor Kunal Kemmu has reacted to Saif Ali Khan...,entertainment


In [4]:
def basic_clean(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text


In [6]:
codeup_df['clean'] = codeup_df.content.apply(basic_clean)

In [7]:
def tokenize(text):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    text = tokenizer.tokenize(text, return_str=True)
    return text

In [8]:
tokenized_df = codeup_df.clean.apply(tokenize)

In [9]:
def stem(text):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in text.split()]
    text = ' '.join(stems)
    return text

In [10]:
codeup_df['stemmed'] = tokenized_df.apply(stem)

In [11]:
def lemmatize(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    text = ' '.join(lemmas)
    return text


In [12]:
codeup_df['lemmatized'] = tokenized_df.apply(lemmatize)

In [13]:
codeup_df.head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",in honor of military appreciation month join u...,in honor of militari appreci month join us for...,in honor of military appreciation month join u...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...,just about a year ago on april 16th 2021 we an...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css are the design building blocks of al...,html css are the design build block of all the...,html cs are the design building block of all t...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according to linkedin the 1 most promising job...,accord to linkedin the 1 most promis job is da...,according to linkedin the 1 most promising job...
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,were launching a new program out of san antoni...,were launch a new program out of san antonio w...,were launching a new program out of san antoni...


---

In [15]:
news_df['clean'] = news_df.content.apply(basic_clean)
tokenized_df = news_df.clean.apply(tokenize)
news_df['stemmed'] = tokenized_df.apply(stem)
news_df['lemmatized'] = tokenized_df.apply(lemmatize)
news_df.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,Rupee hits all-time low of 77.42 against US do...,The Indian rupee fell to an all-time low of 77...,business,the indian rupee fell to an alltime low of 774...,the indian rupe fell to an alltim low of 7742 ...,the indian rupee fell to an alltime low of 774...
1,Bitcoin falls to the lowest level since Januar...,"Bitcoin fell on Monday to as low as $33,266 in...",business,bitcoin fell on monday to as low as 33266 in m...,bitcoin fell on monday to as low as 33266 in m...,bitcoin fell on monday to a low a 33266 in mor...
2,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business,the indian rupee weakened further on monday to...,the indian rupe weaken further on monday to cl...,the indian rupee weakened further on monday to...
3,Made best possible decision: IndiGo on barring...,IndiGo's CEO Ronojoy Dutta said the airline ma...,business,indigo's ceo ronojoy dutta said the airline ma...,indigo ' s ceo ronojoy dutta said the airlin m...,indigo ' s ceo ronojoy dutta said the airline ...
4,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",business,lic's ipo india's biggest ipo which opened on ...,lic ' s ipo india ' s biggest ipo which open o...,lic ' s ipo india ' s biggest ipo which opened...


---