In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a


In [2]:
def basic_clean(string):
    '''
    This function accepts a string as an input
    then lowercases everything, normalizes unicode
    characters, and replaces anything that is
    not a letter, number, whitespace, 
    or a single quote.
    '''
    cleaned = string.lower()
    cleaned = unicodedata.normalize('NFKD', cleaned)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    cleaned = re.sub(r"[^a-z0-9'\s]", '', cleaned)
    
    return cleaned

In [3]:
def tokenize(string):
    '''
    This function takes in a string as an input
    then tokenizes all words in the string.
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    print(tokenizer.tokenize(string, return_str=True)[0:500])

In [4]:
def stem(string):
    '''
    This function takes in a string as an input
    then stems all words in the string.
    '''
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string_stemmed = ' '.join(stems)
    return string_stemmed

In [5]:
def lemmatize(string):
    '''
    This function takes in a string as an input
    then lemmatizes all words in the string.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    string_lemmatized = ' '.join(lemmas)

    return string_lemmatized

In [6]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string as an input
    then removes stopwords. The function has two
    additional parameters that define additional
    stopwords to remove in extra_words as a list,
    and defines stopwords to exclude from removal
    in exlude_words as a list. extra_words and
    exclude_words are empty lists by default.
    '''
    words = string.split()
    filtered_words = [w for w in words if w not in stopword_list]

    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')

    string_without_stopwords = ' '.join(filtered_words)

    print(string_without_stopwords)

In [7]:
blog_articles = a.get_blog_articles()

In [8]:
codeup_df = blog_articles

In [9]:
news_articles = a.get_news_articles()

In [10]:
news_df = news_articles

In [11]:
codeup_df

Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...


In [12]:
news_df.head()

Unnamed: 0,title,content,category
0,"Moscow-Goa flight gets bomb threat, makes emer...",A Goa-bound flight from Russia's Moscow made a...,national
1,"Joshimath divided into 3 zones, govt says most...","Uttarakhand's Joshimath, where a majority of b...",national
2,Which states have reported COVID-19 variant XB...,"One new case of COVID-19 variant XBB.1.5, whic...",national
3,I decided to wear t-shirt till I shiver after ...,Congress leader Rahul Gandhi on Monday told re...,national
4,Sec 144 imposed in Assam's Karimganj after Baj...,Prohibitory orders under Section 144 of the Cr...,national


In [13]:
#codeup_df.iloc[0]

In [14]:
#basic_clean(codeup_df['content'][0])

## Applying functions to codeup_df

In [15]:
codeup_df['original'] = codeup_df['content']

In [16]:
clean_list = []
for n in range(0, 6):
    
    clean_content = basic_clean(codeup_df['content'][n])
    clean_list.append(clean_content)
clean_list

['are you feeling unfulfilled in your work but want to avoid returning to the traditional educational route codeup can help starting over as a professional is daunting and not always ideal codeup can help you go from a career you are bored with to a job that excites you in just 6 months\nheres how\ndata science program\nduring our 20week program you will have the opportunity to take your career to new heights with data science being one of the most needed jobs in tech\nyoull gather data then clean it explore it for trends and apply machine learning models to make predictions\nupon completing this program you will know how to turn insights into actionable recommendations youll be a huge asset to any company having all the technical skills to become a data scientist with projects upon projects of experience under your belt\ncodeup\na common reason individuals opt not to change their careers is fear it is too late codeup has crafted a program that will guide you through your career transi

In [17]:
len(clean_list)

6

In [18]:
codeup_df['clean'] = clean_list

In [19]:
#codeup_df

In [20]:
stem_list = []
for n in range(0, 6):
    
    stem_content = stem(codeup_df['content'][n])
    stem_list.append(stem_content)
stem_list

['are you feel unfulfil in your work but want to avoid return to the tradit educ route? codeup can help! start over as a profession is daunt and not alway ideal. codeup can help you go from a career you are bore with, to a job that excit you in just 6 months! here’ how… data scienc program dure our 20-week program, you will have the opportun to take your career to new height with data scienc be one of the most need job in tech. you’ll gather data, then clean it, explor it for trends, and appli machin learn model to make predictions. upon complet thi program, you will know how to turn insight into action recommendations. you’ll be a huge asset to ani company, have all the technic skill to becom a data scientist with project upon project of experi under your belt. codeup a common reason individu opt not to chang their career is fear it is too late. codeup ha craft a program that will guid you through your career transit and prove that you can jumpstart a new job at ani age and experi lev

In [21]:
codeup_df['stemmed'] = stem_list

In [22]:
#codeup_df

In [23]:
lem_list = []
for n in range(0, 6):
    
    lem_content = lemmatize(codeup_df['content'][n])
    lem_list.append(lem_content)
lem_list

['Are you feeling unfulfilled in your work but want to avoid returning to the traditional educational route? Codeup can help! Starting over a a professional is daunting and not always ideal. Codeup can help you go from a career you are bored with, to a job that excites you in just 6 months! Here’s how… Data Science Program During our 20-week program, you will have the opportunity to take your career to new height with data science being one of the most needed job in tech. You’ll gather data, then clean it, explore it for trends, and apply machine learning model to make predictions. Upon completing this program, you will know how to turn insight into actionable recommendations. You’ll be a huge asset to any company, having all the technical skill to become a data scientist with project upon project of experience under your belt. Codeup A common reason individual opt not to change their career is fear it is too late. Codeup ha crafted a program that will guide you through your career tra

In [24]:
codeup_df['lemmatized'] = lem_list

In [25]:
codeup_df

Unnamed: 0,title,content,original,clean,stemmed,lemmatized
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...,Are you feeling unfulfilled in your work but w...,are you feeling unfulfilled in your work but w...,are you feel unfulfil in your work but want to...,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...,Are you a hiring manager having trouble fillin...,are you a hiring manager having trouble fillin...,are you a hire manag have troubl fill an it po...,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...,Finding resources to fund your educational goa...,finding resources to fund your educational goa...,find resourc to fund your educ goal is possibl...,Finding resource to fund your educational goal...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa...","When breaking into a new career, it is importa...",when breaking into a new career it is importan...,"when break into a new career, it is import to ...","When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...,We are excited to announce that Codeup ranked ...,we are excited to announce that codeup ranked ...,we are excit to announc that codeup rank #1 fo...,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...,In honor of November being National Scholarshi...,in honor of november being national scholarshi...,in honor of novemb be nation scholarship month...,In honor of November being National Scholarshi...


## Applying functions to news_df

In [26]:
clean_list = []
for n in range(0, len(news_df)):
    
    clean_content = basic_clean(news_df['content'][n])
    clean_list.append(clean_content)
clean_list

["a goabound flight from russia's moscow made an emergency landing in gujarat's jamnagar at around 949 pm after goa atc air traffic control received a bomb threat the aircraft is in the isolation bay at the jamnagar airport and further investigation is underway the flight was carrying 244 passengers",
 "uttarakhand's joshimath where a majority of buildings developed cracks has been divided into three zones based on the magnitude of possible danger rm sundaram secretary to cm pushkar singh dhami said the town has been divided into 'danger' 'buffer' and 'completely safe' zones sundaram added that the buildings which have sustained the most damage will be demolished ",
 'one new case of covid19 variant xbb15 which is responsible for the rise in cases in the us has been discovered in uttarakhand taking the total number of cases in india to eight according to insacog data pti reported earlier three xbb15 cases were reported in gujarat while karnataka telangana chhattisgarh and rajasthan rep

In [27]:
stem_list = []
for n in range(0, len(news_df)):
    
    stem_content = stem(news_df['content'][n])
    stem_list.append(stem_content)
stem_list

["a goa-bound flight from russia' moscow made an emerg land in gujarat' jamnagar at around 9.49 pm after goa atc (air traffic control) receiv a bomb threat. the aircraft is in the isol bay at the jamnagar airport and further investig is underway. the flight wa carri 244 passengers.",
 "uttarakhand' joshimath, where a major of build develop cracks, ha been divid into three zone base on the magnitud of possibl danger. rm sundaram, secretari to cm pushkar singh dhami, said the town ha been divid into 'danger', 'buffer' and 'complet safe' zones. sundaram ad that the build which have sustain the most damag will be demolished.",
 'one new case of covid-19 variant xbb.1.5, which is respons for the rise in case in the us, ha been discov in uttarakhand, take the total number of case in india to eight, accord to insacog data, pti reported. earlier, three xbb.1.5 case were report in gujarat while karnataka, telangana, chhattisgarh and rajasthan report one case each, accord to insacog.',
 'congres

In [28]:
lem_list = []
for n in range(0, len(news_df)):
    
    lem_content = lemmatize(news_df['content'][n])
    lem_list.append(lem_content)
lem_list

["A Goa-bound flight from Russia's Moscow made an emergency landing in Gujarat's Jamnagar at around 9.49 pm after Goa ATC (Air Traffic Control) received a bomb threat. The aircraft is in the isolation bay at the Jamnagar airport and further investigation is underway. The flight wa carrying 244 passengers.",
 "Uttarakhand's Joshimath, where a majority of building developed cracks, ha been divided into three zone based on the magnitude of possible danger. RM Sundaram, Secretary to CM Pushkar Singh Dhami, said the town ha been divided into 'danger', 'buffer' and 'completely safe' zones. Sundaram added that the building which have sustained the most damage will be demolished.",
 'One new case of COVID-19 variant XBB.1.5, which is responsible for the rise in case in the US, ha been discovered in Uttarakhand, taking the total number of case in India to eight, according to INSACOG data, PTI reported. Earlier, three XBB.1.5 case were reported in Gujarat while Karnataka, Telangana, Chhattisgarh

In [29]:
news_df['original'] = news_df['content']

In [30]:
news_df['clean'] = clean_list

In [31]:
news_df['stemmed'] = stem_list

In [32]:
news_df['lemmatized'] = lem_list

In [33]:
news_df

Unnamed: 0,title,content,category,original,clean,stemmed,lemmatized
0,"Moscow-Goa flight gets bomb threat, makes emer...",A Goa-bound flight from Russia's Moscow made a...,national,A Goa-bound flight from Russia's Moscow made a...,a goabound flight from russia's moscow made an...,a goa-bound flight from russia' moscow made an...,A Goa-bound flight from Russia's Moscow made a...
1,"Joshimath divided into 3 zones, govt says most...","Uttarakhand's Joshimath, where a majority of b...",national,"Uttarakhand's Joshimath, where a majority of b...",uttarakhand's joshimath where a majority of bu...,"uttarakhand' joshimath, where a major of build...","Uttarakhand's Joshimath, where a majority of b..."
2,Which states have reported COVID-19 variant XB...,"One new case of COVID-19 variant XBB.1.5, whic...",national,"One new case of COVID-19 variant XBB.1.5, whic...",one new case of covid19 variant xbb15 which is...,"one new case of covid-19 variant xbb.1.5, whic...","One new case of COVID-19 variant XBB.1.5, whic..."
3,I decided to wear t-shirt till I shiver after ...,Congress leader Rahul Gandhi on Monday told re...,national,Congress leader Rahul Gandhi on Monday told re...,congress leader rahul gandhi on monday told re...,congress leader rahul gandhi on monday told re...,Congress leader Rahul Gandhi on Monday told re...
4,Sec 144 imposed in Assam's Karimganj after Baj...,Prohibitory orders under Section 144 of the Cr...,national,Prohibitory orders under Section 144 of the Cr...,prohibitory orders under section 144 of the cr...,prohibitori order under section 144 of the crp...,Prohibitory order under Section 144 of the CrP...
...,...,...,...,...,...,...,...
292,Vehicle sales in India grow by 15% to 2.11 cro...,India's retail sales of overall vehicles witne...,automobile,India's retail sales of overall vehicles witne...,india's retail sales of overall vehicles witne...,india' retail sale of overal vehicl wit a 15.2...,India's retail sale of overall vehicle witness...
293,Stellantis may shut more auto plants due to hi...,Stellantis NV CEO Carlos Tavares said on Thurs...,automobile,Stellantis NV CEO Carlos Tavares said on Thurs...,stellantis nv ceo carlos tavares said on thurs...,stellanti nv ceo carlo tavar said on thursday ...,Stellantis NV CEO Carlos Tavares said on Thurs...
294,"Tesla cuts Model 3, Model Y prices in China fo...",Tesla on Friday reduced prices for its Model 3...,automobile,Tesla on Friday reduced prices for its Model 3...,tesla on friday reduced prices for its model 3...,tesla on friday reduc price for it model 3 and...,Tesla on Friday reduced price for it Model 3 a...
295,Electric two-wheeler sales cross 6-lakh mark i...,Electric two-wheeler (E2W) sales in 2022 hit a...,automobile,Electric two-wheeler (E2W) sales in 2022 hit a...,electric twowheeler e2w sales in 2022 hit a mi...,electr two-wheel (e2w) sale in 2022 hit a mile...,Electric two-wheeler (E2W) sale in 2022 hit a ...
