In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import acquire as a
import re
import unicodedata
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import os
import warnings
warnings.filterwarnings('ignore')

#### The end result of this exercise should be a file named prepare.py that defines the requested functions.

#### In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

## Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 Lowercase everything

 Normalize unicode characters

 Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(text):
    
    text = text.lower()
    
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    text = re.sub(r'[^a-zA-Z0-9\s\']', '', text)
    
    return text

In [3]:
text = "This is a sample sentence for basic clean."
basic =  basic_clean(text)
print(basic)

this is a sample sentence for basic clean


## Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(input_string):

    words = input_string.split()
    return words

In [5]:
input_text = "This is a sample sentence for tokenization."
tokens = tokenize(input_text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'tokenization.']


## Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(text):
    
    stemmer = PorterStemmer()
    
    words = word_tokenize(text)

    stemmed_words = [stemmer.stem(word) for word in words]

    stemmed_text = ' '.join(stemmed_words)
    
    return stemmed_text

In [7]:
input_text = "Running dogs are faster than other dogs running"
stemmed_output = stem(input_text)
print(stemmed_output)

run dog are faster than other dog run


## Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
def lemmatize(text):
  
    lemmatizer = WordNetLemmatizer()
  
    words = word_tokenize(text)
  
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    
    return lemmatized_text

## Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

## This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [9]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
   
    stop_words = set(stopwords.words("english"))

    if extra_words:
        stop_words.update(extra_words)

    if exclude_words:
        stop_words.difference_update(exclude_words)

    words = text.split()
  
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    filtered_text = ' '.join(filtered_words)
    
    return filtered_text


In [10]:
input_text = "This is an example sentence with some stopwords that should be removed."
filtered_text = remove_stopwords(input_text)
print(filtered_text)

example sentence stopwords removed.


## Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [11]:
news_df = a.get_inshorts_data()

Data scraped and saved to CSV.


In [12]:
news_df

Unnamed: 0,headline,summary,author,time,day,category
0,Adani Green's unit appeals against tax demand,Adani Green Energy's subsidiary Adani Energy h...,Mansi Agarwal,05:27 pm,"Tuesday, 15 August, 2023",business
1,EX-WSJ reporter ends Indian hacker lawsuit aga...,Former Wall Street Journal reporter Jay Solomo...,Srishty Choudhury,04:37 pm,"Tuesday, 15 August, 2023",business
2,Fitch warns it may be forced to downgrade seve...,"A Fitch Ratings' analyst warned that US banks,...",Srishty Choudhury,04:28 pm,"Tuesday, 15 August, 2023",business
3,Govt hikes windfall tax on crude petroleum to ...,The government has hiked the windfall tax on d...,Mansi Agarwal,04:22 pm,"Tuesday, 15 August, 2023",business
4,"VIP Industries MD Anindya Dutta resigns, CFO t...",VIP Industries on Tuesday announced that MD An...,Mansi Agarwal,03:54 pm,"Tuesday, 15 August, 2023",business
...,...,...,...,...,...,...
136,Seats filling fast for Extended Reality Certif...,Hero Vired in collaboration with Snapchat and ...,Roshan Gupta,04:30 am,"Friday, 11 August, 2023",education
137,Indians among 500 students stuck as Canada col...,"At least 500 students, including Indians, are ...",Disha Jana,05:12 pm,"Thursday, 10 August, 2023",education
138,Want him to excel at what he chooses: Vendor w...,"Sujal Singh, a Noida-based towel seller's son,...",Sakshita Khosla,03:23 pm,"Thursday, 10 August, 2023",education
139,IIM Amendment Bill passed by the Parliament,The Rajya Sabha on Tuesday passed the IIM (Ame...,Sakshita Khosla,12:24 pm,"Tuesday, 8 August, 2023",education


## Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [13]:
urls_to_scrape = [
    "https://codeup.edu/alumni-stories/how-i-paid-43-for-my-codeup-tuition/",
    "https://codeup.edu/featured/women-in-tech-panelist-spotlight/",
    "https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/",
    "https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/",
    "https://codeup.edu/events/women-in-tech-madeleine/",
    "https://codeup.edu/codeup-news/panelist-spotlight-4/",
]

csv_filename = "articles.csv"

In [14]:
codeup_df =a.scrape_and_save_articles(urls_to_scrape, csv_filename)

Articles saved to 'articles.csv'


In [15]:
codeup_df

Unnamed: 0,title,content
0,How I Paid $43 For My Codeup Tuition,"Nov 27, 2019 | Alumni Stories Bootcamps or car..."
1,Women in tech: Panelist Spotlight – Magdalena ...,"Mar 28, 2023 | Events, Featured Codeup is host..."
2,Women in tech: Panelist Spotlight – Rachel Rob...,"Mar 20, 2023 | Events, Featured Codeup is host..."
3,Women in Tech: Panelist Spotlight – Sarah Mellor,"Mar 13, 2023 | Codeup News, Featured Codeup is..."
4,Women in Tech: Panelist Spotlight – Madeleine ...,"Mar 6, 2023 | Events, Featured Codeup is hosti..."
5,Black Excellence in Tech: Panelist Spotlight –...,"Feb 16, 2023 | Codeup News, Events, Featured ..."


## For each dataframe, produce the following columns:

- title to hold the title

- original to hold the original article/post content

- clean to hold the normalized and tokenized original with the stopwords removed.

- stemmed to hold the stemmed version of the cleaned data.

- lemmatized to hold the lemmatized version of the cleaned data.

In [16]:
codeup_df['original'] = codeup_df['content']
codeup_df['clean'] = codeup_df['original'].apply(basic_clean)
codeup_df['clean'] = codeup_df['clean'].apply(remove_stopwords)
codeup_df['stemmed'] = codeup_df['clean'].apply(stem)

codeup_df

Unnamed: 0,title,content,original,clean,stemmed
0,How I Paid $43 For My Codeup Tuition,"Nov 27, 2019 | Alumni Stories Bootcamps or car...","Nov 27, 2019 | Alumni Stories Bootcamps or car...",nov 27 2019 alumni stories bootcamps career ac...,nov 27 2019 alumni stori bootcamp career accel...
1,Women in tech: Panelist Spotlight – Magdalena ...,"Mar 28, 2023 | Events, Featured Codeup is host...","Mar 28, 2023 | Events, Featured Codeup is host...",mar 28 2023 events featured codeup hosting wom...,mar 28 2023 event featur codeup host women tec...
2,Women in tech: Panelist Spotlight – Rachel Rob...,"Mar 20, 2023 | Events, Featured Codeup is host...","Mar 20, 2023 | Events, Featured Codeup is host...",mar 20 2023 events featured codeup hosting wom...,mar 20 2023 event featur codeup host women tec...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,"Mar 13, 2023 | Codeup News, Featured Codeup is...","Mar 13, 2023 | Codeup News, Featured Codeup is...",mar 13 2023 codeup news featured codeup hostin...,mar 13 2023 codeup news featur codeup host wom...
4,Women in Tech: Panelist Spotlight – Madeleine ...,"Mar 6, 2023 | Events, Featured Codeup is hosti...","Mar 6, 2023 | Events, Featured Codeup is hosti...",mar 6 2023 events featured codeup hosting wome...,mar 6 2023 event featur codeup host women tech...
5,Black Excellence in Tech: Panelist Spotlight –...,"Feb 16, 2023 | Codeup News, Events, Featured ...","Feb 16, 2023 | Codeup News, Events, Featured ...",feb 16 2023 codeup news events featured codeup...,feb 16 2023 codeup news event featur codeup ho...


In [17]:
news_df['title'] = news_df['headline']
news_df['original'] = news_df['summary']
news_df['clean'] = news_df['original'].apply(basic_clean)
news_df['clean'] = news_df['clean'].apply(remove_stopwords)
news_df['stemmed'] = news_df['clean'].apply(stem)

news_df

Unnamed: 0,headline,summary,author,time,day,category,title,original,clean,stemmed
0,Adani Green's unit appeals against tax demand,Adani Green Energy's subsidiary Adani Energy h...,Mansi Agarwal,05:27 pm,"Tuesday, 15 August, 2023",business,Adani Green's unit appeals against tax demand,Adani Green Energy's subsidiary Adani Energy h...,adani green energy's subsidiary adani energy f...,adani green energi 's subsidiari adani energi ...
1,EX-WSJ reporter ends Indian hacker lawsuit aga...,Former Wall Street Journal reporter Jay Solomo...,Srishty Choudhury,04:37 pm,"Tuesday, 15 August, 2023",business,EX-WSJ reporter ends Indian hacker lawsuit aga...,Former Wall Street Journal reporter Jay Solomo...,former wall street journal reporter jay solomo...,former wall street journal report jay solomon ...
2,Fitch warns it may be forced to downgrade seve...,"A Fitch Ratings' analyst warned that US banks,...",Srishty Choudhury,04:28 pm,"Tuesday, 15 August, 2023",business,Fitch warns it may be forced to downgrade seve...,"A Fitch Ratings' analyst warned that US banks,...",fitch ratings' analyst warned us banks includi...,fitch rate ' analyst warn us bank includ jpmor...
3,Govt hikes windfall tax on crude petroleum to ...,The government has hiked the windfall tax on d...,Mansi Agarwal,04:22 pm,"Tuesday, 15 August, 2023",business,Govt hikes windfall tax on crude petroleum to ...,The government has hiked the windfall tax on d...,government hiked windfall tax domestically pro...,govern hike windfal tax domest produc crude pe...
4,"VIP Industries MD Anindya Dutta resigns, CFO t...",VIP Industries on Tuesday announced that MD An...,Mansi Agarwal,03:54 pm,"Tuesday, 15 August, 2023",business,"VIP Industries MD Anindya Dutta resigns, CFO t...",VIP Industries on Tuesday announced that MD An...,vip industries tuesday announced md anindya du...,vip industri tuesday announc md anindya dutta ...
...,...,...,...,...,...,...,...,...,...,...
136,Seats filling fast for Extended Reality Certif...,Hero Vired in collaboration with Snapchat and ...,Roshan Gupta,04:30 am,"Friday, 11 August, 2023",education,Seats filling fast for Extended Reality Certif...,Hero Vired in collaboration with Snapchat and ...,hero vired collaboration snapchat unreal engin...,hero vire collabor snapchat unreal engin launc...
137,Indians among 500 students stuck as Canada col...,"At least 500 students, including Indians, are ...",Disha Jana,05:12 pm,"Thursday, 10 August, 2023",education,Indians among 500 students stuck as Canada col...,"At least 500 students, including Indians, are ...",least 500 students including indians stuck can...,least 500 student includ indian stuck canadian...
138,Want him to excel at what he chooses: Vendor w...,"Sujal Singh, a Noida-based towel seller's son,...",Sakshita Khosla,03:23 pm,"Thursday, 10 August, 2023",education,Want him to excel at what he chooses: Vendor w...,"Sujal Singh, a Noida-based towel seller's son,...",sujal singh noidabased towel seller's son gave...,sujal singh noidabas towel seller 's son gave ...
139,IIM Amendment Bill passed by the Parliament,The Rajya Sabha on Tuesday passed the IIM (Ame...,Sakshita Khosla,12:24 pm,"Tuesday, 8 August, 2023",education,IIM Amendment Bill passed by the Parliament,The Rajya Sabha on Tuesday passed the IIM (Ame...,rajya sabha tuesday passed iim amendment bill ...,rajya sabha tuesday pass iim amend bill 2023 p...


## Ask yourself:

## If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?

#### Either would be fine with this low of a corpus

## If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?

#### More than likely Lemmatization at that point

## If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

#### Lemmatization