In [1]:
#imports
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import requests
import os

from bs4 import BeautifulSoup

import pandas as pd

import acquire as a
import prepare as p

### 1/ The end result of this exercise should be a file named ```prepare.py``` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

Define a function named ```basic_clean```. It should take in a string and apply some basic text cleaning to it:  
        Lowercase everything  
        Normalize unicode characters  
        Replace anything that is not a letter, number, whitespace or a single quote.  


In [2]:
# le texte (un poème)

mots = 'Good rain knows the right time / To fall when spring comes / And follows the wind in the night / Silently, moistening every thing / A wild path black and cloudy / A riverboat fire alone and bright / Dawn has the look that is red and wet / In Brocade City, Guancheng* / "Spring Night, Happy Rain" by Du Fu, written in Chengdu, 759 – 763, during the An Lushan Rebellion.'

mots

'Good rain knows the right time / To fall when spring comes / And follows the wind in the night / Silently, moistening every thing / A wild path black and cloudy / A riverboat fire alone and bright / Dawn has the look that is red and wet / In Brocade City, Guancheng* / "Spring Night, Happy Rain" by Du Fu, written in Chengdu, 759 – 763, during the An Lushan Rebellion.'

In [3]:
# lowercasing

mots = mots.lower()

In [4]:
# removing accents, etc : normalisation

mots = unicodedata.normalize('NFKD', mots).encode('ascii', 'ignore').decode('utf-8')
mots

'good rain knows the right time / to fall when spring comes / and follows the wind in the night / silently, moistening every thing / a wild path black and cloudy / a riverboat fire alone and bright / dawn has the look that is red and wet / in brocade city, guancheng* / "spring night, happy rain" by du fu, written in chengdu, 759  763, during the an lushan rebellion.'

In [5]:
# remove special characters (punctuation)

mots = re.sub(r'[^a-z0-9\s]',' ',mots)
mots

'good rain knows the right time   to fall when spring comes   and follows the wind in the night   silently  moistening every thing   a wild path black and cloudy   a riverboat fire alone and bright   dawn has the look that is red and wet   in brocade city  guancheng     spring night  happy rain  by du fu  written in chengdu  759  763  during the an lushan rebellion '

In [6]:
# function to clean a phrase

def basic_clean(mots):
    
    '''
    this function takes in a string, then 
    prepares it for parsing by lowercasing, 
    normalising and removing special characters
    '''
    
    # lowercasing
    mots = mots.lower()
    
    # removing accents, etc : normalisation
    mots = unicodedata.normalize('NFKD', mots).encode('ascii', 'ignore').decode('utf-8')
    
    # remove special characters (punctuation)
    mots = re.sub(r'[^a-z0-9\s]',' ',mots)

    return mots

In [7]:
# the function

basic_clean(mots)

'good rain knows the right time   to fall when spring comes   and follows the wind in the night   silently  moistening every thing   a wild path black and cloudy   a riverboat fire alone and bright   dawn has the look that is red and wet   in brocade city  guancheng     spring night  happy rain  by du fu  written in chengdu  759  763  during the an lushan rebellion '

### 2/ Define a function named tokenize. It should take in a string and tokenize all the words in the string.


In [8]:
#tokeniser object

tok = nltk.tokenize.ToktokTokenizer()
tok

<nltk.tokenize.toktok.ToktokTokenizer at 0x12ed1c3a0>

In [9]:
# each token

mots = tok.tokenize(mots)
mots

['good',
 'rain',
 'knows',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'comes',
 'and',
 'follows',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silently',
 'moistening',
 'every',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudy',
 'a',
 'riverboat',
 'fire',
 'alone',
 'and',
 'bright',
 'dawn',
 'has',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocade',
 'city',
 'guancheng',
 'spring',
 'night',
 'happy',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'during',
 'the',
 'an',
 'lushan',
 'rebellion']

In [10]:
# tokeniser function

def tokenise(mots):
    
    '''this function takes in a cleaned
    string and breaks it down into tokens
    for nlp stemming and lemmatising
    '''
    #tokeniser object
    tok = nltk.tokenize.ToktokTokenizer()
    
    # each token
    mots = tok.tokenize(mots)
    
    return mots

In [11]:
tokenise(mots)

['[',
 "'",
 'good',
 "'",
 ',',
 "'",
 'rain',
 "'",
 ',',
 "'",
 'knows',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'right',
 "'",
 ',',
 "'",
 'time',
 "'",
 ',',
 "'",
 'to',
 "'",
 ',',
 "'",
 'fall',
 "'",
 ',',
 "'",
 'when',
 "'",
 ',',
 "'",
 'spring',
 "'",
 ',',
 "'",
 'comes',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'follows',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'wind',
 "'",
 ',',
 "'",
 'in',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'night',
 "'",
 ',',
 "'",
 'silently',
 "'",
 ',',
 "'",
 'moistening',
 "'",
 ',',
 "'",
 'every',
 "'",
 ',',
 "'",
 'thing',
 "'",
 ',',
 "'",
 'a',
 "'",
 ',',
 "'",
 'wild',
 "'",
 ',',
 "'",
 'path',
 "'",
 ',',
 "'",
 'black',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'cloudy',
 "'",
 ',',
 "'",
 'a',
 "'",
 ',',
 "'",
 'riverboat',
 "'",
 ',',
 "'",
 'fire',
 "'",
 ',',
 "'",
 'alone',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'bright',
 "'",
 ',',
 "'",
 'dawn',
 "'",
 ',',
 "'",
 'has',
 "'",
 ',',
 "'",
 

### 3/ Define a function named ```stem```. It should accept some text and return the text after applying stemming to all the words.


In [12]:
# create the stemmer obj

ps = nltk.porter.PorterStemmer()

In [13]:
# stemming each word

[ps.stem(mot) for mot in mots]

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follow',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silent',
 'moisten',
 'everi',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudi',
 'a',
 'riverboat',
 'fire',
 'alon',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocad',
 'citi',
 'guancheng',
 'spring',
 'night',
 'happi',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'dure',
 'the',
 'an',
 'lushan',
 'rebellion']

In [14]:
# assigning variable

stems = [ps.stem(mot) for mot in mots]

In [15]:
# joining stems into a clump of text


stems = [ps.stem(word) for word in mots]
' '.join(stems)

'good rain know the right time to fall when spring come and follow the wind in the night silent moisten everi thing a wild path black and cloudi a riverboat fire alon and bright dawn ha the look that is red and wet in brocad citi guancheng spring night happi rain by du fu written in chengdu 759 763 dure the an lushan rebellion'

In [16]:
# function to stem a string

def stem(mots):
    '''
    this function takes in a string, and 
    stems it using the Porter Stemmer
    '''
    # create the stemmer obj
    ps = nltk.porter.PorterStemmer()

    # stemming each word
    stems = [ps.stem(mot) for mot in mots]

    # joining stems into a clump of text
    stems = [ps.stem(word) for word in mots]
    ' '.join(stems)
    
    return stems

In [17]:
# faire fonctionner le fonction

stem(mots)

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follow',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silent',
 'moisten',
 'everi',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudi',
 'a',
 'riverboat',
 'fire',
 'alon',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocad',
 'citi',
 'guancheng',
 'spring',
 'night',
 'happi',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'dure',
 'the',
 'an',
 'lushan',
 'rebellion']

### 4/ Define a function named ```lemmatize```. It should accept some text and return the text after applying lemmatization to each word.


In [18]:
# lemmatiser obj
wln = nltk.stem.WordNetLemmatizer()

In [19]:
# join the lemmatised words

lemmas = [wln.lemmatize(mot) for mot in mots]
' '.join(lemmas)

'good rain know the right time to fall when spring come and follows the wind in the night silently moistening every thing a wild path black and cloudy a riverboat fire alone and bright dawn ha the look that is red and wet in brocade city guancheng spring night happy rain by du fu written in chengdu 759 763 during the an lushan rebellion'

In [20]:
# lemmatising function

def lemmatize(mots):
    
    '''this function takes in a string
    and lemmatises it'''
    
    # lemmatiser obj
    wln = nltk.stem.WordNetLemmatizer()
    
    # join the lemmatised words

    lemmas = [wln.lemmatize(mot) for mot in mots]
    ' '.join(lemmas)
    
    return lemmas

In [21]:
lemmatize(mots)

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follows',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silently',
 'moistening',
 'every',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudy',
 'a',
 'riverboat',
 'fire',
 'alone',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocade',
 'city',
 'guancheng',
 'spring',
 'night',
 'happy',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'during',
 'the',
 'an',
 'lushan',
 'rebellion']

### 5/ Define a function named ```remove_stopwords```. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [22]:
# establish stopwords

arret = stopwords.words('english')

In [23]:
# all the words w/o stopwords, joined as phrase

mots_w_stopwords_removed = [mot for mot in mots if mot not in arret]
' '.join(mots_w_stopwords_removed)

'good rain knows right time fall spring comes follows wind night silently moistening every thing wild path black cloudy riverboat fire alone bright dawn look red wet brocade city guancheng spring night happy rain du fu written chengdu 759 763 lushan rebellion'

In [24]:
# appending to stopwords variable list : extra_words

extra_words = ['wind', 'path']
# appending via extending the extra_words list
arret.extend(extra_words)
arret[-2]

'wind'

In [25]:
# function to remove stopwords

# appending to stopword list ; leave empty blank if none
extra_words = ['wind', 'path']

# list of words to keep in stopwords list ; leave list empty if none
keep_words = ['a', 'the']

def remove_stopwords(mots, extra_words, keep_words):
    
    '''
    this function takes in a string of lemmatised 
    or stemmed words after normalisation and
    returns the string with stopwords removed
    '''
    
    # establish stopwords
    arret = stopwords.words('english')
    
    # appending via extending the extra_words list
    arret.extend(extra_words)

    # indicating words to keep in the original text
    exclude_words = set(stopwords.words('english')) - set(keep_words)
    
    # all the words w/o stopwords, joined as phrase
    mots_w_stopwords_removed = [mot for mot in mots if mot not in exclude_words]
    ' '.join(mots_w_stopwords_removed)
    
    return mots_w_stopwords_removed

In [26]:
# appending to stopword list ; leave blank if none
extra_words = []

# list of words to keep in stopwords list ; leave blank if none
keep_words = ['and']

remove_stopwords(mots, extra_words, keep_words)

['good',
 'rain',
 'knows',
 'right',
 'time',
 'fall',
 'spring',
 'comes',
 'and',
 'follows',
 'wind',
 'night',
 'silently',
 'moistening',
 'every',
 'thing',
 'wild',
 'path',
 'black',
 'and',
 'cloudy',
 'riverboat',
 'fire',
 'alone',
 'and',
 'bright',
 'dawn',
 'look',
 'red',
 'and',
 'wet',
 'brocade',
 'city',
 'guancheng',
 'spring',
 'night',
 'happy',
 'rain',
 'du',
 'fu',
 'written',
 'chengdu',
 '759',
 '763',
 'lushan',
 'rebellion']

In [27]:
# alternate way to remove certain words from stopword list

arret.remove('ma')
arret
# haven't got this method to accept a list yet

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
# list of words to keep in stopwords list
keep_words = ['a', 'the']

# indicating words to keep in the original text
exclude_words = set(stopwords.words('english')) - set(keep_words)

mots


['good',
 'rain',
 'knows',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'comes',
 'and',
 'follows',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silently',
 'moistening',
 'every',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudy',
 'a',
 'riverboat',
 'fire',
 'alone',
 'and',
 'bright',
 'dawn',
 'has',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocade',
 'city',
 'guancheng',
 'spring',
 'night',
 'happy',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'during',
 'the',
 'an',
 'lushan',
 'rebellion']

### 6/  Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe ```news_df```.


In [29]:
# scrape 1 pg from InShorts

a.scrate_one_page('technology')

[{'category': 'technology',
  'title': 'Twitter down for thousands of users worldwide',
  'content': 'Microblogging platform Twitter suffered an outage on Wednesday, leaving thousands of users unable to refresh their feeds. Reports of the outage began to emerge with users from various countries, including the United States, the United Kingdom, Japan, and India. This comes after reports said that Elon Musk-led Twitter recently laid off up to 200 employees.'},
 {'category': 'technology',
  'title': 'Bill Gates meets Ratan Tata, N Chandrasekaran; pics surface',
  'content': 'Microsoft Co-founder Bill Gates met with Tata Sons Chairman Emeritus Ratan Tata and Tata Sons Chairman Natarajan Chandrasekaran. "Bill had an enriching discussion with Ratan Tata and N Chandrasekaran about their philanthropic initiatives," Gates Foundation India said in a tweet. "We look forward to strengthening our work together & partnering for health, diagnostics, and nutrition," it added.'},
 {'category': 'technol

In [30]:
# topic list

topics = ['business', 'sports', 'technology', 'entertainment']

In [31]:
# get news articles : 4 InShort topic pages

final_list = a.get_news_articles(topics)

In [32]:
# turn into DF

news_df = pd.DataFrame(final_list)

news_df.head()

Unnamed: 0,category,title,content
0,business,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
2,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...


### 7/ Make another dataframe for the Codeup blog posts. Name the dataframe ```codeup_df```.

In [33]:
# using the function 

links_list = a.access_codeup_blog()
links_list

['https://codeup.com/codeup-news/panelist-spotlight-4/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-james-cooper/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight/',
 'https://codeup.com/tips-for-prospective-students/coding-bootcamp-or-self-learning/',
 'https://codeup.com/codeup-news/codeup-best-bootcamps/']

In [34]:
# Codeup blog articles function

blog_list = a.get_blog_articles(links_list)

In [35]:
# turn into DF

codeup_df = pd.DataFrame(blog_list)

codeup_df.head()

Unnamed: 0,title,link,date_published,content
0,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...
1,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...
2,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...
3,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...
4,Coding Bootcamp or Self-Learning? Which is Bes...,https://codeup.com/codeup-news/codeup-best-boo...,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...


### 8 / For each dataframe, produce the following columns:

    ```title``` to hold the title  
    ```original``` to hold the original article/post content  
    ```clean``` to hold the normalized and tokenized original with the stopwords removed  
    ```stemmed``` to hold the stemmed version of the cleaned data  
    ```lemmatized``` to hold the lemmatized version of the cleaned data  



In [36]:
dufu = '"Taking down a trellis / Already, the sticks I tied are withered and falling, / The calabash leaves are thin and sparse. / Luckily the white flowers have born their fruit, And peacefully the green leaves have faded. / Autumn insects speak not a sound, / What’s must sparrows think at dusk? /  For bitter cold is now our prison; / So, Life too has such beginnings." _Du Fu (712–770 AD)'
dufu

'"Taking down a trellis / Already, the sticks I tied are withered and falling, / The calabash leaves are thin and sparse. / Luckily the white flowers have born their fruit, And peacefully the green leaves have faded. / Autumn insects speak not a sound, / What’s must sparrows think at dusk? /  For bitter cold is now our prison; / So, Life too has such beginnings." _Du Fu (712–770 AD)'

In [37]:
# cleaning 'dufu' 

p.basic_clean(dufu)

' taking down a trellis / already  the sticks i tied are withered and falling  / the calabash leaves are thin and sparse  / luckily the white flowers have born their fruit  and peacefully the green leaves have faded  / autumn insects speak not a sound  / whats must sparrows think at dusk  /  for bitter cold is now our prison  / so  life too has such beginnings    du fu  712770 ad '

In [38]:
# renaming & dropping 'content' column

codeup_df['original'] = codeup_df['content']
# codeup_df.drop(columns = 'content', inplace = True)

In [39]:
# looking at one entry

codeup_df['original'][0]

'\nBlack excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”\nWe hope you can join us on February 22nd to sit in on an insightful conversation with Wilmarie and all of our panelists!\n'

In [40]:
p.basic_clean(codeup_df['original'][0])

'\nblack excellence in tech  panelist spotlight  wilmarie de la cruz mejia\n\ncodeup is hosting a black excellence in tech panel in honor of black history month on february 22  2023  to further celebrate  wed like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry   \nmeet wilmarie \nwilmarie de la cruz mejia is a current codeup student on the path to becoming a full stack web developer at our dallas  tx campus  \nwilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with codeup  \nwe asked wilmarie to share more about her experience at codeup  she shares  i was able to meet other people who were passionate about coding and be in a positive learning environment \nwe hope you can join us on february 22nd to sit in on an insightful conversation with wilmarie and all of our panelists \n'

In [41]:
# cleaning function

codeup_df['cleaned'] = codeup_df['original'].apply(p.basic_clean)

In [42]:
codeup_df.head()

Unnamed: 0,title,link,date_published,content,original,cleaned
0,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight...
1,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight...
2,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight...
3,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight...
4,Coding Bootcamp or Self-Learning? Which is Bes...,https://codeup.com/codeup-news/codeup-best-boo...,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...,\nIf you’re interested in embarking on a caree...,\nif youre interested in embarking on a career...


In [43]:
# tokenising function

codeup_df['cleaned'] = codeup_df['cleaned'].apply(p.tokenise)

In [44]:
# Stemming function

codeup_df['stemmed'] = codeup_df['cleaned'].apply(p.stem)

In [45]:
# lemmatising function

codeup_df['lemmatised'] = codeup_df['cleaned'].apply(p.lemmatize)

In [46]:
# removing stopwords from stemmed column

codeup_df['sans_stopwords'] = codeup_df['stemmed'].apply(p.remove_stopwords, extra_words, keep_words)

In [47]:
# # function to clean, tokenise, stem, lem and remove stopwords

# # appending to stopword list ; leave empty blank if none
# extra_words = []

# # list of words to keep in stopwords list ; leave list empty if none
# keep_words = []


# def clean_stem_lem(df, extra_words, keep_words):
    
#     '''
#     this function takes in the Codeup blog df
#     and cleans, tokenises, stems and lemmatises
#     the dataframe, creating a new column for 
#     each action to the dataframe
#     '''
    
#     # renaming & dropping 'content' column
# #     codeup_df['original'] = codeup_df['content']
# #     codeup_df.drop(columns = 'content', inplace = True)

#     # cleaning function
#     codeup_df['cleaned'] = codeup_df['original'].apply(p.basic_clean)

#     # tokenising function
#     codeup_df['cleaned'] = codeup_df['cleaned'].apply(p.tokenise)

#     # Stemming function
#     codeup_df['stemmed'] = codeup_df['cleaned'].apply(p.stem)

#     # lemmatising function
#     codeup_df['lemmatised'] = codeup_df['cleaned'].apply(p.lemmatize)
    
#     # removing stopwords from stemmed column
#     codeup_df['sans_stopwords'] = codeup_df['stemmed'].apply(p.remove_stopwords, extra_words, keep_words)
    
#     return codeup_df

In [48]:
p.clean_stem_lem_stop(codeup_df, extra_words, keep_words)

Unnamed: 0,title,link,date_published,original,cleaned,stemmed,lemmatised,sans_stopwords
0,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...,"[black, excellence, in, tech, panelist, spotli...","[black, excel, in, tech, panelist, spotlight, ...","[black, excellence, in, tech, panelist, spotli...","[black, excel, tech, panelist, spotlight, wilm..."
1,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...,"[black, excellence, in, tech, panelist, spotli...","[black, excel, in, tech, panelist, spotlight, ...","[black, excellence, in, tech, panelist, spotli...","[black, excel, tech, panelist, spotlight, step..."
2,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...,"[black, excellence, in, tech, panelist, spotli...","[black, excel, in, tech, panelist, spotlight, ...","[black, excellence, in, tech, panelist, spotli...","[black, excel, tech, panelist, spotlight, jame..."
3,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/codeup-best-boo...,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...,"[black, excellence, in, tech, panelist, spotli...","[black, excel, in, tech, panelist, spotlight, ...","[black, excellence, in, tech, panelist, spotli...","[black, excel, tech, panelist, spotlight, jean..."
4,Coding Bootcamp or Self-Learning? Which is Bes...,https://codeup.com/codeup-news/codeup-best-boo...,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...,"[if, youre, interested, in, embarking, on, a, ...","[if, your, interest, in, embark, on, a, career...","[if, youre, interested, in, embarking, on, a, ...","[interest, embark, career, tech, like, taken, ..."
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,https://codeup.com/codeup-news/codeup-best-boo...,"Jan 12, 2023",\nCodeup is pleased to announce we have been r...,"[codeup, is, pleased, to, announce, we, have, ...","[codeup, is, pleas, to, announc, we, have, bee...","[codeup, is, pleased, to, announce, we, have, ...","[codeup, pleas, announc, rank, among, 58, best..."


#### InShorts news_df

In [49]:
news_df.head()

Unnamed: 0,category,title,content
0,business,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
2,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...


In [50]:
# cleaning 'content'
news_df['cleaned'] = news_df['content'].apply(p.basic_clean)

# tokenising function
news_df['cleaned'] = news_df['cleaned'].apply(p.tokenise)

In [51]:
# Stemming function
news_df['stemmed'] = news_df['cleaned'].apply(p.stem)

In [52]:
    # lemmatising function
news_df['lemmatised'] = news_df['cleaned'].apply(p.lemmatize)

In [53]:
 # removing stopwords from stemmed column
news_df['sans_stopwords'] = news_df['stemmed'].apply(p.remove_stopwords, extra_words, keep_words)

In [56]:
# applying the function

p.clean_stem_lem_stop(news_df, extra_words, keep_words)

Unnamed: 0,category,title,cleaned,stemmed,lemmatised,sans_stopwords,original
0,business,All Adani stocks end higher for the first time...,"[all, 10, adani, group, stocks, closed, higher...","[all, 10, adani, group, stock, close, higher, ...","[all, 10, adani, group, stock, closed, higher,...","[10, adani, group, stock, close, higher, wedne...",All 10 Adani Group stocks closed higher on Wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...","[microsoft, co, founder, bill, gates, met, wit...","[microsoft, co, founder, bill, gate, met, with...","[microsoft, co, founder, bill, gate, met, with...","[microsoft, co, founder, bill, gate, met, tata...",Microsoft Co-founder Bill Gates met with Tata ...
2,business,SoftBank sells shares worth ₹954 crore in logi...,"[softbank, sold, shares, worth, 954, crore, in...","[softbank, sold, share, worth, 954, crore, in,...","[softbank, sold, share, worth, 954, crore, in,...","[softbank, sold, share, worth, 954, crore, log...",SoftBank sold shares worth ₹954 crore in logis...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,"[hours, after, the, central, government, raise...","[hour, after, the, central, govern, rais, the,...","[hour, after, the, central, government, raised...","[hour, central, govern, rais, price, commerci,...",Hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...","[indian, americans, punit, renjen, and, rajesh...","[indian, american, punit, renjen, and, rajesh,...","[indian, american, punit, renjen, and, rajesh,...","[indian, american, punit, renjen, rajesh, subr...",Indian-Americans Punit Renjen and Rajesh Subra...
...,...,...,...,...,...,...,...
95,entertainment,"I won't pay the bill on a date, says Nora Fatehi","[kapil, sharma, in, a, recent, episode, of, ',...","[kapil, sharma, in, a, recent, episod, of, ', ...","[kapil, sharma, in, a, recent, episode, of, ',...","[kapil, sharma, recent, episod, ', kapil, shar...","Kapil Sharma, in a recent episode of 'The Kapi..."
96,entertainment,Actors today focus on gym and social media mor...,"[actor, govind, namdev, has, said, that, the, ...","[actor, govind, namdev, ha, said, that, the, c...","[actor, govind, namdev, ha, said, that, the, c...","[actor, govind, namdev, ha, said, current, act...",Actor Govind Namdev has said that the current ...
97,entertainment,It takes guts: Rajatava on Akshay accepting fa...,"[bengali, actor, rajatava, dutta, praised, aks...","[bengali, actor, rajatava, dutta, prais, aksha...","[bengali, actor, rajatava, dutta, praised, aks...","[bengali, actor, rajatava, dutta, prais, aksha...",Bengali actor Rajatava Dutta praised Akshay Ku...
98,entertainment,We'd end up matching clothes without planning:...,"[speaking, about, the, similarities, that, she...","[speak, about, the, similar, that, she, found,...","[speaking, about, the, similarity, that, she, ...","[speak, similar, found, husband, rj, anmol, me...",Speaking about the similarities that she found...
