In [21]:
import pandas as pd
import numpy as np

import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import acquire
import prepare

In [2]:
url = 'https://codeup.com/blog/'

In [6]:
all_blogs = acquire.get_blog_content(url)

In [11]:
len(all_blogs)

6

In [12]:
def basic_clean(original):
    article = original.lower()
    
    article = unicodedata.normalize('NFKD', article).encode('ascii', 'ignore').decode('utf-8')
    
    article = re.sub(r'[^a-z0-9\'\s]', '', article)
    
    return article

In [14]:
def tokenize(article):
    tokenize = ToktokTokenizer()
    
    article = tokenize.tokenize(article, return_str=True)
    
    return article

In [15]:
def stem(article):
    
    ps = nltk.porter.PorterStemmer()
    
    stems = [ps.stem(word) for word in article.split()]
    
    article_stemmed = ' '.join(stems)
    
    return article_stemmed

In [16]:
def lemmatize(article):
    
    wnl = nltk.stem.WordNetLemmatizer()
    
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    
    article_lemmatized = ' '.join(lemmas)
    
    return article_lemmatized

In [39]:
def remove_stopwords(article, extra_words = 'no_extra', exclude_words = 'all_good'):
        
    stopwords_ls = stopwords.words('english')
    
    if extra_words != 'no_extra':
        
        stopwords_ls = stopwords_ls.append(extra_words)
    else:
        stopwords_ls = stopwords_ls
            
    if exclude_words != 'all_good':
        
        stopwords_ls = stopwords_ls.remove(exclude_words)
    else:
        stopwords_ls = stopwords_ls
    
    words = article.split()
    
    filtered_words = [word for word in words if word not in stopwords_ls]
    
    article_without_stopwords = ' '.join(filtered_words)
    
    return article_without_stopwords

In [30]:
og_content = all_blogs[0]['content']

In [25]:
blog_df = pd.DataFrame(all_blogs)
blog_df

Unnamed: 0,title,content
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...
5,2022 SABJ C-Suite Award Winner: Stephen Noteboom,"Codeup’s Chief Operating Officer, Stephen Note..."


In [32]:
article = basic_clean(og_content)

In [34]:
article = tokenize(article)

In [36]:
article = lemmatize(article)

In [37]:
remove_stopwords(article)

'codeup excited launch first diversity equity inclusion dei report eight year organization weve implemented policy grown dei effort extremely proud progress weve made staff codeup community recognize learn report capture way weve lived value cultivating inclusive growth continue look future wanted shine light demographic student staff particular compare tech industry whole collect organize share employee demographic data informed standard set equal employment opportunity commission eeoc proud celebrate weve grown motivated committed better view report visit link download'

In [38]:
og_content

'Codeup is excited to launch our first Diversity Equity, and Inclusion (DEI) report! In over eight years as an organization, we’ve implemented policies and grown our DEI efforts. We are extremely proud of the progress we’ve made as a staff and Codeup community, and we recognize there is more to learn. This report captures some of the ways that we’ve lived our value of Cultivating Inclusive Growth, and how we will continue doing so as we look to the future.\nWe wanted to shine a light on the demographics of our students and staff, and in particular how that compares to the tech industry as a whole. How we collect, organize, and share employee demographic data is informed by standards set by the Equal Employment Opportunity Commission (EEOC).\nWe are proud to celebrate how we’ve grown and are motivated and committed to do more and be better. To view the report visit the link here, or download it below.'

In [42]:
blog_df = blog_df.rename(columns={'content':'original'})

In [44]:
blog_df['clean'] = blog_df.original.apply(basic_clean)
blog_df.head()

Unnamed: 0,title,original,clean
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup is excited to launch our first diversit...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup has been named the 2022 diversity and i...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding to transition into a tech career is a...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity and inclusion...
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...,with many companies switching to cloud service...


In [45]:
blog_df['stemmed'] = blog_df.clean.apply(stem)
blog_df.head()

Unnamed: 0,title,original,clean,stemmed
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup is excited to launch our first diversit...,codeup is excit to launch our first divers equ...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup has been named the 2022 diversity and i...,codeup ha been name the 2022 divers and inclus...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding to transition into a tech career is a...,decid to transit into a tech career is a big s...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity and inclusion...,codeup strongli valu divers and inclus in hono...
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...,with many companies switching to cloud service...,with mani compani switch to cloud servic and i...


In [46]:
blog_df['lemmatized'] = blog_df.stemmed.apply(lemmatize)
blog_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup is excited to launch our first diversit...,codeup is excit to launch our first divers equ...,codeup is excit to launch our first diver equi...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...,codeup has been named the 2022 diversity and i...,codeup ha been name the 2022 divers and inclus...,codeup ha been name the 2022 diver and inclus ...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding to transition into a tech career is a...,decid to transit into a tech career is a big s...,decid to transit into a tech career is a big s...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity and inclusion...,codeup strongli valu divers and inclus in hono...,codeup strongli valu diver and inclus in honor...
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...,with many companies switching to cloud service...,with mani compani switch to cloud servic and i...,with mani compani switch to cloud servic and i...
