In [1]:
import pandas as pd

# importing data from csv file, using the first row as column labels 'headers'
data = pd.read_csv('news_sample.csv', header = 0)

In [2]:
from cleantext import clean

# using the clean-text library to clean the text
def clean_text(s):
    """Normalize one piece of text with the clean-text library.

    The text is lowercased, and URLs, e-mail addresses and numbers are
    replaced by the placeholder tokens "<URL>", "<EMAIL>" and "<NUM>".
    Returns whatever ``clean`` returns for these options.
    """
    cleaning_options = dict(
        lower=True,                    # lowercase text
        no_urls=True,                  # swap every URL for a placeholder ...
        replace_with_url="<URL>",      # ... namely this one
        no_emails=True,                # swap every e-mail address for a placeholder ...
        replace_with_email="<EMAIL>",
        no_numbers=True,               # swap every number for a placeholder ...
        replace_with_number="<NUM>",
        lang="en",
    )
    return clean(s, **cleaning_options)
# run clean_text over every entry of 'content'; the result goes into a new column 'cleaned_content'
data['cleaned_content'] = data['content'].map(clean_text)

In [3]:
import nltk
from nltk.tokenize import  word_tokenize # this tokenizer is arbitrary, but we can use it for now and analyze the results later
#nltk.download('punkt')

# Tokenize every cleaned document; word_tokenize is passed directly, no wrapper lambda is needed.
data['processed_content'] = data['cleaned_content'].apply(word_tokenize)
# Count tokens by streaming over the per-row token lists.
# Series.sum() would instead concatenate the lists pairwise, copying the
# growing list at every step (quadratic in corpus size), and would return
# the integer 0 for an empty column.
word_frq_pre_stopwords_removal = nltk.FreqDist(
    token for tokens in data['processed_content'] for token in tokens
)

In [4]:
from nltk.corpus import stopwords
# the following line must be uncommented first time this package is used in the environment
#nltk.download('stopwords')

def remove_stopwords(s, words_to_remove=None):
    """Return the tokens of ``s`` that are not stopwords.

    Parameters
    ----------
    s : iterable of str
        Tokens of one document.
    words_to_remove : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    list of str
        The tokens of ``s`` in their original order, without the removed words.
    """
    if words_to_remove is None:
        # This function is applied once per row, so the default stopword set
        # is built on the first call only and reused afterwards instead of
        # being reloaded and rebuilt for every row.
        words_to_remove = getattr(remove_stopwords, '_default_stopwords', None)
        if words_to_remove is None:
            # a (frozen)set gives constant-time membership tests in the filter below
            words_to_remove = frozenset(stopwords.words('english'))
            remove_stopwords._default_stopwords = words_to_remove
    # check each individual token against the stopword collection
    return [w for w in s if w not in words_to_remove]
# the stopword-free tokens are stored in the column 'processed_content', overriding the previous data
data['processed_content'] = data['processed_content'].apply(remove_stopwords)
# Count tokens by streaming over the per-row token lists; Series.sum() would
# concatenate the lists pairwise, copying the growing list at every step.
word_frq_post_stopwords_removal = nltk.FreqDist(
    token for tokens in data['processed_content'] for token in tokens
)

In [5]:
# Compute each statistic once instead of re-deriving it inside every print call.
# Vocabulary size = number of distinct tokens; token number = sum of all counts.
vocab_size_pre = len(word_frq_pre_stopwords_removal)
token_count_pre = sum(word_frq_pre_stopwords_removal.values())
vocab_size_post = len(word_frq_post_stopwords_removal)
token_count_post = sum(word_frq_post_stopwords_removal.values())

print('vocab size before removing stopwords', vocab_size_pre)
print('token number before removing stopwords', token_count_pre)
print('vocab size after removing stopwords', vocab_size_post)
print('token number after removing stopwords', token_count_post)
# Reductions are percentages relative to the pre-removal figures.
print('vocab size reduction', (vocab_size_pre - vocab_size_post) / vocab_size_pre * 100, '%')
print('token number reduction', (token_count_pre - token_count_post) / token_count_pre * 100, '%')


vocab size before removing stopwords 16488
token number before removing stopwords 201978
vocab size after removing stopwords 16356
token number after removing stopwords 127639
vocab size reduction 0.8005822416302766 %
token number reduction 36.80549366762717 %


In [7]:
# import lemmatizer
from nltk.stem import WordNetLemmatizer
def lemmatize(s):
    """Lemmatize every token in ``s`` with NLTK's WordNetLemmatizer.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments; the results are returned as a list in the same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, s))
# lemmatize every row; the result replaces the previous contents of 'processed_content'
data['processed_content'] = data['processed_content'].map(lemmatize)


In [9]:
# Count tokens by streaming over the per-row token lists; Series.sum() would
# concatenate the lists pairwise, copying the growing list at every step.
word_frq_post_lemmatization = nltk.FreqDist(
    token for tokens in data['processed_content'] for token in tokens
)
# Percentage of the post-stopword-removal vocabulary that lemmatization merged away.
print('vocab reduction after lemmatization', (len(word_frq_post_stopwords_removal) - len(word_frq_post_lemmatization)) / len(word_frq_post_stopwords_removal) * 100, '%')

vocab reduction after lemmatization 10.613842015162632 %


In [20]:
from nltk.stem import PorterStemmer # this stemmer is arbitrary, but we can use it for now and analyze the results later
# the following line must be uncommented the first time the WordNet corpus is used in the environment;
# the corpus is needed by WordNetLemmatizer (PorterStemmer itself requires no download)
#nltk.download('wordnet')

# the stemmer reduces the tokens (words) to their root form
def stem_words(l):
    """Stem every token in ``l`` with NLTK's PorterStemmer.

    Returns a new list holding the stem of each token, in the same order.
    """
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for token in l:
        stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens
# stem every row; the result replaces the previous contents of 'processed_content'
data['processed_content'] = data['processed_content'].map(stem_words)

In [21]:
# Count tokens by streaming over the per-row token lists; Series.sum() would
# concatenate the lists pairwise, copying the growing list at every step.
word_frq_post_stemming = nltk.FreqDist(
    token for tokens in data['processed_content'] for token in tokens
)
# NOTE(review): the baseline is the post-stopword-removal vocabulary. If the cells
# are run top to bottom, 'processed_content' was already lemmatized before stemming,
# so this figure would be the combined effect of lemmatization + stemming - confirm.
print('vocab reduction after stemming', (len(word_frq_post_stopwords_removal) - len(word_frq_post_stemming)) / len(word_frq_post_stopwords_removal) * 100, '%')

vocab reduction after stemming 31.445241278716423 %
