In [25]:
import pandas as pd

# importing data from csv file, using the first row as column labels 'headers'
data = pd.read_csv('news_sample.csv', header = 0, index_col=0)

In [26]:
from cleantext import clean

# using the clean-text library to clean the text
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )
# the cleaned text is stored in a new column 'cleaned_content'
data['cleaned_content'] = data['content'].apply(clean_text)

In [27]:
import nltk
from nltk.tokenize import  word_tokenize # this tokenizer is arbitrary, but we can use it for now and analyze the results later
#nltk.download('punkt')

data['processed_content'] = data['cleaned_content'].apply(lambda t : (word_tokenize(t)))
word_frq_pre_stopwords_removal = nltk.FreqDist(data['processed_content'].sum())

In [28]:
from nltk.corpus import stopwords
# the following line must be uncommented first time this package is used in the environment
#nltk.download('stopwords')

def remove_stopwords(s):
    # the stopwords from the library are put in a set for faster lookup
    words_to_remove = set(stopwords.words('english')) # this line makes the code ~400x faster!
    # checking each indiviudal token to see if its in the set of stopwords
    return [w for w in s if w not in words_to_remove]
# the stepwordless text is stored in a new column 'processed_content' overriding the previous data
data['processed_content'] = data['processed_content'].apply(remove_stopwords)
word_frq_post_stopwords_removal = nltk.FreqDist(data['processed_content'].sum())

In [29]:
print('vocab size before removing stopwords', len(word_frq_pre_stopwords_removal))
print('token number before removing stopwords', sum(word_frq_pre_stopwords_removal.values()))
print('vocab size after removing stopwords', len(word_frq_post_stopwords_removal))
print('token number after removing stopwords', sum(word_frq_post_stopwords_removal.values()))
print('vocab size reduction', (len(word_frq_pre_stopwords_removal) - len(word_frq_post_stopwords_removal)) / len(word_frq_pre_stopwords_removal) * 100, '%')
print('token number reduction', (sum(word_frq_pre_stopwords_removal.values()) - sum(word_frq_post_stopwords_removal.values())) / sum(word_frq_pre_stopwords_removal.values()) * 100, '%')


vocab size before removing stopwords 16586
token number before removing stopwords 200995
vocab size after removing stopwords 16454
token number after removing stopwords 126763
vocab size reduction 0.7958519233088147 %
token number reduction 36.93226199656708 %


In [30]:
from nltk.stem import PorterStemmer # this stemmer is arbitrary, but we can use it for now and analyze the results later
# the following line must be uncommented first time this package is used in the environment
#nltk.download('wordnet')

# the stemmer reduces the tokens (words) to their root form
def stem_words(l):
    ps = PorterStemmer()
    return [ps.stem(w) for w in l]
# the stemmed text is stored in the column 'processed_content' overriding the previous data
data['processed_content'] = data['processed_content'].apply(stem_words)

In [31]:
word_frq_post_stemming = nltk.FreqDist(data['processed_content'].sum())
print('vocab reduction after stemming', (len(word_frq_post_stopwords_removal) - len(word_frq_post_stemming)) / len(word_frq_post_stopwords_removal) * 100, '%')

vocab reduction after stemming 31.445241278716423 %


In [33]:
data.to_csv('news_sample_processed.csv')

In [35]:
data

Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,cleaned_content,processed_content
0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,Sometimes the power of Christmas will make you...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,,[''],,,,sometimes the power of christmas will make you...,"[sometim, power, christma, make, wild, wonder,..."
1,256,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,,[''],,,,"awakening of <num> strands of dna ""reconnectin...","[awaken, <, num, >, strand, dna, ``, reconnect..."
2,700,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,Never Hike Alone: A Friday the 13th Fan Film U...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Never Hike Alone - A Friday the 13th Fan Film ...,,,[''],Never Hike Alone: A Friday the 13th Fan Film ...,,,never hike alone: a friday the 13th fan film u...,"[never, hike, alon, :, friday, 13th, fan, film..."
3,768,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,"When a rare shark was caught, scientists were ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,,[''],,,,"when a rare shark was caught, scientists were ...","[rare, shark, caught, ,, scientist, left, blun..."
4,791,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,Donald Trump has the unnerving ability to abil...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,,[''],,,,donald trump has the unnerving ability to abil...,"[donald, trump, unnerv, abil, abil, creat, rea..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,39259,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2017/12/priso...,"Prison for Rahm, God’s Work And Many Others\r\...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"Prison for Rahm, God’s Work And Many Others",,,[''],,,,"prison for rahm, god's work and many others\nh...","[prison, rahm, ,, god, 's, work, mani, other, ..."
246,39468,beforeitsnews.com,fake,http://beforeitsnews.com/diy/2017/11/4-useful-...,4 Useful Items for Your Tiny Home\r\n\r\nHeadl...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,4 Useful Items for Your Tiny Home,Dimitry K,,[''],,,,<num> useful items for your tiny home\nheadlin...,"[<, num, >, use, item, tini, home, headlin, :,..."
247,39477,www.newsmax.com,,https://www.newsmax.com/politics/michael-hayde...,Former CIA Director Michael Hayden said Thursd...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Michael Hayden: We Should Be 'Frightened' by T...,Todd Beamon,,"['michael hayden', 'sthole countries', 'daca',...",President Donald Trump's reported remarks abou...,"Homeland Security, Trump Administration, Immig...",,former cia director michael hayden said thursd...,"[former, cia, director, michael, hayden, said,..."
248,39550,www.newsmax.com,,https://www.newsmax.com/newsfront/antonio-saba...,Antonio Sabato Jr. says Hollywood's liberal el...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Antonio Sabato Jr.: It's Oprah or Bust for Hol...,Bill Hoffmann,,"['antonio sabato jr', 'oprah winfrey', 'presid...",Antonio Sabato Jr. says Hollywood's liberal el...,"Trump Administration, ISIS/Islamic State, News...",,antonio sabato jr. says hollywood's liberal el...,"[antonio, sabato, jr., say, hollywood, 's, lib..."
