# Text processing and tokenization

In [22]:
import nltk as nl
import spacy as sp
import dask.dataframe as dd
from src.preprocess import preprocess as pp

## Processing

In [23]:
# Removing email addresses
import re
def remove_emails(text):
    """
    Removes emails from a given text entry
    """
    return re.sub(r'\S*@\S*\s?', '', text)

In [24]:
# Load data into data frame
df = dd.read_parquet('data/c1.parquet').partitions[0]
df

Unnamed: 0_level_0,id,content,type,title,authors,domain,url
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,int64,object,object,object,object,object,object
,...,...,...,...,...,...,...


In [25]:
df.head()

Unnamed: 0,id,content,type,title,authors,domain,url
0,2,"Life is an illusion, at least on a quantum lev...",rumor,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,express.co.uk,https://www.express.co.uk/news/science/738402/...
1,6,"Unfortunately, he hasn’t yet attacked her for ...",hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
2,7,The Los Angeles Police Department has been den...,hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
3,8,The White House has decided to quietly withdra...,hate,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",barenakedislam.com,http://barenakedislam.com/2017/12/24/more-winn...
4,9,“The time has come to cut off the tongues of t...,hate,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",barenakedislam.com,http://barenakedislam.com/2017/12/25/oh-trump-...


In [26]:
# Removing newline characters
def remove_newlines(text):
    """
    Removes newline characters from a given text entry
    """
    return re.sub(r'\n', '', text)

In [27]:
text = "\n This is a test \n"
print(remove_newlines(text))

 This is a test 


In [28]:
# Tokenization
def tokenize(text):
    """
    Tokenizes a given text entry
    """
    tokens = nl.wordpunct_tokenize(text)
    return list(filter(lambda x: x.isalnum(), tokens))

In [29]:
# Removing stopwords OBS! Have to download the model first (spacy download en_core_web_sm)
en = sp.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

def remove_stopwords(tokens):
    """
    Removes stopwords from a given list of tokens
    """
    return list(filter(lambda word: word not in stopwords, tokens))


In [30]:
# Stemming with spacy
import spacy as sp
en = sp.load('en_core_web_sm')
def lemmatize(tokens):
    """
    Stems a given list of tokens
    """
    return [en(word)[0].lemma_ for word in tokens]

In [31]:
tokens = tokenize(df.head(1).content.values[0])
lemmatize(tokens)

['life',
 'be',
 'an',
 'illusion',
 'at',
 'least',
 'on',
 'a',
 'quantum',
 'level',
 'in',
 'a',
 'theory',
 'which',
 'have',
 'recently',
 'be',
 'confirm',
 'by',
 'a',
 'set',
 'of',
 'researcher',
 'they',
 'finally',
 'have',
 'the',
 'mean',
 'to',
 'test',
 'John',
 'wheeler',
 's',
 'delay',
 'choice',
 'theory',
 'and',
 'conclude',
 'that',
 'the',
 'physicist',
 'be',
 'right',
 'in',
 '1978',
 'Mr',
 'wheeler',
 's',
 'propose',
 'experiment',
 'involve',
 'a',
 'move',
 'object',
 'that',
 'be',
 'give',
 'the',
 'choice',
 'to',
 'act',
 'like',
 'a',
 'wave',
 'or',
 'a',
 'particle',
 'the',
 'former',
 'act',
 'as',
 'a',
 'vibration',
 'with',
 'a',
 'frequency',
 'that',
 'can',
 'distinguish',
 'it',
 'from',
 'other',
 'wave',
 'and',
 'the',
 'latter',
 'have',
 'no',
 'frequency',
 'that',
 'you',
 'can',
 'determine',
 'its',
 'position',
 'in',
 'space',
 'unlike',
 'a',
 'wave',
 'and',
 'at',
 'what',
 'point',
 'do',
 'it',
 'decide',
 'to',
 'act',
 'l

In [32]:
# Stemming with nltk
from nltk.stem.porter import *
stemmer = PorterStemmer()
def stem(tokens):
    """
    Stems a given list of tokens
    """
    return list(map(lambda word: stemmer.stem(word), tokens))


In [37]:
# Combining all preprocessing steps
def preprocess(text):
    """
    Combines all preprocessing steps
    """
    text = remove_newlines(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return str(text)

In [36]:
preprocess(df.head(1).content.values[0])

"['Life', 'illusion', 'quantum', 'level', 'theory', 'recently', 'confirmed', 'set', 'researchers', 'They', 'finally', 'means', 'test', 'John', 'Wheeler', 's', 'delayed', 'choice', 'theory', 'concluded', 'physicist', 'right', 'In', '1978', 'Mr', 'Wheeler', 's', 'proposed', 'experiment', 'involved', 'moving', 'object', 'given', 'choice', 'act', 'like', 'wave', 'particle', 'acting', 'vibration', 'frequency', 'distinguish', 'waves', 'having', 'frequency', 'determine', 'position', 'space', 'unlike', 'wave', 'point', 'decide', 'act', 'like', 'At', 'time', 'technology', 'available', 'conduct', 'strong', 'experiment', 'scientists', 'able', 'carry']"

In [17]:
# Get a small subset of the df
# df = df.assign(cleaned_text=df.content.apply(preprocess, meta=('content', 'str')))
# df.head()

KeyboardInterrupt: 

## Fulfilling task #1

In [38]:
# Get sample
from src.preprocess import preprocess as pp
df = pp.import_data(sample = True)
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary
0,0,0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,Sometimes the power of Christmas will make you...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,,[''],,,
1,1,1,256,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,,[''],,,
2,2,2,700,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,Never Hike Alone: A Friday the 13th Fan Film U...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Never Hike Alone - A Friday the 13th Fan Film ...,,,[''],Never Hike Alone: A Friday the 13th Fan Film ...,,
3,3,3,768,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,"When a rare shark was caught, scientists were ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,,[''],,,
4,4,4,791,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,Donald Trump has the unnerving ability to abil...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,,[''],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,245,39259,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2017/12/priso...,"Prison for Rahm, God’s Work And Many Others\n\...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"Prison for Rahm, God’s Work And Many Others",,,[''],,,
246,246,246,39468,beforeitsnews.com,fake,http://beforeitsnews.com/diy/2017/11/4-useful-...,4 Useful Items for Your Tiny Home\n\nHeadline:...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,4 Useful Items for Your Tiny Home,Dimitry K,,[''],,,
247,247,247,39477,www.newsmax.com,,https://www.newsmax.com/politics/michael-hayde...,Former CIA Director Michael Hayden said Thursd...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Michael Hayden: We Should Be 'Frightened' by T...,Todd Beamon,,"['michael hayden', 'sthole countries', 'daca',...",President Donald Trump's reported remarks abou...,"Homeland Security, Trump Administration, Immig...",
248,248,248,39550,www.newsmax.com,,https://www.newsmax.com/newsfront/antonio-saba...,Antonio Sabato Jr. says Hollywood's liberal el...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Antonio Sabato Jr.: It's Oprah or Bust for Hol...,Bill Hoffmann,,"['antonio sabato jr', 'oprah winfrey', 'presid...",Antonio Sabato Jr. says Hollywood's liberal el...,"Trump Administration, ISIS/Islamic State, News...",


In [48]:
# Basic data treatment
df['cleaned_content'] = df.content.apply(remove_newlines)
df['cleaned_content'] = df.cleaned_content.apply(tokenize)
df.cleaned_content.iloc[0]

['Sometimes',
 'the',
 'power',
 'of',
 'Christmas',
 'will',
 'make',
 'you',
 'do',
 'wild',
 'and',
 'wonderful',
 'things',
 'You',
 'do',
 'not',
 'need',
 'to',
 'believe',
 'in',
 'the',
 'Holy',
 'Trinity',
 'to',
 'believe',
 'in',
 'the',
 'positive',
 'power',
 'of',
 'doing',
 'good',
 'for',
 'others',
 'The',
 'simple',
 'act',
 'of',
 'giving',
 'without',
 'receiving',
 'is',
 'lost',
 'on',
 'many',
 'of',
 'us',
 'these',
 'days',
 'as',
 'worries',
 'about',
 'money',
 'and',
 'success',
 'hold',
 'us',
 'back',
 'from',
 'giving',
 'to',
 'others',
 'who',
 'are',
 'in',
 'need',
 'One',
 'congregation',
 'in',
 'Ohio',
 'was',
 'moved',
 'to',
 'action',
 'by',
 'the',
 'power',
 'of',
 'a',
 'sermon',
 'given',
 'at',
 'their',
 'church',
 'on',
 'Christmas',
 'Eve',
 'The',
 'pastor',
 'at',
 'Grand',
 'Lake',
 'United',
 'Methodist',
 'Church',
 'in',
 'Celina',
 'Ohio',
 'gave',
 'an',
 'emotional',
 'sermon',
 'about',
 'the',
 'importance',
 'of',
 'understan

In [51]:
# Compute size of vocabulary
from collections import Counter
vocab = Counter()
for text in df.cleaned_content:
    vocab.update(text)
len(vocab)

19847

In [52]:
# Stem the words in cleaned content
df['cleaned_content'] = df.cleaned_content.apply(stem)
df.cleaned_content.iloc[0]

['sometim',
 'the',
 'power',
 'of',
 'christma',
 'will',
 'make',
 'you',
 'do',
 'wild',
 'and',
 'wonder',
 'thing',
 'you',
 'do',
 'not',
 'need',
 'to',
 'believ',
 'in',
 'the',
 'holi',
 'triniti',
 'to',
 'believ',
 'in',
 'the',
 'posit',
 'power',
 'of',
 'do',
 'good',
 'for',
 'other',
 'the',
 'simpl',
 'act',
 'of',
 'give',
 'without',
 'receiv',
 'is',
 'lost',
 'on',
 'mani',
 'of',
 'us',
 'these',
 'day',
 'as',
 'worri',
 'about',
 'money',
 'and',
 'success',
 'hold',
 'us',
 'back',
 'from',
 'give',
 'to',
 'other',
 'who',
 'are',
 'in',
 'need',
 'one',
 'congreg',
 'in',
 'ohio',
 'wa',
 'move',
 'to',
 'action',
 'by',
 'the',
 'power',
 'of',
 'a',
 'sermon',
 'given',
 'at',
 'their',
 'church',
 'on',
 'christma',
 'eve',
 'the',
 'pastor',
 'at',
 'grand',
 'lake',
 'unit',
 'methodist',
 'church',
 'in',
 'celina',
 'ohio',
 'gave',
 'an',
 'emot',
 'sermon',
 'about',
 'the',
 'import',
 'of',
 'understand',
 'the',
 'messag',
 'of',
 'jesu',
 'for',


In [53]:
# Recompute size of vocabulary
vocab = Counter()
for text in df.cleaned_content:
    vocab.update(text)
len(vocab)

11389

## Running processor across entire dataset and outputting cleaned parquet files

In [16]:
from dask.distributed import Client, LocalCluster

In [17]:
# Create a local cluster
cluster = LocalCluster(n_workers=2, threads_per_worker=4, memory_limit='6GB')
client = Client(cluster)
client

2023-03-09 16:49:41,743 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-fu3sx2es', purging
2023-03-09 16:49:41,744 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-ng_x6lq0', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 8,Total memory: 11.18 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39899,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 11.18 GiB

0,1
Comm: tcp://127.0.0.1:35835,Total threads: 4
Dashboard: http://127.0.0.1:42847/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:37433,
Local directory: /tmp/dask-worker-space/worker-8au_4usp,Local directory: /tmp/dask-worker-space/worker-8au_4usp

0,1
Comm: tcp://127.0.0.1:34883,Total threads: 4
Dashboard: http://127.0.0.1:38969/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:35607,
Local directory: /tmp/dask-worker-space/worker-u1f3w2im,Local directory: /tmp/dask-worker-space/worker-u1f3w2im


In [18]:
# Load in the dataframe
ddf = dd.read_parquet('data/c1.parquet')
ddf.persist()

Unnamed: 0_level_0,id,content,type,title,authors,domain,url
npartitions=31,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,int64,object,object,object,object,object,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [20]:
client.restart()
ddf.partitions[0].compute()



Unnamed: 0,id,content,type,title,authors,domain,url
0,2,"Life is an illusion, at least on a quantum lev...",rumor,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,express.co.uk,https://www.express.co.uk/news/science/738402/...
1,6,"Unfortunately, he hasn’t yet attacked her for ...",hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
2,7,The Los Angeles Police Department has been den...,hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
3,8,The White House has decided to quietly withdra...,hate,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",barenakedislam.com,http://barenakedislam.com/2017/12/24/more-winn...
4,9,“The time has come to cut off the tongues of t...,hate,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",barenakedislam.com,http://barenakedislam.com/2017/12/25/oh-trump-...
...,...,...,...,...,...,...,...
29995,33688,300 with Salmonella at Boise Co-op\n\n% of rea...,fake,300 with Salmonella at Boise Co-op,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29996,33689,CDC Coming to Washington to Help in 90 Person ...,fake,CDC Coming to Washington to Help in 90 Person ...,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29997,33690,Write for Food Safety News\n\n% of readers thi...,fake,Write for Food Safety News,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29998,33691,Cyclospora Sickens 358\n\n% of readers think t...,fake,Cyclospora Sickens 358,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...


In [21]:
# Apply preprocessing to the data
client.restart()
ddf = ddf.assign(cleaned_content=ddf.content.apply(preprocess, meta=('content', 'str')))
ddf.persist()



Unnamed: 0_level_0,id,content,type,title,authors,domain,url,cleaned_content
npartitions=31,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...


In [22]:
client.restart()
ddf.partitions[0].compute()

KeyboardInterrupt: 

In [None]:
#client.restart()
# ddf.to_parquet('data/test_cleaning', write_index=False)