# Text processing and tokenization

In [1]:
import dask
import nltk as nl
import spacy as sp
import dask.dataframe as dd
from src.preprocess import preprocess as pp

## Processing

In [2]:
# Removing email addresses
import re
def remove_emails(text):
    """
    Removes emails from a given text entry
    """
    return re.sub(r'\S*@\S*\s?', '', text)

In [3]:
# Load data into data frame
df = dd.read_parquet('data/c1.parquet').partitions[0]
df

Unnamed: 0_level_0,id,content,type,title,authors,domain,url
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,int64,object,object,object,object,object,object
,...,...,...,...,...,...,...


In [4]:
df.head()

Unnamed: 0,id,content,type,title,authors,domain,url
0,2,"Life is an illusion, at least on a quantum lev...",rumor,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,express.co.uk,https://www.express.co.uk/news/science/738402/...
1,6,"Unfortunately, he hasn’t yet attacked her for ...",hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
2,7,The Los Angeles Police Department has been den...,hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
3,8,The White House has decided to quietly withdra...,hate,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",barenakedislam.com,http://barenakedislam.com/2017/12/24/more-winn...
4,9,“The time has come to cut off the tongues of t...,hate,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",barenakedislam.com,http://barenakedislam.com/2017/12/25/oh-trump-...


In [5]:
# Removing newline characters
def remove_newlines(text):
    """
    Removes newline characters from a given text entry
    """
    return re.sub(r'\n', '', text)

In [6]:
text = "\n This is a test \n"
print(remove_newlines(text))

 This is a test 


In [7]:
# Tokenization
def tokenize(text):
    """
    Tokenizes a given text entry
    """
    tokens = nl.wordpunct_tokenize(text)
    return list(filter(lambda x: x.isalnum(), tokens))

In [8]:
# Removing stopwords OBS! Have to download the model first (spacy download en_core_web_sm)
en = sp.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

def remove_stopwords(tokens):
    """
    Removes stopwords from a given list of tokens
    """
    return list(filter(lambda word: word not in stopwords, tokens))


In [9]:
# Stemming with nltk
from nltk.stem.porter import *
stemmer = PorterStemmer()
def stem(tokens):
    """
    Stems a given list of tokens
    """
    return list(map(lambda word: stemmer.stem(word), tokens))


In [10]:
# Combining all preprocessing steps
def preprocess(text):
    """
    Combines all preprocessing steps
    """
    text = text
    text = remove_emails(text)
    text = remove_newlines(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    # text = stem(text)
    return str(text)

In [11]:
# Get a small subset of the df
df = df.assign(cleaned_text=df.content.apply(preprocess, meta=('content', 'str')))
df.head()

['Life', 'illusion', 'quantum', 'level', 'theory', 'recently', 'confirmed', 'set', 'researchers', 'They', 'finally', 'means', 'test', 'John', 'Wheeler', 's', 'delayed', 'choice', 'theory', 'concluded', 'physicist', 'right', 'In', '1978', 'Mr', 'Wheeler', 's', 'proposed', 'experiment', 'involved', 'moving', 'object', 'given', 'choice', 'act', 'like', 'wave', 'particle', 'acting', 'vibration', 'frequency', 'distinguish', 'waves', 'having', 'frequency', 'determine', 'position', 'space', 'unlike', 'wave', 'point', 'decide', 'act', 'like', 'At', 'time', 'technology', 'available', 'conduct', 'strong', 'experiment', 'scientists', 'able', 'carry']


In [12]:
# df['cleaned_content'] = preprocess(df.content)
# df.head()

In [13]:
# df.compute()

In [14]:
# df = dd.read_parquet('data/c1.parquet')
# df

In [15]:
# Use Dask to apply preprocessing to all entries in the data frame (only the content column)
# df = dd.read_parquet('data/c1.parquet')
# df = df.assign(cleaned_content=preprocess(str(df['content'])))
# df.head()

## Running processor across entire dataset and outputting cleaned parquet files

### Loading lazy corpues loader

In [16]:
from dask.distributed import Client, LocalCluster

In [17]:
# Create a local cluster
cluster = LocalCluster(n_workers=2, threads_per_worker=4, memory_limit='6GB')
client = Client(cluster)
client

2023-03-09 16:49:41,743 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-fu3sx2es', purging
2023-03-09 16:49:41,744 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-ng_x6lq0', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 8,Total memory: 11.18 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39899,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 11.18 GiB

0,1
Comm: tcp://127.0.0.1:35835,Total threads: 4
Dashboard: http://127.0.0.1:42847/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:37433,
Local directory: /tmp/dask-worker-space/worker-8au_4usp,Local directory: /tmp/dask-worker-space/worker-8au_4usp

0,1
Comm: tcp://127.0.0.1:34883,Total threads: 4
Dashboard: http://127.0.0.1:38969/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:35607,
Local directory: /tmp/dask-worker-space/worker-u1f3w2im,Local directory: /tmp/dask-worker-space/worker-u1f3w2im


In [18]:
# Load in the dataframe
ddf = dd.read_parquet('data/c1.parquet')
ddf.persist()

Unnamed: 0_level_0,id,content,type,title,authors,domain,url
npartitions=31,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,int64,object,object,object,object,object,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [19]:
ddf.partitions[0].compute()

Unnamed: 0,id,content,type,title,authors,domain,url
0,2,"Life is an illusion, at least on a quantum lev...",rumor,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,express.co.uk,https://www.express.co.uk/news/science/738402/...
1,6,"Unfortunately, he hasn’t yet attacked her for ...",hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
2,7,The Los Angeles Police Department has been den...,hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
3,8,The White House has decided to quietly withdra...,hate,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",barenakedislam.com,http://barenakedislam.com/2017/12/24/more-winn...
4,9,“The time has come to cut off the tongues of t...,hate,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",barenakedislam.com,http://barenakedislam.com/2017/12/25/oh-trump-...
...,...,...,...,...,...,...,...
29995,33688,300 with Salmonella at Boise Co-op\n\n% of rea...,fake,300 with Salmonella at Boise Co-op,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29996,33689,CDC Coming to Washington to Help in 90 Person ...,fake,CDC Coming to Washington to Help in 90 Person ...,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29997,33690,Write for Food Safety News\n\n% of readers thi...,fake,Write for Food Safety News,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29998,33691,Cyclospora Sickens 358\n\n% of readers think t...,fake,Cyclospora Sickens 358,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...


In [20]:
client.restart()
ddf.partitions[0].compute()



Unnamed: 0,id,content,type,title,authors,domain,url
0,2,"Life is an illusion, at least on a quantum lev...",rumor,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,express.co.uk,https://www.express.co.uk/news/science/738402/...
1,6,"Unfortunately, he hasn’t yet attacked her for ...",hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
2,7,The Los Angeles Police Department has been den...,hate,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",barenakedislam.com,http://barenakedislam.com/category/donald-trum...
3,8,The White House has decided to quietly withdra...,hate,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",barenakedislam.com,http://barenakedislam.com/2017/12/24/more-winn...
4,9,“The time has come to cut off the tongues of t...,hate,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",barenakedislam.com,http://barenakedislam.com/2017/12/25/oh-trump-...
...,...,...,...,...,...,...,...
29995,33688,300 with Salmonella at Boise Co-op\n\n% of rea...,fake,300 with Salmonella at Boise Co-op,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29996,33689,CDC Coming to Washington to Help in 90 Person ...,fake,CDC Coming to Washington to Help in 90 Person ...,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29997,33690,Write for Food Safety News\n\n% of readers thi...,fake,Write for Food Safety News,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...
29998,33691,Cyclospora Sickens 358\n\n% of readers think t...,fake,Cyclospora Sickens 358,Marler Blog,beforeitsnews.com,http://beforeitsnews.com/food-and-farming/2015...


In [21]:
# Apply preprocessing to the dataframe
client.restart()
ddf = ddf.assign(cleaned_content=ddf.content.apply(preprocess, meta=('content', 'str')))
ddf.persist()



Unnamed: 0_level_0,id,content,type,title,authors,domain,url,cleaned_content
npartitions=31,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...


In [22]:
ddf.partitions[0].compute()

KeyboardInterrupt: 

In [None]:
#client.restart()
# ddf.to_parquet('data/test_cleaning', write_index=False)