In [1]:
import nltk
import os
import glob
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer
import time

In [2]:
path = 'alt.atheism'

In [3]:
dirs = os.listdir(path)

In [4]:
os.chdir(path)

d = []
for file in glob.glob("*"):
    with open(file, 'r', encoding="utf8", errors="ignore") as f:
        data = f.read()
    d.append(data)

In [5]:
contents = ''.join(d)

## NLTK without Parallelization

In [6]:
def nltk_tokenize(contents, level = 'word'):
    if level == 'word':
        return word_tokenize(contents)
    else:
        return sent_tokenize(contents)

### - Tokenization

In [7]:
start_time = time.time() 
words = nltk_tokenize(contents, level = 'word')
sents = nltk_tokenize(contents, level = 'sent')
print('Tokenization (both word and sentence level) in nltk for this corpus is :', time.time() - start_time )

Tokenization (both word and sentence level) in nltk for this corpus is : 4.29943323135376


In [8]:
def stemming(contents):
    ps = PorterStemmer() 
    words = word_tokenize(contents)
    stem = {}
    for word in words:
        stem[word] = ps.stem(word)
#         stem.append(word+':'+ps.stem(word))
    return stem

### - Stemming

In [9]:
start_time = time.time()
stems = stemming(contents)
print('Stemming in nltk took:', time.time() - start_time )

Stemming in nltk took: 8.434539794921875


In [10]:
def pos_tagging(contents):
    words =  word_tokenize(contents)
    nltk_pos = nltk.pos_tag(words)
    return nltk_pos

### - POS Tagging

In [11]:
start_time = time.time()
nltk_pos = pos_tagging(contents)
print ("POS tagging takes %s"%(time.time()-start_time))

POS tagging takes 18.19748091697693


## Parallelization

In [12]:
from joblib import Parallel, delayed

### - Tokenization with Parallelization in NLTK

In [14]:
start_time = time.time()
words = Parallel(n_jobs=3)(delayed(nltk_tokenize)(i) for i in d)
sents = Parallel(n_jobs=3)(delayed(nltk_tokenize)(i, 'sent') for i in d)
print('Tokenization (both word and sentence level) in nltk with tokenization for this corpus is %s'%(time.time()-start_time))

Tokenization (both word and sentence level) in nltk with tokenization for this corpus is 2.3760998249053955


## Spacy

### - Tokenization

In [15]:
import spacy

In [16]:
def tokenization(contents):
    nlp = spacy.load('en_core_web_sm') 
    nlp.max_length = 5000000
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(contents)
    sents = [sent.string.strip() for sent in doc.sents]
    words = [token.text for token in doc]
    return sents, words

In [17]:
start_time = time.time()
sents, words = tokenization(contents)
print ("Tokneization takes %s"%(time.time()-start_time))

Tokneization takes 83.17936277389526


### - Stemming

Spacy doesn't currently support stemming. We will use lemmatization instead. 

In [18]:
def lemmatization(contents):
    nlp = spacy.load('en_core_web_sm')
    nlp.max_length = 5000000
    doc = nlp(contents)
    stem = {}
    for token in doc:
        stem[token] = token.lemma_
        
    return stem

In [19]:
start_time = time.time()
stem = lemmatization(contents)
print ("Lemmatization takes %s"%(time.time()-start_time))

Lemmatization takes 88.59071898460388


### - POS Tagging

In [20]:
def pos_tagging(contents):
    nlp = spacy.load('en_core_web_sm') 
    nlp.max_length = 5000000
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(contents)
    words = [token.text for token in doc]
    pos = {word:word.pos_ for word in doc}
    return pos

In [21]:
start_time = time.time()
pos = pos_tagging(contents)
print ("POS tagging takes %s"%(time.time()-start_time))

POS tagging takes 89.37744522094727


### - Tokenization with Parallelization in Spacy

In [22]:
start_time = time.time()
words = Parallel(n_jobs=3)(delayed(tokenization)(i) for i in d)
print('Tokenization (both word and sentence level) in Spacy with tokenization for this corpus is :', time.time()-start_time)

Tokenization (both word and sentence level) in Spacy with tokenization for this corpus is : 199.79079794883728


## REGEX Matching

### Email Matching

In [23]:
import re

email = re.findall(r'[A-Za-z0-9_\-\.]+\@[A-Za-z0-9_\-\.]+\.[A-Za-z0-9_\-\.]+', contents)

print(email[:100])

['p00261@psilink.com', '930416.141520.7h1.rusnews.w165w@mantis.co.uk', '2944079995.1.p00261@psilink.com', 'usenet@worldlink.com', 'mathew@mantis.co.uk', 'decay@cbnewsj.cb.att.com', 'C63AEC.FB3@cbnewsj.cb.att.com', 'bissda.4.734849678@saturn.wwc.edu', 'madhausC5yD87.KIp@netcom.com', 'madhausC5yD87.KIp@netcom.com', 'madhaus@netcom.com', 'healta@saturn.wwc.edu', 'jimh@carson.u.washington.edu', '1r0rmtINNk5n@shelley.u.washington.edu', 'timmbake@mcl.ucsb.edu', 'a137490@lehtori.cc.tut.fi', 'hdq@cc.tut.fi', '1993Apr10.191100.16094@ultb.isc.rit.edu', '1993Apr10.191100.16094@ultb.isc.rit.edu', 'snm6394@ultb.isc.rit.edu', 'a137490@cc.tut.fi', 'aaron@minster.york.ac.uk', '735563729.1016@minster.york.ac.uk', '1993Apr21.171937.2489@daffy.cs.wisc.edu', 'mccullou@snake10.cs.wisc.edu', 'west@next02cville.wam.umd.edu', '1993Apr6.021635.20958@wam.umd.edu', 'usenet@wam.umd.edu', 'west@next02.wam.umd.edu', 'kmr4.1433.734039535@po.CWRU.edu', 'kmr4.1433.734039535@po.CWRU.edu', 'kmr4@po.CWRU.edu', '1993Apr5.

### Dates

In [24]:
dates_1 = re.findall(r'[0-9]+/[0-9]+/[0-9]+', contents)

In [25]:
dates_1

['3/18/93', '3/31/93', '3/31/93', '4/3/93']

In [26]:
dates_2 = re.findall(r'[0-9]{4}\-[0-9]{2}\-[0-9]{2}', contents)

In [27]:
dates_2

[]

In [28]:
dates_3 = re.findall(r'\d+ [JFAMSOND]\w+ \d+', contents)

In [29]:
dates_3[:10]

['16 Apr 1993',
 '16 Apr 1993',
 '26 Apr 1993',
 '20 Apr 1993',
 '15 Apr 1993',
 '23 Apr 1993',
 '6 Apr 1993',
 '17 Apr 1993',
 '21 Apr 93',
 '21 Apr 1993']

In [30]:
dates_4 = re.findall(r'[JFAMSOND]\w+ \d+[th|st]+ \d+', contents)

In [31]:
dates_4

['February 17th 1992', 'February 21st 1989']

In [32]:
all_dates = dates_1 + dates_2 + dates_3 + dates_4

In [33]:
len(all_dates)

1101