# Problem 1

In [1]:
# import libiaries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
import spacy
import os
import re
import time
import multiprocessing
import glob

[nltk_data] Downloading package punkt to /Users/zazhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zazhu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# get path
path = os.getcwd() + '/20_newsgroups/sci.space/*'
files = glob.glob(path)

# read txt files into a nested list
sci_space = []
for i in files:
    with open(i, 'r', encoding="utf8", errors="ignore") as f:
        data = f.read()
        sci_space.append(data)

## Functions

In [6]:
# nltk word tokenization
def nltk_word_tokenization(data):
    tokenized_word = []
    for d in data:
        words = word_tokenize(d)
        for word in words:
            tokenized_word.append(word)
    return tokenized_word

# nltk sentence tokenization
def nltk_sentence_tokenization(data):
    tokenized_sent = []
    for d in data:
        sents = sent_tokenize(d)
        for sent in sents:
            tokenized_sent.append(sent)
    return tokenized_sent

# nltk Stemming
def nltk_stemming(tokenized_word):
    stemmed_word = []
    ps = PorterStemmer()
    for w in tokenized_word:
        stemmed_word.append(ps.stem(w))
    return stemmed_word

# nltk POS tagging 
def nltk_pos_tagging(tokenized_word):
    pos_tags=nltk.pos_tag(tokenized_word)
    return pos_tags

# spacy sentence tokenization
def spacy_sentence_tokenization(data):
    nlp = spacy.load('en_core_web_sm', disable=["parser", "tagger", "ner"])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    tokenized_sent = []
    for d in data:
        doc = nlp(d)
        sents = [sent.string.strip() for sent in doc.sents]
        for sent in sents:
            tokenized_sent.append(sent)
    return tokenized_sent    
            
# spacy word tokenization
def spacy_word_tokenization(data):
    nlp = spacy.load('en_core_web_sm', disable=["parser", "tagger", "ner"])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    tokenized_word = []
    for d in data:
        doc = nlp(d)
        words = [token.text for token in doc]
        for word in words:
            tokenized_word.append(word)
    return tokenized_word

# spacy pos tagging
def spacy_pos_tagging(data):
    nlp = spacy.load('en_core_web_sm',disable=["parser", "ner"])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    for d in data:
        doc = nlp(d)          
        pos_tags = [[token.text,token.pos_]for token in doc]
    return pos_tags

## Using `nltk` w/o  parallelization

In [7]:
# running time for applyung tokenization, stemming and POS tagging using nltk
start = time.time()
result_token_word = nltk_word_tokenization(sci_space)
print ("Word tokenization using nltk takes: %s"%(time.time()-start))

start = time.time()
result_tokwn_sent = nltk_sentence_tokenization(sci_space)
print ("Sentence tokenization using nltk takes: %s"%(time.time()-start))

start = time.time()
result_stem = nltk_stemming(result_token_word)
print ("Word stemming using nltk takes: %s"%(time.time()-start))

start = time.time()
result_pos_tag = nltk_pos_tagging(result_token_word)
print ("POS tagging using nltk takes: %s"%(time.time()-start))

Word tokenization using nltk takes: 2.687723159790039
Sentence tokenization using nltk takes: 0.9483108520507812
Word stemming using nltk takes: 4.678045988082886
POS tagging using nltk takes: 12.345778942108154


## Using `spacy` w/o  parallelization

In [8]:
# running time for applyung tokenization, stemming and POS tagging using spacy
start = time.time()
result_token_word = spacy_word_tokenization(sci_space)
print ("Word tokenization using spacy takes: %s"%(time.time()-start))

start = time.time()
result_tokwn_sent = spacy_sentence_tokenization(sci_space)
print ("Sentence tokenization using spacy takes: %s"%(time.time()-start))

start = time.time()
result_pos_tag = spacy_pos_tagging(sci_space)
print ("POS tagging using spacy takes: %s"%(time.time()-start))

Word tokenization using spacy takes: 2.8109638690948486
Sentence tokenization using spacy takes: 2.8330349922180176
POS tagging using spacy takes: 12.378584861755371


## Compare parallelization between `nltk` and  `spacy` 

In [23]:
def nltk_parallelization(data):
    word = nltk_word_tokenization(data)
    sent = nltk_sentence_tokenization(data)
    tag = nltk_pos_tagging(word)
    return (word,sent,tag)

def spacy_parallelization(data):
    nlp = spacy.load('en_core_web_sm',disable=["parser", "ner"])
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    tokenized_word = []
    tokenized_sent = []
    for doc in nlp.pipe(data, n_threads = -1):       
        pos_tags = [[token.text,token.pos_]for token in doc]
        sents = [sent.string.strip() for sent in doc.sents]
        words = [token.text for token in doc]
        for word in words:
            tokenized_word.append(word)
        for sent in sents:
            tokenized_sent.append(sent)         
    return (tokenized_word,tokenized_sent,pos_tags)

In [24]:
count = multiprocessing.cpu_count()
pool = multiprocessing.Pool(count)
start = time.time()
result = pool.map(nltk_parallelization, sci_space)
print ("Parallelization using nltk takes: %s"%(time.time()-start))

Parallelization using nltk takes: 44.20225214958191


In [26]:
start = time.time()
result = spacy_parallelization(sci_space)
print ("Spacy using threadding takes: %s"%(time.time()-start))

Spacy using threadding takes: 14.250606298446655


In [16]:
count = multiprocessing.cpu_count()
pool = multiprocessing.Pool(count)
start = time.time()
result = pool.map(spacy_parallelization, sci_space)
print ("Parallelization using spacy takes: %s"%(time.time()-start))

Parallelization using spacy takes: 107.56812691688538


# Problem 2

In [65]:
# emails
tokens = nltk_sentence_tokenization(sci_space)
emails = []
for sentence in tokens:
    email = re.findall(r'[\w\.-]+@[\w-]+\.[\w\.-]+', sentence)
    for e in email:
        emails.append(e)
print(emails)

['gnb@leo.bby.com.au', 'gene@theporch.raider.net', '1993Apr20.001757.7543@bby.com.au', 'usenet@bby.com.au', '6ZV82B2w165w@theporch.raider.net', '6ZV82B2w165w@theporch.raider.net', 'gene@theporch.raider.net', 'gnb@bby.com.au', 'jkatz@access.digex.com', 'jid@access.digex.net', 'palmer@cco.caltech.edu', '1rs8hlINN8he@gap.caltech.edu', '23APR199316425663@kelvin.jpl.nasa.gov', '3t75nhg@rpi.edu', '1993Apr29.201036.11256@den.mmc.com', 'C6A2At.E9z@zoo.toronto.edu', '9lp@access.digex.net', 'prb@access.digex.net', 'C6A2At.E9z@zoo.toronto.edu', 'henry@zoo.toronto.edu', 'palmer@alumni.caltech.edu', 'palmer@tgrs.gsfc.nasa.gov', 'henry@zoo.toronto.edu', 'C6AzE5.sF@zoo.toronto.edu', 'k00@wraith.cs.uow.edu.au', 'k00@wraith.cs.uow.edu.au', 'u9152083@wraith.cs.uow.edu.au', 'henry@zoo.toronto.edu', 'jmcocker@eos.ncsu.edu', '1993Apr15.190725.16117@ncsu.edu', 'jmcocker@c00068-100lez.eos.ncsu.edu', 'jmcocker@eos.ncsu.edu', 'jmcocker@eos.ncsu.edu', 'jmcocker@eos.ncsu.edu', 'kjenks@jsc.nasa.gov', '1993Apr26.1

In [62]:
# Dates pattern: 
# 1. May 12, 1996 
# 2. 05/12/1996
# 3. 1996/5/12
# 4. 1996May12
# 5. 12 May 1996

dates = []
for sentence in tokens:
    date = re.findall(r'[JFMAJSOND][a-z]\w+[\s][\d]{1,2}[,][\s][\d]{2,4}|[\d]{1,2}/[\d]{1,2}/[\d]{2,4}|[\d]{1,2}/[\d]{1,2}/[\d]{2,4}|[\d]{4}[JFMASOND][a-z]\w+[\d]{2}|[\d]{1,2}[\s][JFMASOND][a-z]\w+[\s][\d]{2,4}',sentence)
    for d in date:
        dates.append(d) 
print(dates)

['18 Apr 1993', '1993Apr20', '20 Apr 1993', '21 Apr 1993', '30 Apr 1993', '1993Apr29', '30 Apr 1993', '1993Apr15', '15 Apr 1993', '1993Apr26', '1993Apr23', '26 Apr 1993', '21 Apr 1993', '1993Apr20', '1993Apr21', '1993Apr21', '20 Apr 1993', '1993Apr21', '21 Apr 1993', '26 Apr 1993', '15 Apr 1993', 'April 15, 1993', '30 Apr 1993', '1993Apr15', '15 Apr 93', '30 Apr 1993', '21 Apr 93', '1993Apr20', '1993Apr20', '1993Apr20', '21 Apr 1993', '30 Apr 1993', '1993Apr30', '1993Apr30', '1993Apr30', '15 Apr 93', '1993Apr15', '1993Apr15', '1993Apr15', '1993Apr19', '19 Apr 1993', '1993Apr30', '30 Apr 93', '22 Apr 1993', '17 May 1993', '25 Apr 1993', '1993Apr21', '17 May 93', '1993May16', '16 May 1993', '16 May 1993', '19 Apr 93', '23 Apr 93', '1993Apr22', '22 Apr 93', '26 Mar 93', '30 Apr 1993', '1993Apr30', '1993Apr30', '18 May 1993', '1993May14', '1993May14', '1993Apr22', '22 Apr 93', '1993Apr22', '16 May 1993', '22 Apr 93', '1993Apr26', '1993Apr23', '26 Apr 1993', '29 Apr 93', '1993Apr28', '1993A