# Converting parsers

In [51]:
# import re

import nltk
import pandas as pd

from pathlib import Path

from nltk import bigrams
# from nltk import PorterStemmer as ps
# from nltk import SnowballStemmer as ss
# from nltk import WordNetLemmatizer as wn
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures as bigram_measures
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, wordpunct_tokenize


# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [2]:
%%time
# Load the pickled mail DataFrame from the input folder.
# NOTE(review): unpickling can execute arbitrary code - only read files from a trusted source.
file = Path('input') / 'enron_mails.p'
df = pd.read_pickle(file)

CPU times: user 283 ms, sys: 228 ms, total: 511 ms
Wall time: 515 ms


In [3]:
def get_punct_tokens(text):
    """Return the tokens NLTK's ``wordpunct_tokenize`` produces for ``text``."""
    return wordpunct_tokenize(text)


def get_tokens(text):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)


def get_bigrams(text):
    """Return the pairs yielded by ``nltk.bigrams`` as a list.

    Despite the parameter name, the notebook passes token lists
    (the ``tkp`` / ``tkn`` columns), not raw strings.
    """
    pairs = list(bigrams(text))
    return pairs


def get_freqs(text):
    """Return a ``FreqDist`` over the lower-cased items of ``text``.

    Despite the parameter name, the notebook passes token lists
    (the ``tkp`` / ``tkn`` columns), so each item is one token.
    """
    lowered = (token.lower() for token in text)
    return FreqDist(lowered)


def save_data(column, frame=None):
    """Pickle ``id`` together with ``column`` to ``output/email_nltk_<column>.pkl``.

    An existing file is left untouched, so re-running the export never
    overwrites an earlier result.

    Args:
        column: Name of the column to export alongside ``id``.
        frame: DataFrame to read from. Defaults to the notebook-level ``df``.
    """
    data_path = Path('output', f'email_nltk_{column}.pkl')
    if data_path.is_file():
        print('already saved')
        return

    source = df if frame is None else frame
    # to_pickle does not create missing folders; make sure the target exists first.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    print(f'saving {column} ...')
    source.loc[:, ['id', column]].to_pickle(data_path)

## wordpunct_tokenize

In [4]:
%%time
# One wordpunct token list per entry of the `text` column.
df['tkp'] = df['text'].map(get_punct_tokens)

CPU times: user 29.1 s, sys: 1.17 s, total: 30.3 s
Wall time: 30.4 s


## word_tokenize

In [6]:
%%time
# One word_tokenize token list per entry of the `text` column.
df['tkn'] = df['text'].map(get_tokens)

CPU times: user 6min 33s, sys: 3.63 s, total: 6min 37s
Wall time: 6min 39s


## Collocations and bigrams

In [5]:
%%time
# Lower-cased token frequencies for the wordpunct tokens of each row.
df['fq_tkp'] = df['tkp'].map(get_freqs)

CPU times: user 56.3 s, sys: 1.34 s, total: 57.7 s
Wall time: 58.1 s


In [7]:
%%time
# Lower-cased token frequencies for the word_tokenize tokens of each row.
df['fq_tkn'] = df['tkn'].map(get_freqs)

CPU times: user 50.4 s, sys: 1.47 s, total: 51.9 s
Wall time: 52.2 s


In [8]:
%%time
# List of consecutive token pairs built from the wordpunct tokens of each row.
df['bg_tkp'] = df['tkp'].map(get_bigrams)

CPU times: user 15.4 s, sys: 4.38 s, total: 19.8 s
Wall time: 20.5 s


In [9]:
%%time
# List of consecutive token pairs built from the word_tokenize tokens of each row.
df['bg_tkn'] = df['tkn'].map(get_bigrams)

CPU times: user 19 s, sys: 18 s, total: 37.1 s
Wall time: 42 s


In [10]:
%%time
# Count how often each bigram in `bg_tkp` occurs, one FreqDist per row.
# Unlike the `fq_tkp` counts, these keep the tokens' original casing.
df['fq_bg_tkp'] = df['bg_tkp'].apply(FreqDist)

CPU times: user 1min 5s, sys: 1min, total: 2min 5s
Wall time: 2min 28s


In [11]:
%%time
# Count how often each bigram in `bg_tkn` occurs, one FreqDist per row.
# Unlike the `fq_tkn` counts, these keep the tokens' original casing.
df['fq_bg_tkn'] = df['bg_tkn'].apply(FreqDist)

CPU times: user 46.7 s, sys: 8.31 s, total: 55 s
Wall time: 1min


In [31]:
%%time
# A collocation finder needs unigram and bigram counts over the same vocabulary.
# The stored `fq_tkp` counts are lower-cased while `fq_bg_tkp` keeps the original
# casing, so pairing them makes association measures look up words that are
# missing from the unigram counts. Build both distributions from the lower-cased
# tokens instead; the unigram side is unchanged, the bigram side now matches it.
df['cl_tkp'] = df['tkp'].apply(
    lambda tokens: BigramCollocationFinder.from_words([token.lower() for token in tokens])
)

CPU times: user 36.2 s, sys: 1min 44s, total: 2min 20s
Wall time: 3min 6s


In [42]:
%%time
# A collocation finder needs unigram and bigram counts over the same vocabulary.
# The stored `fq_tkn` counts are lower-cased while `fq_bg_tkn` keeps the original
# casing, so pairing them makes association measures look up words that are
# missing from the unigram counts. Build both distributions from the lower-cased
# tokens instead; the unigram side is unchanged, the bigram side now matches it.
df['cl_tkn'] = df['tkn'].apply(
    lambda tokens: BigramCollocationFinder.from_words([token.lower() for token in tokens])
)

CPU times: user 32.1 s, sys: 1min 17s, total: 1min 49s
Wall time: 2min 13s


In [None]:
# Remove the frequency, bigram and collocation columns from `df` in place.
df.drop(
    [
        'fq_tkp', 'fq_tkn',
        'bg_tkp', 'bg_tkn',
        'fq_bg_tkp', 'fq_bg_tkn',
        'cl_tkp', 'cl_tkn',
    ],
    axis='columns',
    inplace=True,
)

## Lemmas

In [53]:
# NOTE(review): this rebinds `wn`, which the import cell bound to the WordNet
# corpus (`from nltk.corpus import wordnet as wn`). From here on `wn` is the
# lemmatizer; the later cells call `wn.lemmatize`.
wn = nltk.WordNetLemmatizer()

In [54]:
%%time
# Lemmatize every wordpunct token. Only the `tkp` column is needed, so map over
# it directly: a row-wise `df.apply(axis=1)` builds a Series for every row and
# leaves the shape of list results to pandas' inference.
df['lm_wn_tkp'] = df['tkp'].apply(lambda tokens: [wn.lemmatize(token) for token in tokens])

CPU times: user 4min 38s, sys: 1min 38s, total: 6min 17s
Wall time: 6min 54s


In [55]:
%%time
# Lemmatize every word_tokenize token. Only the `tkn` column is needed, so map
# over it directly: a row-wise `df.apply(axis=1)` builds a Series for every row
# and leaves the shape of list results to pandas' inference.
df['lm_wn_tkn'] = df['tkn'].apply(lambda tokens: [wn.lemmatize(token) for token in tokens])

CPU times: user 3min 51s, sys: 7.94 s, total: 3min 59s
Wall time: 4min 4s


In [56]:
%%time
# Frequency distribution of the lemmas in `lm_wn_tkp`, one FreqDist per row (no lower-casing here).
df['fq_lm_wn_tkp'] = df['lm_wn_tkp'].apply(FreqDist)

CPU times: user 38.6 s, sys: 1.27 s, total: 39.9 s
Wall time: 41.7 s


In [57]:
%%time
# Frequency distribution of the lemmas in `lm_wn_tkn`, one FreqDist per row (no lower-casing here).
df['fq_lm_wn_tkn'] = df['lm_wn_tkn'].apply(FreqDist)

CPU times: user 1min 2s, sys: 2min 9s, total: 3min 12s
Wall time: 4min 1s


In [None]:
# Remove the lemma columns and their frequency distributions from `df` in place.
df.drop(
    [
        'lm_wn_tkp', 'lm_wn_tkn',
        'fq_lm_wn_tkp', 'fq_lm_wn_tkn',
    ],
    axis='columns',
    inplace=True,
)

## Stems

In [58]:
# Stemmers used by the cells below: Porter (`ps`) and English Snowball (`ss`).
ps, ss = nltk.PorterStemmer(), nltk.SnowballStemmer('english')

In [59]:
%%time
# Porter-stem every wordpunct token. Only the `tkp` column is needed, so map over
# it directly: a row-wise `df.apply(axis=1)` builds a Series for every row and
# leaves the shape of list results to pandas' inference.
df['st_ps_tkp'] = df['tkp'].apply(lambda tokens: [ps.stem(token) for token in tokens])

CPU times: user 14min 39s, sys: 8.81 s, total: 14min 47s
Wall time: 14min 52s


In [60]:
%%time
# Porter-stem every word_tokenize token. Only the `tkn` column is needed, so map
# over it directly: a row-wise `df.apply(axis=1)` builds a Series for every row
# and leaves the shape of list results to pandas' inference.
df['st_ps_tkn'] = df['tkn'].apply(lambda tokens: [ps.stem(token) for token in tokens])

CPU times: user 14min 29s, sys: 16.3 s, total: 14min 46s
Wall time: 15min 1s


In [61]:
%%time
# Frequency distribution of the Porter stems in `st_ps_tkp`, one FreqDist per row.
df['fq_st_ps_tkp'] = df['st_ps_tkp'].apply(FreqDist)

CPU times: user 44.4 s, sys: 6.07 s, total: 50.5 s
Wall time: 54 s


In [62]:
%%time
# Frequency distribution of the Porter stems in `st_ps_tkn`, one FreqDist per row.
df['fq_st_ps_tkn'] = df['st_ps_tkn'].apply(FreqDist)

CPU times: user 1min 20s, sys: 4min, total: 5min 20s
Wall time: 7min 3s


In [63]:
%%time
# Snowball-stem every wordpunct token. Only the `tkp` column is needed, so map
# over it directly: a row-wise `df.apply(axis=1)` builds a Series for every row
# and leaves the shape of list results to pandas' inference.
df['st_ss_tkp'] = df['tkp'].apply(lambda tokens: [ss.stem(token) for token in tokens])

CPU times: user 10min 49s, sys: 15.1 s, total: 11min 5s
Wall time: 11min 16s


In [64]:
%%time
# Snowball-stem every word_tokenize token. Only the `tkn` column is needed, so map
# over it directly: a row-wise `df.apply(axis=1)` builds a Series for every row
# and leaves the shape of list results to pandas' inference.
df['st_ss_tkn'] = df['tkn'].apply(lambda tokens: [ss.stem(token) for token in tokens])

CPU times: user 10min 12s, sys: 12.3 s, total: 10min 25s
Wall time: 10min 35s


In [65]:
%%time
# Frequency distribution of the Snowball stems in `st_ss_tkp`, one FreqDist per row.
df['fq_st_ss_tkp'] = df['st_ss_tkp'].apply(FreqDist)

CPU times: user 40.2 s, sys: 2.85 s, total: 43 s
Wall time: 44.7 s


In [66]:
%%time
# Frequency distribution of the Snowball stems in `st_ss_tkn`, one FreqDist per row.
df['fq_st_ss_tkn'] = df['st_ss_tkn'].apply(FreqDist)

CPU times: user 34.3 s, sys: 1.44 s, total: 35.8 s
Wall time: 37 s


In [None]:
# Remove the Porter/Snowball stem columns and their frequency distributions from `df` in place.
df.drop(
    [
        'st_ps_tkp', 'st_ps_tkn',
        'fq_st_ps_tkp', 'fq_st_ps_tkn',
        'st_ss_tkp', 'st_ss_tkn',
        'fq_st_ss_tkp', 'fq_st_ss_tkn',
    ],
    axis='columns',
    inplace=True,
)

## Export only the derived columns

In [67]:
# Export every derived column next to `id`, one pickle per column.
# The two source columns are skipped by name instead of by position, so the
# export no longer depends on `id` and `text` being the first two columns.
# Writing is delegated to `save_data`, which holds the skip-if-present logic,
# rather than repeating its body here.
for column in df.columns:
    if column in ('id', 'text'):
        continue
    save_data(column)

saving tkp ...
saving fq_tkp ...
saving tkn ...
saving fq_tkn ...
saving bg_tkp ...
saving bg_tkn ...
saving fq_bg_tkp ...
saving fq_bg_tkn ...
saving cl_tkp ...
saving cl_tkn ...
saving lm_wn_tkp ...
saving lm_wn_tkn ...
saving fq_lm_wn_tkp ...
saving fq_lm_wn_tkn ...
saving st_ps_tkp ...
saving st_ps_tkn ...
saving fq_st_ps_tkp ...
saving fq_st_ps_tkn ...
saving st_ss_tkp ...
saving st_ss_tkn ...
saving fq_st_ss_tkp ...
saving fq_st_ss_tkn ...
