# Converting parsers

In [None]:
import nltk
import pandas as pd

from pathlib import Path

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures as bigram_measures
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, wordpunct_tokenize


# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [None]:
%%time
# Load the pickled e-mail data; the cells below expect a DataFrame that has
# at least the 'id' and 'text' columns.
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only point this at a file from a trusted source.
file = Path('input') / 'enron_mails.p'
df = pd.read_pickle(file)

In [None]:
def get_punct_tokens(text):
    """Return the tokens NLTK's ``wordpunct_tokenize`` produces for *text*."""
    return wordpunct_tokenize(text)


def get_tokens(text):
    """Return the tokens NLTK's ``word_tokenize`` produces for *text*."""
    return word_tokenize(text)


def get_bigrams(text):
    """Return the bigrams of *text* as a list.

    Despite the parameter name, the notebook passes token lists (the ``tkp``
    and ``tkn`` columns) rather than raw strings. ``nltk.bigrams`` yields the
    pairs lazily, so they are materialised into a list before returning.
    """
    pairs = nltk.bigrams(text)
    return list(pairs)


def get_freqs(text):
    """Return a ``FreqDist`` counting the lower-cased items of *text*.

    Every item is passed through ``.lower()`` before counting, so counts are
    case-insensitive. The notebook calls this with token lists.
    """
    lowered = (token.lower() for token in text)
    return FreqDist(lowered)


def save_data(column):
    """Pickle the ``id`` column together with *column* of the global ``df``.

    The target file is ``output/email_nltk_<column>.pkl``. If that file
    already exists it is left untouched and a message is printed instead.
    The ``output`` directory is created on demand, so the first export in a
    fresh working directory no longer fails because the folder is missing.

    Args:
        column: Name of the ``df`` column to export alongside ``id``.
    """
    data_path = Path('output', f'email_nltk_{column}.pkl')
    if data_path.is_file():
        print('already saved')
        return

    print(f'saving {column} ...')
    subset = df.loc[:, ['id', column]]
    # to_pickle does not create missing parent directories itself.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    subset.to_pickle(data_path)

## wordpunct_tokenize

In [None]:
%%time
# Tokenise each e-mail body with wordpunct_tokenize (via get_punct_tokens).
df['tkp'] = df['text'].map(get_punct_tokens)

## word_tokenize

In [None]:
%%time
# Tokenise each e-mail body with word_tokenize (via get_tokens).
df['tkn'] = df['text'].map(get_tokens)

## Frequencies, bigrams and collocations

In [None]:
%%time
# Case-insensitive token frequencies for the wordpunct tokens.
df['fq_tkp'] = df['tkp'].map(get_freqs)

In [None]:
%%time
# Case-insensitive token frequencies for the word_tokenize tokens.
df['fq_tkn'] = df['tkn'].map(get_freqs)

In [None]:
%%time
# Bigram list for every row of wordpunct tokens.
df['bg_tkp'] = df['tkp'].map(get_bigrams)

In [None]:
%%time
# Bigram list for every row of word_tokenize tokens.
df['bg_tkn'] = df['tkn'].map(get_bigrams)

In [None]:
%%time
# Frequency distribution over the wordpunct bigrams (case is kept as-is).
df['fq_bg_tkp'] = df['bg_tkp'].map(FreqDist)

In [None]:
%%time
# Frequency distribution over the word_tokenize bigrams (case is kept as-is).
df['fq_bg_tkn'] = df['bg_tkn'].map(FreqDist)

In [None]:
%%time
# Build the finder from lower-cased tokens so its unigram and bigram counts
# share one vocabulary. Pairing the lower-cased fq_tkp with the case-sensitive
# fq_bg_tkp made every bigram containing an upper-case letter look up a wrong
# (often zero) word count when scored.
df['cl_tkp'] = df['tkp'].apply(
    lambda tokens: BigramCollocationFinder.from_words(token.lower() for token in tokens)
)

In [None]:
%%time
# Build the finder from lower-cased tokens so its unigram and bigram counts
# share one vocabulary. Pairing the lower-cased fq_tkn with the case-sensitive
# fq_bg_tkn made every bigram containing an upper-case letter look up a wrong
# (often zero) word count when scored.
df['cl_tkn'] = df['tkn'].apply(
    lambda tokens: BigramCollocationFinder.from_words(token.lower() for token in tokens)
)

In [None]:
# Persist every frequency / bigram / collocation column, one pickle per column.
cols = [
    'fq_tkp', 'fq_tkn',
    'bg_tkp', 'bg_tkn',
    'fq_bg_tkp', 'fq_bg_tkn',
    'cl_tkp', 'cl_tkn',
]
for col in cols:
    save_data(col)

In [None]:
# The exported columns are no longer needed in memory; remove them in place.
df.drop(labels=cols, axis='columns', inplace=True)

## Lemmas

In [None]:
wn = nltk.WordNetLemmatizer()

In [None]:
%%time
df['lm_wn_tkp'] = df.apply(lambda row: [wn.lemmatize(word) for word in row.tkp], axis=1)

In [None]:
%%time
df['lm_wn_tkn'] = df.apply(lambda row: [wn.lemmatize(word) for word in row.tkn], axis=1)

In [None]:
%%time
# Frequency distribution over the lemmatised wordpunct tokens.
df['fq_lm_wn_tkp'] = df['lm_wn_tkp'].map(FreqDist)

In [None]:
%%time
# Frequency distribution over the lemmatised word_tokenize tokens.
df['fq_lm_wn_tkn'] = df['lm_wn_tkn'].map(FreqDist)

In [None]:
# Persist the lemma columns and their frequency distributions.
cols = [
    'lm_wn_tkp', 'lm_wn_tkn',
    'fq_lm_wn_tkp', 'fq_lm_wn_tkn',
]
for col in cols:
    save_data(col)

In [None]:
# The exported lemma columns are no longer needed in memory; remove them in place.
df.drop(labels=cols, axis='columns', inplace=True)

## Stems

In [None]:
# Two stemmers are applied to both token columns below:
# ps -> Porter stemmer, ss -> Snowball stemmer configured for English.
ps = nltk.PorterStemmer()
ss = nltk.SnowballStemmer('english')

In [None]:
%%time
# Porter-stem the wordpunct tokens. Map over the token column directly; a
# row-wise DataFrame.apply builds a Series for every row only to read it.
df['st_ps_tkp'] = df['tkp'].apply(lambda tokens: [ps.stem(token) for token in tokens])

In [None]:
%%time
# Porter-stem the word_tokenize tokens. Map over the token column directly; a
# row-wise DataFrame.apply builds a Series for every row only to read it.
df['st_ps_tkn'] = df['tkn'].apply(lambda tokens: [ps.stem(token) for token in tokens])

In [None]:
%%time
# Frequency distribution over the Porter stems of the wordpunct tokens.
df['fq_st_ps_tkp'] = df['st_ps_tkp'].map(FreqDist)

In [None]:
%%time
# Frequency distribution over the Porter stems of the word_tokenize tokens.
df['fq_st_ps_tkn'] = df['st_ps_tkn'].map(FreqDist)

In [None]:
%%time
# Snowball-stem the wordpunct tokens. Map over the token column directly; a
# row-wise DataFrame.apply builds a Series for every row only to read it.
df['st_ss_tkp'] = df['tkp'].apply(lambda tokens: [ss.stem(token) for token in tokens])

In [None]:
%%time
# Snowball-stem the word_tokenize tokens. Map over the token column directly;
# a row-wise DataFrame.apply builds a Series for every row only to read it.
df['st_ss_tkn'] = df['tkn'].apply(lambda tokens: [ss.stem(token) for token in tokens])

In [None]:
%%time
# Frequency distribution over the Snowball stems of the wordpunct tokens.
df['fq_st_ss_tkp'] = df['st_ss_tkp'].map(FreqDist)

In [None]:
%%time
# Frequency distribution over the Snowball stems of the word_tokenize tokens.
df['fq_st_ss_tkn'] = df['st_ss_tkn'].map(FreqDist)

In [None]:
# Persist the stem columns (Porter, then Snowball) and their frequency distributions.
cols = [
    'st_ps_tkp', 'st_ps_tkn',
    'fq_st_ps_tkp', 'fq_st_ps_tkn',
    'st_ss_tkp', 'st_ss_tkn',
    'fq_st_ss_tkp', 'fq_st_ss_tkn',
]
for col in cols:
    save_data(col)

In [None]:
# The exported stem columns are no longer needed in memory; remove them in place.
df.drop(labels=cols, axis='columns', inplace=True)

## Taggers and PoS

## Export only 

In [None]:
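# Disabled: bulk export of every column from the third one onwards. It repeats
# the body of save_data(), which the sections above call per column instead.
# for column in df.columns[2:]:
#     data_path = Path('output', f'email_nltk_{column}.pkl')
#     if data_path.is_file():
#         print('already saved')
#     else:
#         print(f'saving {column} ...')
#         df.loc[:,['id', column]].to_pickle(data_path)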