# Pre-processing integration

In [10]:
import string

import nltk
import pandas as pd

from pathlib import Path
from itertools import groupby 

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures as bigram_measures
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

In [2]:
%%time
file = Path('/Volumes', 'tfm', 'raw', 'enron_mails.p')
df = pd.read_pickle(file)

NameError: name 'extreme_vals' is not defined

In [3]:
# Character count of every e-mail body; used below to spot abnormally long messages.
lengths = df['text'].map(len)
df['length'] = lengths
# Quartiles and inter-quartile range of the length distribution.
Q1 = lengths.quantile(0.25)
Q3 = lengths.quantile(0.75)
IQR = Q3 - Q1
# df.loc[df['length'] > (Q3 + 15 * IQR),:]
# Show the ten longest messages.
df.sort_values('length', ascending=False).head(10)

Unnamed: 0,id,text,length
132160,forney-j/sent_items/158.,<OMNI>\n<OMNINotes></OMNINotes>\n\n<OMNIPAB>PE...,1615558
480942,presto-k/sent_items/1103.,<OMNI>\n<OMNINotes></OMNINotes>\n\n<OMNIPAB>PE...,1371385
337694,quigley-d/sent_items/37.,<OMNI>\n<OMNINotes>\n <dbname>C:\Program Fi...,319984
80043,cash-m/general_research/17.,I am pleased to send you our web-based e-mail ...,215838
308907,dasovich-j/notes_inbox/5594.,=20\n\n\nTelecommunications Reports - January ...,212190
298166,dasovich-j/all_documents/8681.,=20\n\n\nTelecommunications Reports - January ...,212190
331903,crandell-s/sent_items/32.,Delivery Date\t BORDER 1\t S DEAL#\t S PRICE\t...,210672
293459,dasovich-j/all_documents/8394.,=20\n=20\n\nTelecommunications Reports - Janua...,208766
309367,dasovich-j/notes_inbox/5764.,=20\n=20\n\nTelecommunications Reports - Janua...,208766
296267,dasovich-j/all_documents/11847.,"Today's news, and some from the weekend, in th...",187214


In [4]:
# Index labels of the three longest rows in the listing above (the <OMNI> XML dumps).
extreme_vals = [132160, 480942, 337694]
# errors='ignore' keeps the cell re-runnable: once the rows are gone, a second
# execution is a no-op instead of raising KeyError.
df.drop(extreme_vals, inplace=True, errors='ignore')

## NLTK

In [14]:
# Cache for the stop-token set so the stop-word corpus is read once, not once per e-mail.
_STOP_TOKENS_CACHE = {}


def _english_stop_tokens():
    """Return the English stop words plus every single punctuation character."""
    if 'english' not in _STOP_TOKENS_CACHE:
        _STOP_TOKENS_CACHE['english'] = (
            frozenset(stopwords.words('english')) | frozenset(string.punctuation)
        )
    return _STOP_TOKENS_CACHE['english']


def get_tokens(text, wordpunct=False, stop_words=False):
    """Split ``text`` into a list of tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.
    wordpunct : bool, default False
        If True use ``wordpunct_tokenize``; otherwise use ``word_tokenize``.
    stop_words : bool, default False
        If True the text is lower-cased before tokenizing, and tokens that are
        English stop words or single ``string.punctuation`` characters are
        removed. Multi-character punctuation tokens (e.g. ``'!!!'``) are kept.
        If False the text is tokenized as-is, with no lower-casing.

    Returns
    -------
    list of str
    """
    tokenize = wordpunct_tokenize if wordpunct else word_tokenize

    if not stop_words:
        # No filtering requested: the stop-word corpus is never touched.
        return tokenize(text)

    stop = _english_stop_tokens()
    return [w for w in tokenize(text.lower()) if w not in stop]
            

# def get_punct_tokens(text, w):
#     tokens = wordpunct_tokenize(text)
    
#     return tokens


# def get_tokens(text):
#     tokens = word_tokenize(text)
    
#     return tokens


def get_bigrams(text):
    """Return the consecutive token pairs of ``text`` as a list.

    ``text`` is passed straight to ``nltk.bigrams``; its lazy result is
    materialised into a list.
    """
    return list(nltk.bigrams(text))


def get_freqs(text):
    """Count how often each token occurs in ``text``, ignoring case.

    Every token is lower-cased before counting; the counts are returned as a
    ``FreqDist``.
    """
    lowered_tokens = (token.lower() for token in text)
    return FreqDist(lowered_tokens)


def save_data(column):
    """Pickle the ``id`` column and ``column`` of the global ``df``.

    The target is ``output/email_nltk_<column>.pkl``. If that file already
    exists nothing is written, so re-running a save cell never overwrites an
    earlier result.

    Parameters
    ----------
    column : str
        Name of the ``df`` column to persist next to ``id``.
    """
    data_path = Path('output', f'email_nltk_{column}.pkl')
    if data_path.is_file():
        print('already saved')
    else:
        print(f'saving {column} ...')
        # to_pickle does not create missing directories; make sure 'output' exists.
        data_path.parent.mkdir(parents=True, exist_ok=True)
        df.loc[:,['id', column]].to_pickle(data_path)

In [15]:
# Preview on the first ten e-mails: wordpunct tokenizer with stop-word filtering.
df['text'].head(10).apply(get_tokens, wordpunct=True, stop_words=True)

0    [----------------------, forwarded, maria, san...
1    [----------------------, forwarded, judy, hern...
2    [----------------------, forwarded, judy, hern...
3                        [received, message, kat, !!!]
4                                             [fyi, .]
5    [nutcracker, tickets, fabulous, seats, final, ...
6    [content, -, transfer, -, encoding, :, quoted,...
7    [----------------------, forwarded, eve, pucke...
8    [daily, blessing, http, ://, www, ., daily, -,...
9    [----------------------, forwarded, judy, hern...
Name: text, dtype: object

## Tokenizing with wordpunct_tokenize

In [None]:
%%time
df['tkp'] = df['text'].apply(get_punct_tokens)

## Tokenizing with word_tokenize

In [None]:
%%time
df['tkn'] = df['text'].apply(get_tokens)

## Collocations and bigrams

In [None]:
%%time
df['fq_tkp'] = df['tkp'].apply(get_freqs)

In [None]:
%%time
df['fq_tkn'] = df['tkn'].apply(get_freqs)

In [None]:
%%time
df['bg_tkp'] = df['tkp'].apply(get_bigrams)

In [None]:
%%time
df['bg_tkn'] = df['tkn'].apply(get_bigrams)

In [None]:
%%time
df['fq_bg_tkp'] = df['bg_tkp'].apply(FreqDist)

In [None]:
%%time
df['fq_bg_tkn'] = df['bg_tkn'].apply(FreqDist)

In [None]:
%%time
df['cl_tkp'] = df.apply(lambda row: BigramCollocationFinder(row.fq_tkp, row.fq_bg_tkp), axis=1)

In [None]:
%%time
df['cl_tkn'] = df.apply(lambda row: BigramCollocationFinder(row.fq_tkn, row.fq_bg_tkn), axis=1)

In [None]:
cols = ['fq_tkp',
        'fq_tkn',
        'bg_tkp',
        'bg_tkn',
        'fq_bg_tkp',
        'fq_bg_tkn',
        'cl_tkp',
        'cl_tkn']
for col in cols:
    save_data(col)

In [None]:
df.drop(columns=cols,inplace=True)

## Lemmas

In [None]:
wn = nltk.WordNetLemmatizer()

In [None]:
%%time
df['lm_wn_tkp'] = df.apply(lambda row: [wn.lemmatize(word) for word in row.tkp], axis=1)

In [None]:
%%time
df['lm_wn_tkn'] = df.apply(lambda row: [wn.lemmatize(word) for word in row.tkn], axis=1)

In [None]:
%%time
df['fq_lm_wn_tkp'] = df['lm_wn_tkp'].apply(FreqDist)

In [None]:
%%time
df['fq_lm_wn_tkn'] = df['lm_wn_tkn'].apply(FreqDist)

In [None]:
cols = ['lm_wn_tkp',
        'lm_wn_tkn',
        'fq_lm_wn_tkp',
        'fq_lm_wn_tkn']
for col in cols:
    save_data(col)

In [None]:
df.drop(columns=cols, inplace=True)

## Stems

In [None]:
ps = nltk.PorterStemmer()
ss = nltk.SnowballStemmer(language = 'english')

In [None]:
%%time
df['st_ps_tkp'] = df.apply(lambda row: [ps.stem(word) for word in row.tkp], axis=1)

In [None]:
%%time
df['st_ps_tkn'] = df.apply(lambda row: [ps.stem(word) for word in row.tkn], axis=1)

In [None]:
%%time
df['fq_st_ps_tkp'] = df['st_ps_tkp'].apply(FreqDist)

In [None]:
%%time
df['fq_st_ps_tkn'] = df['st_ps_tkn'].apply(FreqDist)

In [None]:
%%time
df['st_ss_tkp'] = df.apply(lambda row: [ss.stem(word) for word in row.tkp], axis=1)

In [None]:
%%time
df['st_ss_tkn'] = df.apply(lambda row: [ss.stem(word) for word in row.tkn], axis=1)

In [None]:
%%time
df['fq_st_ss_tkp'] = df['st_ss_tkp'].apply(FreqDist)

In [None]:
%%time
df['fq_st_ss_tkn'] = df['st_ss_tkn'].apply(FreqDist)

In [None]:
cols = ['st_ps_tkp',
        'st_ps_tkn',
        'fq_st_ps_tkp',
        'fq_st_ps_tkn',
        'st_ss_tkp',
        'st_ss_tkn',
        'fq_st_ss_tkp',
        'fq_st_ss_tkn']
for col in cols:
    save_data(col)

In [None]:
df.drop(columns=cols, inplace=True)