# Converting parsers

In [1]:
import sys
import os
import re
import email
import pickle

import nltk
import spacy
import pyfreeling
import pandas as pd

from pathlib import Path
from collections import Counter

from dateutil import parser
from email_reply_parser import EmailReplyParser
# from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'spacy'

In [4]:
def process_emails(dir_list, **kwargs):
    """Parse a collection of e-mail files into a pandas DataFrame.

    Every file is parsed once with ``email_parser`` and the reply text is
    extracted from its payload with ``EmailReplyParser.parse_reply``.

    Args:
        dir_list: Iterable of paths to raw e-mail files. It is consumed in
            a single pass, so a generator (e.g. ``Path.rglob``) is fine.
        **kwargs: Optional column switches. Pass ``From=True``, ``To=True``
            and/or ``Date=True`` to add the ``from``, ``to`` and ``date``
            columns. A falsy value (e.g. ``From=False``) leaves the column
            out.

    Returns:
        A DataFrame with one row per input file and the columns ``id``,
        the requested optional columns (ordered ``from``, ``to``,
        ``date``) and ``text``.
    """
    # Test the flag's value, not just its presence, so From=False really
    # disables the column.
    with_from = bool(kwargs.get('From'))
    with_to = bool(kwargs.get('To'))
    with_date = bool(kwargs.get('Date'))

    # Keys are created up front so the column order does not depend on the
    # data and empty input still yields the expected columns.
    df_dict = {'id': []}
    if with_from:
        df_dict['from'] = []
    if with_to:
        df_dict['to'] = []
    if with_date:
        df_dict['date'] = []
    df_dict['text'] = []

    # Single pass: each message is parsed, used and dropped instead of
    # keeping every parsed message alive in a second list.
    for mail in dir_list:
        df_dict['id'].append(get_email_id(mail))
        mg = email_parser(mail)
        if with_from:
            df_dict['from'].append(mg['From'])
        if with_to:
            df_dict['to'].append(preprocess_recipients(mg['To']))
        if with_date:
            df_dict['date'].append(get_timestamp(mg['Date']))
        df_dict['text'].append(EmailReplyParser.parse_reply(mg.get_payload()))

    return pd.DataFrame(df_dict)


def read_message(directory):
    """Return the whole content of a text file as one string.

    Despite the parameter name, ``directory`` is the path of a single
    file. It is decoded as UTF-8 with ``errors='ignore'``, so bytes that
    are not valid UTF-8 are dropped rather than raising.
    """
    with open(directory, encoding='utf-8', errors='ignore') as handle:
        return handle.read()


def email_parser(input_file):
    """Read ``input_file`` and parse it into an ``email`` message object.

    The file is decoded as UTF-8 with undecodable bytes ignored, then
    handed to ``email.message_from_string``.
    """
    with open(input_file, encoding='utf-8', errors='ignore') as handle:
        raw_text = handle.read()

    return email.message_from_string(raw_text)


def get_email_id(input_file, n_skip=5):
    """Build an e-mail identifier from the tail of its file path.

    The path is converted to ``str`` and split on ``'/'``; the first
    ``n_skip`` pieces are discarded and the rest is joined back with
    ``'/'``. For an absolute path the first piece is the empty string in
    front of the leading slash, so it counts towards ``n_skip``.

    Args:
        input_file: Path of the e-mail file (``str`` or any object whose
            ``str()`` is a ``'/'``-separated path).
        n_skip: Number of leading path pieces to drop. Defaults to 5, the
            depth that was previously hard-coded, so existing calls behave
            as before; pass another value when the mail folder lives at a
            different depth.

    Returns:
        The remaining path as a string; empty when the path has
        ``n_skip`` pieces or fewer.
    """
    pieces = str(input_file).split('/')

    return '/'.join(pieces[n_skip:])


def preprocess_recipients(recipient):
    """Normalise a raw recipient header value.

    All whitespace is removed and the result is split on commas.

    Returns:
        ``None`` when ``recipient`` is ``None``; a list of strings when
        the header holds more than one comma-separated entry; otherwise
        the single entry as a plain string.
    """
    if recipient is None:
        return None

    addresses = re.sub(r'\s+', '', recipient).split(',')
    return addresses if len(addresses) > 1 else addresses[0]


def get_timestamp(input_string):
    """Convert a date string into a POSIX timestamp.

    Args:
        input_string: Date text to parse with ``dateutil.parser.parse``,
            or ``None``/empty when the message carries no date.

    Returns:
        The ``datetime.timestamp()`` value of the parsed date, or ``None``
        when ``input_string`` is ``None`` or empty.
    """
    # A message without a Date header yields None from mg['Date']; return
    # None instead of letting the parser raise, mirroring how
    # preprocess_recipients treats a missing header.
    if not input_string:
        return None

    return parser.parse(input_string).timestamp()


In [5]:
%%time
# Make sure FREELINGDIR points at a FreeLing installation. When the
# variable is unset, fall back to a platform default and say so on stderr.
if "FREELINGDIR" not in os.environ:
    # sys.platform is "win32" on every Windows build, 32- or 64-bit, so a
    # separate "win64" comparison can never match.
    if sys.platform == "win32":
        os.environ["FREELINGDIR"] = "C:\\Program Files"
    else:
        os.environ["FREELINGDIR"] = "/usr/local"
    print(
        "FREELINGDIR environment variable not defined, trying ",
        os.environ["FREELINGDIR"],
        file=sys.stderr,
    )

# The data files are expected under <FREELINGDIR>/share/freeling; stop
# here when that folder is missing.
_freeling_share = os.environ["FREELINGDIR"] + "/share/freeling"
if not os.path.exists(_freeling_share):
    print(
        "Folder",
        _freeling_share,
        "not found.\n" +
        "Please set FREELINGDIR environment variable to FreeLing installation directory",
        file=sys.stderr,
    )
    sys.exit(1)

# Root folder of the FreeLing data files; every path below is built from it.
DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

# Initialise FreeLing's locale handling with the "default" locale.
pyfreeling.util_init_locale("default")

# Language identification is disabled: the language is fixed to LANG below.
# la = pyfreeling.lang_ident(DATA + "common/lang_ident/ident-few.dat")

# Create the option set for the morphological analyzer (maco).
# Only the data file locations are set explicitly here.
# NOTE(review): the nine positional arguments presumably follow the order
# user map, punctuation, dictionary, affixes, compounds, multiwords,
# named entities, quantities, probabilities. The two empty strings line up
# with the modules switched off in set_active_options below (UserMap,
# CompoundAnalysis). Confirm against the FreeLing API.
LANG = 'en'
op = pyfreeling.maco_options(LANG)
op.set_data_files(
    "",
    DATA + "common/punct.dat",
    DATA + LANG + "/dicc.src",
    DATA + LANG + "/afixos.dat",
    "",
    DATA + LANG + "/locucions.dat",
    DATA + LANG + "/np.dat",
    DATA + LANG + "/quantities.dat",
    DATA + LANG + "/probabilitats.dat",
)

# Create the analyzers shared by the helper functions in later cells:
# tokenizer, sentence splitter (plus one open splitter session) and the
# morphological analyzer configured above.
tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
sid = sp.open_session()
mf = pyfreeling.maco(op)

# Select which morphological sub-modules are active in subsequent
# mf.analyze() calls.
mf.set_active_options(
    False,  # UserMap
    True,  # NumbersDetection
    True,  # PunctuationDetection
    True,  # DatesDetection
    True,  # DictionarySearch
    True,  # AffixAnalysis
    False,  # CompoundAnalysis
    True,  # RetokContractions
    True,  # MultiwordsDetection
    True,  # NERecognition
    True,  # QuantitiesDetection
    True  # ProbabilityAssignment
)

# Create the tagger, sense annotator and dependency parser.
# NOTE(review): the meaning of the tagger's trailing (True, 2) arguments
# is not visible here — check the hmm_tagger signature. `sen` and `dep`
# are not used by any cell visible in this notebook.
tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
dep = pyfreeling.dep_lstm(
    DATA + LANG + "/dep_lstm/params-en.dat")

CPU times: user 33.8 s, sys: 719 ms, total: 34.5 s
Wall time: 34 s


In [6]:
%%time
# Build the e-mail DataFrame from the raw maildir (sibling of the current
# working directory) only when no cached pickle exists; otherwise reuse
# the cache. Pass To=True, From=True, Date=True to process_emails to also
# extract those headers.
if not Path('enron_mails.p').is_file():
    emails_path = Path.cwd().parent / 'maildir'
    # Only names ending in a dot ("1.", "2.", ...) match this pattern.
    emails_list = emails_path.rglob('*.')
    df = process_emails(emails_list)
    df.to_pickle('enron_mails.p')
else:
    df = pd.read_pickle('enron_mails.p')

CPU times: user 304 ms, sys: 199 ms, total: 503 ms
Wall time: 491 ms


In [None]:
# Display the DataFrame loaded or built in the previous cell.
df

In [7]:
def get_forms(text):
    """Return the word forms FreeLing finds in ``text``.

    The text is tokenized with the notebook-level tokenizer ``tk`` and cut
    into sentences with the splitter ``sp`` on session ``sid``.

    Returns:
        A flat list holding ``get_form()`` of every word of every
        sentence, in reading order.
    """
    # NOTE(review): the trailing True is presumably the splitter's flush
    # flag — confirm against the FreeLing API.
    sentences = sp.split(sid, tk.tokenize(text), True)

    return [word.get_form() for sentence in sentences for word in sentence]


def get_lemmas(text):
    """Return the lemma of every word FreeLing finds in ``text``.

    The text is tokenized (``tk``), split into sentences (``sp`` on
    session ``sid``) and run through the morphological analyzer ``mf``
    before the lemmas are read.

    Returns:
        A flat list holding ``get_lemma()`` of every word of every
        sentence, in reading order.
    """
    sentences = sp.split(sid, tk.tokenize(text), True)
    analyzed = mf.analyze(sentences)

    return [word.get_lemma() for sentence in analyzed for word in sentence]


def get_pos(text):
    """Return the part-of-speech tag of every word FreeLing finds in ``text``.

    The text is tokenized (``tk``), split into sentences (``sp`` on
    session ``sid``), morphologically analyzed (``mf``) and finally
    tagged (``tg``).

    Returns:
        A flat list holding ``get_tag()`` of every word of every
        sentence, in reading order.
    """
    lw = tk.tokenize(text)
    ls = sp.split(sid, lw, True)
    # The tagger chooses among the analyses produced by the morphological
    # analyzer, so that step has to run first (get_lemmas already does).
    ls = mf.analyze(ls)
    ls = tg.analyze(ls)

    return [w.get_tag() for s in ls for w in s]


In [8]:
%%time
# Tokenize every e-mail body into its list of word forms. The splitter
# reports over-long sentences on the way (see the SPLITTER lines in the
# cell output).
df['forms'] = df['text'].apply(get_forms)

SPLITTER: Ridiculously long sentence between markers at token '%0D%0A' at input offset 7850.
SPLITTER: Ridiculously long sentence between markers at token 'of' at input offset 5439.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30033.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30810.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30595.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 32344.
SPLITTER: Ridiculously long sentence between markers at token 'on' at input offset 11217.
SPLITTER: Ridiculously long sentence between markers at token '25' at input offset 5801.
SPLITTER: Ridiculously long sentence between markers at token '25' at input offset 5801.
SPLITTER: Ridiculously long sentence between markers at token ',' at input offset 10885.
SPLITTER: Ridiculously long sentence between markers at token ',' at input offset 6483.
SPLITTER: Ridicul

In [9]:
%%time
# Lemmatize every e-mail body; each row gets the list returned by
# get_lemmas for its text.
df['lemmas'] = df['text'].apply(get_lemmas)

SPLITTER: Ridiculously long sentence between markers at token '%0D%0A' at input offset 7850.
SPLITTER: Ridiculously long sentence between markers at token 'of' at input offset 5439.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30033.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30810.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 30595.
SPLITTER: Ridiculously long sentence between markers at token '/' at input offset 32344.
SPLITTER: Ridiculously long sentence between markers at token 'on' at input offset 11217.
SPLITTER: Ridiculously long sentence between markers at token '25' at input offset 5801.
SPLITTER: Ridiculously long sentence between markers at token '25' at input offset 5801.
SPLITTER: Ridiculously long sentence between markers at token ',' at input offset 10885.
SPLITTER: Ridiculously long sentence between markers at token ',' at input offset 6483.
SPLITTER: Ridicul

In [None]:
# Save the DataFrame in its current state to its own pickle, separate from
# the 'enron_mails.p' cache read at load time.
df.to_pickle('emails_fl.p')

In [None]:
# Pick the text of the row labelled 3 as a sample message and display it.
M = df.loc[3,'text']
M

In [None]:
# Show the word forms extracted from the sample message.
get_forms(M)

In [None]:
# Show the lemmas extracted from the sample message.
get_lemmas(M)

In [None]:
# Release the splitter session obtained from sp.open_session(). Sessions
# are closed through the splitter; the session handle has no close()
# method of its own.
sp.close_session(sid)

In [None]:
# Print the form of every word, one per line.
# NOTE(review): in the cells visible here `ls` is only ever assigned as a
# local inside get_forms/get_lemmas/get_pos, so this cell appears to rely
# on a module-level `ls` from a cell that no longer exists — confirm
# before running.
for s in ls:
    for w in s:
        print(w.get_form())

In [None]:


def obtain_lemmas(self, text):
    """Collect form/lemma keys for every word in ``text``.

    ``text`` is processed line by line: each stripped line is tokenized
    (``self.tk``), split into sentences (``self.sp`` on ``self.sid``) and
    morphologically analyzed (``self.mf``). For every word a key of the
    form ``'<form>_Lemma_<lemma>'`` is passed to ``add_to_dict`` together
    with the result dictionary.

    Args:
        text: The plain string to analyze (not a message object).

    Returns:
        The dictionary filled in by ``add_to_dict``.
        NOTE(review): ``add_to_dict`` is not defined in the visible code;
        what it stores per key must be checked at its definition.
    """
    # Function-scope import: the notebook's import cell never imports io,
    # so io.StringIO would otherwise raise NameError.
    import io

    results = {}
    for lin in io.StringIO(text):
        lw = self.tk.tokenize(lin.strip())
        ls = self.sp.split(self.sid, lw, False)
        ls = self.mf.analyze(ls)
        for s in ls:
            for w in s.get_words():
                key = f'{w.get_form()}_Lemma_{w.get_lemma()}'
                add_to_dict(key, results)
    return results

def obtain_pos(self, text):
    """Collect form/part-of-speech keys for every word in ``text``.

    ``text`` is processed line by line: each stripped line is tokenized
    (``self.tk``), split into sentences (``self.sp`` on ``self.sid``),
    morphologically analyzed (``self.mf``) and tagged (``self.tg``). For
    every word a key of the form ``'<form>_PoS_<tag>'`` is passed to
    ``add_to_dict`` together with the result dictionary.

    Args:
        text: The plain string to analyze (not a message object).

    Returns:
        The dictionary filled in by ``add_to_dict``.
        NOTE(review): ``add_to_dict`` is not defined in the visible code;
        what it stores per key must be checked at its definition.
    """
    # Function-scope import: the notebook's import cell never imports io,
    # so io.StringIO would otherwise raise NameError.
    import io

    results = {}
    for lin in io.StringIO(text):
        lw = self.tk.tokenize(lin.strip())
        ls = self.sp.split(self.sid, lw, False)
        # The tagger chooses among the analyses produced by the
        # morphological analyzer, so that step has to run first
        # (obtain_lemmas already does).
        ls = self.mf.analyze(ls)
        ls = self.tg.analyze(ls)
        for s in ls:
            for w in s.get_words():
                key = f'{w.get_form()}_PoS_{w.get_tag()}'
                add_to_dict(key, results)
    return results

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed.
# NOTE(review): the train_test_split import is commented out in the import
# cell, so this cell raises NameError unless it is restored.
train, test = train_test_split(df, test_size=0.2, random_state=2022)

In [None]:
# Split the remaining 80% again: 40% of it becomes validation, leaving
# roughly 48% / 32% / 20% of all rows for train / val / test.
# NOTE(review): the train_test_split import is commented out in the import
# cell, so this cell raises NameError unless it is restored.
train, val = train_test_split(train, test_size=0.4, random_state=2022)

In [None]:
# Display the training split.
train

In [None]:
# Display the validation split.
val

In [None]:
# Display the test split.
test