# How the tokenizer and lemmatizer work

In [None]:
import sys
import os
import io

import pyfreeling

import os
from email.parser import Parser
import email.utils

import time

import re

In [None]:
# Root of the maildir tree; the loading cell further down walks it with
# os.walk and hands every file it finds to raw_parse.
basedir = "../maildir/lay-k"  # Change root dir to affect how many mails are touched
def messageIDtoSubject(mail_dict, messageID):
    """Return the subject of the mail stored under ``messageID``, spaces removed.

    Args:
        mail_dict: mapping from message id to a mail object that supports
            ``mail["subject"]``.
        messageID: key of the mail to look up.

    Returns:
        The subject string with every ``" "`` character stripped out.
    """
    subject = mail_dict[messageID]["subject"]
    return subject.replace(" ", "")


def raw_parse(inputfile, email_list):
    """Parse one mail file and append ``(timestamp, message)`` to ``email_list``.

    The file is read as UTF-8 (undecodable bytes are ignored) and parsed with
    ``email.parser.Parser``.  The timestamp is the POSIX time of the ``Date``
    header computed *with* its UTC offset, so mails written in different time
    zones sort in true chronological order.

    Args:
        inputfile: path of the file holding a single raw mail.
        email_list: list that receives the ``(timestamp, message)`` tuple.

    Files whose ``Date`` header is missing or cannot be parsed are reported on
    stderr and skipped, because they cannot be placed in the chronological
    ordering; previously such a file raised ``TypeError`` and aborted the run.
    """
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as f:
        data = f.read()
    parsedEmail = Parser().parsestr(data)

    date_header = parsedEmail["date"]
    date_tuple = email.utils.parsedate_tz(date_header) if date_header else None
    if date_tuple is None:
        print("Skipping", inputfile, "(missing or unparseable Date header)",
              file=sys.stderr)
        return

    # mktime_tz honours the UTC offset; time.mktime(parsedate(...)) would
    # silently reinterpret the wall-clock time in the local zone.
    timestamp = email.utils.mktime_tz(date_tuple)
    email_list.append((timestamp, parsedEmail))


def obtain_raw_threads(mail_dict, email_list):
    """Group mails into threads by subject.

    A mail whose space-stripped subject does not start with ``"Re:"`` is a
    thread root; a mail whose subject starts with ``"Re:"`` is a reply to the
    most recent root carrying the same subject once the leading ``"Re:"``
    prefixes are removed.  ``email_list`` is expected to be in chronological
    order, so a reply seen before its root is not attached to any thread.

    Args:
        mail_dict: dict filled in place with ``message-id -> mail`` for every
            mail in ``email_list``.
        email_list: sequence of ``(timestamp, mail)`` tuples.

    Returns:
        dict mapping the message id of each root that received at least one
        reply to the list of message ids of its replies, in input order.
    """
    subject_dict = {}           # root subject -> ids of the replies to it
    auxiliarRootStructure = {}  # root subject -> id of the latest root mail
    root_subjects = {}          # id of a root that was replied to -> subject

    for mail in email_list:
        actualEmail = mail[1]
        message_id = actualEmail["message-id"]
        mail_dict[message_id] = actualEmail

        raw_subject = actualEmail["subject"]
        if raw_subject is None:
            # No Subject header: the mail cannot be threaded by subject.
            continue
        true_subject = raw_subject.replace(" ", "")

        if not true_subject.startswith("Re:"):
            auxiliarRootStructure[true_subject] = message_id
            subject_dict.setdefault(true_subject, [])
            continue

        # Strip only the *leading* reply markers; a "Re:" in the middle of the
        # subject (e.g. "FW: Re: x") belongs to the root's own subject.
        new_subject = true_subject
        while new_subject.startswith("Re:"):
            new_subject = new_subject[len("Re:"):]
        if new_subject in subject_dict:
            subject_dict[new_subject].append(message_id)
            root_subjects[auxiliarRootStructure[new_subject]] = new_subject

    return {root: subject_dict[subject] for root, subject in root_subjects.items()}


def preprocess_recipients(recipient):
    """Normalise a comma-separated recipient header.

    All whitespace is removed before splitting on ``","``.

    Args:
        recipient: raw header value, or ``None`` when the header is absent.

    Returns:
        ``None`` for a ``None`` input, a list of addresses when there are at
        least two of them, otherwise the single address as a plain string.
    """
    if recipient is None:
        return None
    users = re.sub(r'\s+', '', recipient).split(',')
    return users if len(users) > 1 else users[0]


def obtain_base_features(mail):
    """Build the initial feature record of a mail.

    Args:
        mail: object supporting ``mail[header_name]`` lookups.

    Returns:
        A one-entry dict ``{message-id: {'from': ..., 'date': ...}}``.
    """
    base_features = {
        'from': mail['from'],
        # 'to': preprocess_recipients(mail['to']),
        'date': mail['date'],
    }
    return {mail['message-id']: base_features}

In [None]:
lang = 'en'

# When FREELINGDIR is not set, fall back to a per-platform default prefix and
# tell the user which one is being tried.
if "FREELINGDIR" not in os.environ:
    os.environ["FREELINGDIR"] = (
        "C:\\Program Files" if sys.platform in ("win32", "win64") else "/usr/local"
    )
    print(
        "FREELINGDIR environment variable not defined, trying ",
        os.environ["FREELINGDIR"],
        file=sys.stderr,
    )

# Abort early when the FreeLing data folder is not where it is expected.
freeling_share_dir = os.environ["FREELINGDIR"] + "/share/freeling"
if not os.path.exists(freeling_share_dir):
    print(
        "Folder",
        freeling_share_dir,
        "not found.\nPlease set FREELINGDIR environment variable to FreeLing installation directory",
        file=sys.stderr,
    )
    sys.exit(1)

# Location of FreeLing configuration files: every data file used below is
# addressed relative to this folder.
DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

# Initialise FreeLing's locale handling with the "default" locale.
pyfreeling.util_init_locale("default")

# create language detector. Used just to show it. Results are printed
# but ignored (after, it is assumed language is LANG)
# (Disabled: the language is fixed through `lang` instead.)
# la = pyfreeling.lang_ident(DATA + "common/lang_ident/ident-few.dat")

# Create the option set for the maco (morphological) analyzer.
# Default values are OK, except for the data files, which are set explicitly.
LANG = lang
op = pyfreeling.maco_options(LANG)
# The arguments are positional.  NOTE(review): judging by the file names the
# slots appear to be user map, punctuation, dictionary, affixes, compounds,
# multiwords, named entities, quantities and probabilities, with "" leaving a
# slot without a data file — confirm against the FreeLing documentation.
op.set_data_files(
    "",
    DATA + "common/punct.dat",
    DATA + LANG + "/dicc.src",
    DATA + LANG + "/afixos.dat",
    "",
    DATA + LANG + "/locucions.dat",
    DATA + LANG + "/np.dat",
    DATA + LANG + "/quantities.dat",
    DATA + LANG + "/probabilitats.dat",
)

# Create the analyzers shared by the rest of the notebook: tokenizer (tk),
# sentence splitter (sp) with one open session (sid), and the morphological
# analyzer (mf) configured through `op`.
tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
sid = sp.open_session()
mf = pyfreeling.maco(op)

# Activate the morpho modules to be used in the next call.  The flags are
# positional; the trailing comment on each line names the module it toggles.
mf.set_active_options(
    False, # UserMap
    True,  # NumbersDetection
    True,  # PunctuationDetection
    True,  # DatesDetection
    True,  # DictionarySearch
    True,  # AffixAnalysis
    False, # CompoundAnalysis
    True,  # RetokContractions
    True,  # MultiwordsDetection
    True,  # NERecognition
    True,  # QuantitiesDetection
    True,  # ProbabilityAssignment
)
# default: all created submodules are used

# Create tagger, sense annotator, and dependency parser.
tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
# NOTE(review): the parameter file name hard-codes "en" even though the folder
# is derived from LANG, so this only matches while LANG == "en".
dep = pyfreeling.dep_lstm(
    DATA + LANG + "/dep_lstm/params-en.dat")

In [None]:
def obtain_forms(text):
    """Count how often each word form occurs in a mail body.

    Args:
        text: the mail to analyse, either a message object exposing
            ``get_payload()`` or the body itself as a plain string.

    Returns:
        dict mapping each word form to its number of occurrences.

    The body is processed line by line with the module-level tokenizer ``tk``
    and splitter session ``sp``/``sid``.  Earlier versions ignored ``text`` and
    read the notebook global ``actualEmail``, and only counted the first
    sentence the splitter returned for a line.
    """
    payload = text if isinstance(text, str) else text.get_payload()
    results = {}
    for lin in io.StringIO(payload):
        if not lin.strip():
            continue
        lw = tk.tokenize(lin)
        # A single line can complete several sentences; count all of them.
        for sentence in sp.split(sid, lw, False):
            for w in sentence.get_words():
                add_to_dict(w.get_form(), results)
    return results

def obtain_lemmas(text):
    """Count ``form_lemma`` pairs occurring in a mail body.

    Args:
        text: the mail to analyse, either a message object exposing
            ``get_payload()`` or the body itself as a plain string.

    Returns:
        dict mapping ``"<form>_<lemma>"`` to its number of occurrences.

    Each non-blank line is tokenized (``tk``), split into sentences
    (``sp``/``sid``) and morphologically analysed (``mf``).  Earlier versions
    ignored ``text`` in favour of the notebook global ``actualEmail`` and only
    counted the first sentence returned for a line.
    """
    payload = text if isinstance(text, str) else text.get_payload()
    results = {}
    for lin in io.StringIO(payload):
        if not lin.strip():
            continue
        lw = tk.tokenize(lin)
        ls = sp.split(sid, lw, False)
        # A single line can complete several sentences; count all of them.
        for sentence in mf.analyze(ls):
            for w in sentence.get_words():
                add_to_dict(f'{w.get_form()}_{w.get_lemma()}', results)
    return results

def obtain_pos(text):
    """Count ``form_tag`` pairs occurring in a mail body.

    Args:
        text: the mail to analyse, either a message object exposing
            ``get_payload()`` or the body itself as a plain string.

    Returns:
        dict mapping ``"<form>_<tag>"`` to its number of occurrences.

    Each non-blank line goes through the tokenizer (``tk``), the splitter
    (``sp``/``sid``), the morphological analyzer (``mf``), the tagger (``tg``)
    and the sense annotator (``sen``), in that order.  Earlier versions ignored
    ``text`` in favour of the notebook global ``actualEmail`` and only counted
    the first sentence returned for a line.
    """
    payload = text if isinstance(text, str) else text.get_payload()
    results = {}
    for lin in io.StringIO(payload):
        if not lin.strip():
            continue
        lw = tk.tokenize(lin)
        ls = sp.split(sid, lw, False)
        ls = mf.analyze(ls)
        ls = tg.analyze(ls)
        ls = sen.analyze(ls)
        # A single line can complete several sentences; count all of them.
        for sentence in ls:
            for w in sentence.get_words():
                add_to_dict(f'{w.get_form()}_{w.get_tag()}', results)
    return results

def process(text, token=True, lemma=False, pos=False):
    """Extract the requested linguistic features from ``text``.

    Args:
        text: value forwarded unchanged to the extractor functions.
        token: include word-form counts under ``'forms'``.
        lemma: include form/lemma counts under ``'lemmas'``.
        pos: include form/tag counts under ``'PoS'``.

    Returns:
        dict with one entry per enabled feature, in the order
        forms, lemmas, PoS.
    """
    extractors = (
        (token, 'forms', obtain_forms),
        (lemma, 'lemmas', obtain_lemmas),
        (pos, 'PoS', obtain_pos),
    )
    return {name: extract(text) for enabled, name, extract in extractors if enabled}

In [None]:
def add_to_dict(key, feature_Dictionary):
    """Increment the counter stored under ``key`` in ``feature_Dictionary``.

    A key seen for the first time starts at 1.  The dictionary is modified in
    place and nothing is returned.
    """
    feature_Dictionary[key] = feature_Dictionary.get(key, 0) + 1


In [None]:
# Parse every file below `basedir`, order the mails by timestamp and group
# them into subject-based threads.
mail_dict, email_list = {}, []

for current_dir, _subdirs, file_names in os.walk(basedir):
    for file_name in file_names:
        raw_parse(os.path.join(current_dir, file_name), email_list)

# Each entry is (timestamp, message); sort on the timestamp only.
email_list.sort(key=lambda entry: entry[0])

pureThreads = obtain_raw_threads(mail_dict, email_list)

In [None]:
# Build the base feature record for the first mail only and print the result.
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    base_record = obtain_base_features(actualEmail)
    mailsWithFeatures[mail] = base_record
    print(mailsWithFeatures)

In [None]:
# Print the first entry of the sorted list: a (timestamp, message) tuple.
print(email_list[0])

In [None]:
# Display the record stored under `mail`, the loop variable left bound by the
# preceding cell.
mailsWithFeatures[mail]

In [None]:
# Step 1: tokenize every non-blank line of the current mail's payload,
# keeping one tokenizer result per line.
lw = [
    tk.tokenize(lin)
    for lin in io.StringIO(actualEmail.get_payload())
    if lin.strip()
]
#lw

In [None]:
# Step 2: run the sentence splitter over each line's tokens, in line order.
ls = [sp.split(sid, line_tokens, False) for line_tokens in lw]
#ls

In [None]:
# Step 3: collect the words of every sentence the splitter produced.
# A split result may hold more than one sentence; taking only element 0
# silently dropped the words of the remaining ones.
ws = [sentence.get_words() for s in ls for sentence in s]
#ws

In [None]:
# Collect the words of every sentence in the split results held in `ls`.
# A split result may hold more than one sentence; taking only element 0
# silently dropped the words of the remaining ones.
ws = [sentence.get_words() for s in ls for sentence in s]

In [None]:
# Display the form of the first word in the first collected word list.
ws[0][0].get_form()

In [None]:
# Step 4: flatten the word lists into one list of word forms.
keys = [word.get_form() for words in ws for word in words]
#keys

In [None]:
# Step 5: count the forms.  `results` only gets its 'forms' entry when there
# is at least one key to count.
forms = {}
results = {}
for form in keys:
    add_to_dict(form, forms)
if keys:
    results['forms'] = forms

In [None]:
# All steps in one loop: count the word forms of the current mail's payload.
results = {}
for lin in io.StringIO(actualEmail.get_payload()):
    if not lin.strip():
        continue
    lw = tk.tokenize(lin)
    ls = sp.split(sid, lw, False)
    # Count every sentence the splitter returns; using only ls[0] dropped the
    # words of any further sentence completed by the same line.
    for sentence in ls:
        for w in sentence.get_words():
            add_to_dict(w.get_form(), results)

In [None]:
# Display the counts gathered in `results` by the preceding cell.
results

In [None]:
# Base features plus word-form counts for the first mail.
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    message_id = actualEmail['message-id']
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    mailsWithFeatures[mail][message_id]['forms'] = obtain_forms(actualEmail)

In [None]:
# Display the record (base features and 'forms') stored under `mail`.
mailsWithFeatures[mail]

## Lemma

In [None]:
# Count "<form>_<lemma>" pairs of the current mail's payload: tokenize, split
# into sentences, then run the morphological analyzer.
results = {}
for lin in io.StringIO(actualEmail.get_payload()):
    if not lin.strip():
        continue
    lw = tk.tokenize(lin)
    ls = sp.split(sid, lw, False)
    ls = mf.analyze(ls)
    # Count every analysed sentence; using only ls[0] dropped the words of any
    # further sentence completed by the same line.
    for sentence in ls:
        for w in sentence.get_words():
            add_to_dict(f'{w.get_form()}_{w.get_lemma()}', results)

In [None]:
# Display the form/lemma counts gathered in `results` by the preceding cell.
results

In [None]:
# Base features plus form/lemma counts for the first mail.
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    message_id = actualEmail['message-id']
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    mailsWithFeatures[mail][message_id]['lemmas'] = obtain_lemmas(actualEmail)

In [None]:
# Display the record (base features and 'lemmas') stored under `mail`.
mailsWithFeatures[mail]

## PoS

In [None]:
# Base features plus form/tag counts for the first mail.
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    message_id = actualEmail['message-id']
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    mailsWithFeatures[mail][message_id]['PoS'] = obtain_pos(actualEmail)

In [None]:
# Display the record (base features and 'PoS') stored under `mail`.
mailsWithFeatures[mail]

# All

In [None]:
# Tokens: process() with its defaults adds only the 'forms' counts.
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    message_id = actualEmail['message-id']
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    mailsWithFeatures[mail][message_id].update(process(actualEmail))

In [None]:
# Display the record produced by process() with its default flags.
mailsWithFeatures[mail]

In [None]:
# Tokens + Lemmas + PoS: enable every extractor of process().
# `mail` and `actualEmail` stay bound afterwards and are reused by later cells.
mailsWithFeatures = {}
for mail in email_list[:1]:
    actualEmail = mail[1]
    message_id = actualEmail['message-id']
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    mailsWithFeatures[mail][message_id].update(
        process(actualEmail, lemma=True, pos=True)
    )

In [None]:
# Display the record holding base features, 'forms', 'lemmas' and 'PoS'.
mailsWithFeatures[mail]

In [None]:
# Release the splitter session that was opened with sp.open_session() in the
# setup cell.
sp.close_session(sid)

In [None]:
# Print the raw payload of the mail that was analysed, for visual comparison
# with the extracted features.
print(actualEmail.get_payload())