In [1]:
import sys
import os
import io

import pyfreeling

import os
from email.parser import Parser
import email.utils

import time

import re

In [2]:
# Root of the maildir tree to load: the loading cell walks this directory
# recursively and parses every file found beneath it.
basedir = "../maildir/lay-k"  # Change root dir to affect how many mails are touched

In [3]:
def messageIDtoSubject(mail_dict, messageID):
    """Return the subject stored for ``messageID`` with every space removed.

    Args:
        mail_dict: mapping of message-id to an object that supports
            ``["subject"]`` lookup.
        messageID: key to look up in ``mail_dict``.

    Returns:
        The subject string without any ``" "`` characters.

    Raises:
        KeyError: if ``messageID`` is not a key of ``mail_dict``.
    """
    stored_mail = mail_dict[messageID]
    subject = stored_mail["subject"]
    return subject.replace(" ", "")


def raw_parse(inputfile, email_list):
    """Parse one mail file and append ``(timestamp, message)`` to ``email_list``.

    The timestamp is derived from the ``Date`` header and honours its UTC
    offset, so mails sent from different time zones sort chronologically.
    (If the header carries no zone at all, ``mktime_tz`` falls back to the
    machine's local time.)

    Args:
        inputfile: path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are dropped.
        email_list: list that receives the ``(timestamp, parsed message)``
            tuple; mutated in place.

    Returns:
        None. Files whose ``Date`` header is missing or unparseable are
        reported on stderr and skipped, since they cannot be ordered.
    """
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as f:
        data = f.read()
    parsedEmail = Parser().parsestr(data)

    raw_date = parsedEmail["date"]
    # parsedate_tz keeps the UTC offset that parsedate would discard.
    date_tuple = email.utils.parsedate_tz(raw_date) if raw_date else None
    if date_tuple is None:
        print(
            "Skipping",
            inputfile,
            "- missing or unparseable Date header",
            file=sys.stderr,
        )
        return

    # float() keeps the timestamp type the same as time.mktime produced.
    timestamp = float(email.utils.mktime_tz(date_tuple))
    email_list.append((timestamp, parsedEmail))


def obtain_raw_threads(mail_dict, email_list):
    """Group reply mails under their root mail by matching subjects.

    Subjects are compared with all spaces removed. A mail whose subject does
    not start with ``"Re:"`` is a root; a mail that does is a reply, and it is
    attached to the root whose subject equals the reply subject with its
    leading ``"Re:"`` prefixes stripped. Only leading prefixes are stripped,
    so a root such as ``"FW: Re: x"`` is still matched by ``"Re: FW: Re: x"``.

    A reply is only attached if its root appears earlier in ``email_list``,
    so the list should be in chronological order.

    Args:
        mail_dict: dict that is filled, as a side effect, with
            ``message-id -> parsed message`` for every mail in ``email_list``.
            Existing entries with the same key are overwritten.
        email_list: sequence of tuples whose second item is the parsed
            message (supports ``["subject"]`` and ``["message-id"]``).

    Returns:
        dict mapping the message-id of each root that received at least one
        reply to the list of its replies' message-ids, in the order the
        replies appear in ``email_list``.
    """
    reply_prefix = "Re:"
    subject_dict = {}            # root subject -> message-ids of its replies
    auxiliarRootStructure = {}   # root subject -> message-id of the latest root
    threads = {}

    for entry in email_list:
        actualEmail = entry[1]
        message_id = actualEmail["message-id"]
        # Side effect callers may rely on: remember the raw message by id.
        mail_dict[message_id] = actualEmail

        # A missing Subject header yields None; treat it as an empty subject.
        true_subject = (actualEmail["subject"] or "").replace(" ", "")

        if not true_subject.startswith(reply_prefix):
            auxiliarRootStructure[true_subject] = message_id
            subject_dict.setdefault(true_subject, [])
            continue

        # Strip only the leading "Re:" markers ("Re:Re:x" -> "x"); a "Re:"
        # that is part of the root's own subject must survive.
        new_subject = true_subject
        while new_subject.startswith(reply_prefix):
            new_subject = new_subject[len(reply_prefix):]

        if new_subject in subject_dict:
            replies = subject_dict[new_subject]
            replies.append(message_id)
            # The list object is shared, so later replies show up here too.
            threads[auxiliarRootStructure[new_subject]] = replies

    return threads


def preprocess_recipients(recipient):
    """Normalize a recipient header value.

    All whitespace is removed and the value is split on commas.

    Args:
        recipient: header string, or ``None`` when the header is absent.

    Returns:
        ``None`` if ``recipient`` is ``None``; a list of addresses when the
        value contains more than one comma-separated entry; otherwise the
        single entry as a plain string.
    """
    if recipient is None:
        return None

    addresses = re.sub(r'\s+', '', recipient).split(',')
    # A lone recipient is returned bare rather than as a one-element list.
    return addresses if len(addresses) > 1 else addresses[0]


def obtain_features(email_list):
    """Build a per-message record of basic header fields.

    Args:
        email_list: iterable of ``(timestamp, parsed message)`` pairs; the
            timestamp is not used here.

    Returns:
        dict keyed by message-id. Each value holds the ``from`` header, the
        ``to`` header normalized by ``preprocess_recipients``, the ``date``
        header, and an empty ``features`` dict. If a message-id occurs more
        than once, the last occurrence wins.
    """
    return {
        message['message-id']: {
            'from': message['from'],
            'to': preprocess_recipients(message['to']),
            'date': message['date'],
            'features': {},
        }
        for _timestamp, message in email_list
    }

In [4]:

# Collect every file under `basedir` as a (timestamp, parsed message) pair.
email_list = []

for directory, _subdirectories, filenames in os.walk(basedir):
    for filename in filenames:
        raw_parse(os.path.join(directory, filename), email_list)
# Chronological order: obtain_raw_threads only attaches a reply to a root
# that it has already seen.
email_list.sort(key=lambda x: x[0])

mail_dict = obtain_features(email_list)

# obtain_raw_threads stores the raw parsed message under each message-id in
# the dict it is given. Hand it a separate dict so the feature records in
# `mail_dict` are not overwritten.
raw_mail_dict = {}
pureThreads = obtain_raw_threads(raw_mail_dict, email_list)



In [5]:
# Language whose FreeLing data files are loaded below.
lang = 'en'
# Check whether we know where to find FreeLing data files
if "FREELINGDIR" not in os.environ:
    if sys.platform == "win32" or sys.platform == "win64":
        os.environ["FREELINGDIR"] = "C:\\Program Files"
    else:
        os.environ["FREELINGDIR"] = "/usr/local"
    print(
        "FREELINGDIR environment variable not defined, trying ",
        os.environ["FREELINGDIR"],
        file=sys.stderr,
    )

if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
    print(
        "Folder",
        os.environ["FREELINGDIR"] + "/share/freeling",
        "not found.\n"
        + "Please set FREELINGDIR environment variable to FreeLing installation directory",
        file=sys.stderr,
    )
    sys.exit(1)

# Location of FreeLing configuration files.
DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

# Init locales
pyfreeling.util_init_locale("default")

# create options set for maco analyzer.
# Default values are Ok, except for data files.
LANG = lang
op = pyfreeling.maco_options(LANG)
op.set_data_files(
    "",
    DATA + "common/punct.dat",
    DATA + LANG + "/dicc.src",
    DATA + LANG + "/afixos.dat",
    "",
    DATA + LANG + "/locucions.dat",
    DATA + LANG + "/np.dat",
    DATA + LANG + "/quantities.dat",
    DATA + LANG + "/probabilitats.dat",
)

# create analyzers
tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
sid = sp.open_session()
mf = pyfreeling.maco(op)

# activate morpho modules to be used in next call
# NOTE(review): the flags are positional; presumably they follow FreeLing's
# maco set_active_options order (umap, num, pun, dat, dic, aff, comp, rtk,
# mw, ner, qt, prb) - confirm against the installed FreeLing version.
mf.set_active_options(
    False,
    True,
    True,
    True,  # select which among created
    True,
    True,
    False,
    True,  # submodules are to be used.
    True,
    True,
    True,
    True,
)
# default: all created submodules are used

# create tagger, sense anotator, and parsers
tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
# The parameter file name carries the language code, so derive it from LANG
# like every other data path instead of hard-coding "en".
dep = pyfreeling.dep_lstm(DATA + LANG + "/dep_lstm/params-" + LANG + ".dat")

In [13]:
# Body of the first mail in email_list (the earliest one, since the list was
# sorted by timestamp). get_payload() returns a str for a single-part message
# but a list of sub-messages for a multipart one; the cells below treat `msg`
# as text - assumes this mail is single-part, TODO confirm.
msg = email_list[0][1].get_payload()

In [14]:
# Show the raw body text exactly as stored in `msg`.
print(msg)

Letter dictated by Ken Lay



Hello Janice:

 I enjoyed your recent e-mail but was sorry to hear about your dad.  It 
sounds as though his health has deteriorated significantly.  These are always 
difficult times.  As I watched my mother and father's health deteriorate and 
ultimately watched them die, it is a very defining time in our lives.  But we 
can be very thankful to have such great parents and to have been privilege to 
be raised in such loving homes.

 Sounds as though Eric has done very well as SMSU.  He is joining an 
excellent company.  I am also delighted to hear that he will continue his 
education working toward an MBA.  As we are living in an age were 
intellectual capital is so valuable, it is important for every young person 
to obtain the very best possible education they can.  As to our family, 
within the last ten days our youngest daughter Elizabeth was married to a 
young man from Buenos Aires, Argentina.  They met while they were both 
working on a project as l

In [23]:
# Echo the message line by line, skipping lines that consist only of a newline.
# Each line keeps its own trailing "\n" and print() appends another, which is
# why the output appears double-spaced.
for lin in io.StringIO(msg):
    if lin == '\n':
        continue
    print(lin)

Letter dictated by Ken Lay

Hello Janice:

 I enjoyed your recent e-mail but was sorry to hear about your dad.  It 

sounds as though his health has deteriorated significantly.  These are always 

difficult times.  As I watched my mother and father's health deteriorate and 

ultimately watched them die, it is a very defining time in our lives.  But we 

can be very thankful to have such great parents and to have been privilege to 

be raised in such loving homes.

 Sounds as though Eric has done very well as SMSU.  He is joining an 

excellent company.  I am also delighted to hear that he will continue his 

education working toward an MBA.  As we are living in an age were 

intellectual capital is so valuable, it is important for every young person 

to obtain the very best possible education they can.  As to our family, 

within the last ten days our youngest daughter Elizabeth was married to a 

young man from Buenos Aires, Argentina.  They met while they were both 

working on a pr

In [33]:
# Run the FreeLing pipeline over the message, one non-blank line at a time,
# and print every word form next to its lemma.
for lin in io.StringIO(msg):
    if lin == '\n':
        continue  # nothing to analyze on an empty line

    l = tk.tokenize(lin)
    # NOTE(review): the last argument is presumably the splitter's "flush"
    # flag; with False, tokens seem to be buffered across lines (the output
    # shows entities spanning line breaks) - confirm against the FreeLing docs.
    ls = sp.split(sid, l, False)

    # morphological analysis -> tagging -> sense annotation -> dependency parsing
    for analyzer in (mf, tg, sen, dep):
        ls = analyzer.analyze(ls)

    # output results
    for s in ls:
        ws = s.get_words()
        for w in ws:
            print(w.get_form(), ' ', w.get_lemma())

cc   cc
:   :
Bonnie_Bourne_Sharon_Lay_Letter   bonnie_bourne_sharon_lay_letter
dictated   dictate
by   by
Ken_Lay_Hello_Janice   ken_lay_hello_janice
:   :
I   i
enjoyed   enjoy
your   your
recent   recent
e-mail   email
but   but
was   be
sorry   sorry
to   to
hear   hear
about   about
your   your
dad   dad
.   .
It   it
sounds   sound
as   as
though   though
his   his
health   health
has   have
deteriorated   deteriorate
significantly   significantly
.   .
These   these
are   be
always   always
difficult   difficult
times   time
.   .
As   as
I   i
watched   watch
my   my
mother   mother
and   and
father   father
's   's
health   health
deteriorate   deteriorate
and   and
ultimately   ultimately
watched   watch
them   them
die   die
,   ,
it   it
is   be
a   a
very   very
defining   define
time   time
in   in
our   our
lives   life
.   .
But   but
we   we
can   can
be   be
very   very
thankful   thankful
to   to
have   have
such   such
great   great
parents   parent
and   and
to   t

In [29]:
# print(ls)        
        # output results
        #for s in ls:
        #    ws = s.get_words()
        #    for w in ws:
        #        print(
        #            w.get_form()
        #           + " "
        #           + w.get_lemma()
        #           + " "
        #           + w.get_tag()
        #           + " "
        #           + w.get_senses_string()
        #       )
        #   print("")

In [None]:
class freeling_analyzer(object):
    """Wrapper around a FreeLing analysis pipeline for one language.

    Typical use: construct the object, call :meth:`setup` once to load the
    FreeLing modules, then call :meth:`process` for every text to analyze.

    Attributes (the analyzer handles stay ``None`` until :meth:`setup` runs):
        folder: stored as given; not read by any method of this class.
        lang: language code used to build the FreeLing data file paths.
        tk, sp, mf, tg, sen, dep: tokenizer, splitter, morphological
            analyzer, tagger, sense annotator and dependency parser.
        sid: splitter session id, or ``None`` when no session is open.
    """

    def __init__(self, folder, lang):
        self.folder = folder
        self.lang = lang
        self.tk = None
        self.sp = None
        self.sid = None
        self.mf = None
        self.tg = None
        self.sen = None
        self.dep = None

    # ------------  output a parse tree ------------
    def printTree(self, ptree, depth):
        """Print a parse tree recursively, indented two spaces per level.

        Leaves are printed as ``(form lemma tag)``; inner nodes as their
        label followed by a bracketed list of children. Head nodes are
        prefixed with ``+``.

        Args:
            ptree: tree object exposing ``begin()``.
            depth: current nesting level (0 for the root).
        """
        node = ptree.begin()

        print("".rjust(depth * 2), end="")
        info = node.get_info()
        if info.is_head():
            print("+", end="")

        nch = node.num_children()
        if nch == 0:
            w = info.get_word()
            print(
                "({0} {1} {2})".format(w.get_form(), w.get_lemma(), w.get_tag()), end=""
            )

        else:
            print("{0}_[".format(info.get_label()))

            for i in range(nch):
                child = node.nth_child_ref(i)
                self.printTree(child, depth + 1)

            print("".rjust(depth * 2), end="")
            print("]", end="")

        print("")

    # ------------  output a dependency tree ------------
    def printDepTree(self, dtree, depth):
        """Print a dependency tree recursively, indented two spaces per level.

        Each node is printed as ``label/(form lemma tag)``. Children that are
        not chunks are printed first, in child order; chunk children follow,
        sorted by their chunk ordinal.

        Args:
            dtree: dependency tree object exposing ``begin()``.
            depth: current nesting level (0 for the root).
        """
        node = dtree.begin()

        print("".rjust(depth * 2), end="")

        info = node.get_info()
        print(info.get_label() + "/", end="")

        w = node.get_info().get_word()
        print("({0} {1} {2})".format(w.get_form(), w.get_lemma(), w.get_tag()), end="")

        nch = node.num_children()
        if nch > 0:
            print(" [")

            for i in range(nch):
                d = node.nth_child_ref(i)
                if not d.begin().get_info().is_chunk():
                    self.printDepTree(d, depth + 1)

            # Collect chunk children keyed by ordinal so they print in order.
            ch = {}
            for i in range(nch):
                d = node.nth_child_ref(i)
                if d.begin().get_info().is_chunk():
                    ch[d.begin().get_info().get_chunk_ord()] = d

            for i in sorted(ch.keys()):
                self.printDepTree(ch[i], depth + 1)

            print("".rjust(depth * 2), end="")
            print("]", end="")

        print("")

    def setup(self):
        """Locate the FreeLing data files and create all analyzers.

        If ``FREELINGDIR`` is not set in the environment, a platform default
        is written to ``os.environ`` and reported on stderr. If the data
        folder does not exist, a message is printed to stderr and the
        process exits via ``sys.exit(1)``.

        On success ``tk``, ``sp``, ``mf``, ``tg``, ``sen`` and ``dep`` are
        populated and a splitter session is opened and stored in ``sid``.
        """
        # Check whether we know where to find FreeLing data files
        if "FREELINGDIR" not in os.environ:
            if sys.platform == "win32" or sys.platform == "win64":
                os.environ["FREELINGDIR"] = "C:\\Program Files"
            else:
                os.environ["FREELINGDIR"] = "/usr/local"
            print(
                "FREELINGDIR environment variable not defined, trying ",
                os.environ["FREELINGDIR"],
                file=sys.stderr,
            )

        if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
            print(
                "Folder",
                os.environ["FREELINGDIR"] + "/share/freeling",
                "not found.\n"
                + "Please set FREELINGDIR environment variable to FreeLing installation directory",
                file=sys.stderr,
            )
            sys.exit(1)

        # Location of FreeLing configuration files.
        DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

        # Init locales
        pyfreeling.util_init_locale("default")

        # create options set for maco analyzer.
        # Default values are Ok, except for data files.
        LANG = self.lang
        op = pyfreeling.maco_options(LANG)
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat",
            "",
            DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat",
            DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat",
        )

        # create analyzers
        self.tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
        self.sid = self.sp.open_session()
        self.mf = pyfreeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created
            True,
            True,
            False,
            True,  # submodules are to be used.
            True,
            True,
            True,
            True,
        )
        # default: all created submodules are used

        # create tagger, sense anotator, and parsers
        self.tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        self.sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
        # The parameter file name carries the language code, so derive it
        # from LANG like every other data path instead of hard-coding "en".
        self.dep = pyfreeling.dep_lstm(
            DATA + LANG + "/dep_lstm/params-" + LANG + ".dat"
        )

    def process(self, msg):
        """Analyze ``msg`` line by line and print the results.

        For every sentence produced by the pipeline, each word is printed as
        ``form lemma tag senses`` followed by the sentence's dependency tree.

        The splitter session is closed when the call finishes, even if the
        analysis raises. A later call opens a fresh session, so ``process``
        can be called repeatedly on the same object.

        Args:
            msg: text to analyze.

        Raises:
            RuntimeError: if :meth:`setup` has not been called yet.
        """
        if self.tk is None or self.sp is None:
            raise RuntimeError(
                "freeling_analyzer.setup() must be called before process()"
            )

        # A previous process() call closed its session; open a new one
        # instead of reusing the stale id.
        if self.sid is None:
            self.sid = self.sp.open_session()

        try:
            for lin in io.StringIO(msg):
                l = self.tk.tokenize(lin)
                ls = self.sp.split(self.sid, l, False)

                ls = self.mf.analyze(ls)
                ls = self.tg.analyze(ls)
                ls = self.sen.analyze(ls)
                ls = self.dep.analyze(ls)

                # output results
                for s in ls:
                    ws = s.get_words()
                    for w in ws:
                        print(
                            w.get_form()
                            + " "
                            + w.get_lemma()
                            + " "
                            + w.get_tag()
                            + " "
                            + w.get_senses_string()
                        )
                    print("")

                    dp = s.get_dep_tree()
                    self.printDepTree(dp, 0)
        finally:
            # clean up
            self.sp.close_session(self.sid)
            self.sid = None