In [None]:
import sys
import os
import io

import pyfreeling

import os
from email.parser import Parser
import email.utils

import time

import re

In [None]:
# Root of the maildir tree to analyse: every file found below it is parsed as one e-mail.
basedir = "../maildir/lay-k"  # Change root dir to affect how many mails are touched

In [None]:
def messageIDtoSubject(mail_dict, messageID):
    """Return the subject of the mail stored under ``messageID`` with all spaces removed.

    Args:
        mail_dict: mapping from message-id to a mail object that supports
            ``mail["subject"]`` lookup.
        messageID: key of the mail whose subject is wanted.

    Returns:
        The subject string with every ``" "`` character stripped out.

    Raises:
        KeyError: if ``messageID`` is not a key of ``mail_dict``.
    """
    message = mail_dict[messageID]
    subject = message["subject"]
    return subject.replace(" ", "")


def raw_parse(inputfile, email_list):
    """Parse one e-mail file and append ``(timestamp, message)`` to ``email_list``.

    The file is read as UTF-8 (undecodable bytes are dropped) and parsed with
    ``email.parser.Parser``. The timestamp is the ``Date`` header converted to
    seconds since the epoch *honouring its UTC offset*, so mails sent from
    different time zones sort in true chronological order.

    A mail whose ``Date`` header is missing or unparseable is reported on
    stderr and skipped, instead of aborting the whole directory walk with a
    ``TypeError``.

    Args:
        inputfile: path of the file holding a single RFC 2822 message.
        email_list: list mutated in place; receives ``(float, Message)`` tuples.

    Returns:
        None.
    """
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as f:
        data = f.read()
    parsedEmail = Parser().parsestr(data)

    raw_date = parsedEmail["date"]
    date_tuple = email.utils.parsedate_tz(raw_date) if raw_date else None
    if date_tuple is None:
        print(
            "Skipping", inputfile, "- missing or unparseable Date header",
            file=sys.stderr,
        )
        return

    # mktime_tz applies the UTC offset carried by the header; when the header
    # has no offset it falls back to interpreting the date as local time.
    timestamp = float(email.utils.mktime_tz(date_tuple))
    email_list.append((timestamp, parsedEmail))


def obtain_raw_threads(mail_dict, email_list):
    """Group replies under their root mail by comparing space-stripped subjects.

    Every mail is first registered in ``mail_dict`` under its message-id.
    A mail whose subject (spaces removed) does not start with ``"Re:"`` is a
    *root*; if several roots share a subject the most recent one is remembered
    for that subject. A mail whose subject starts with ``"Re:"`` is a *reply*:
    all ``"Re:"`` occurrences are removed and, if a root with the resulting
    subject has already been seen, the reply is attached to it. Replies without
    a previously seen root are ignored.

    A mail without a Subject header is treated as having an empty subject
    rather than raising ``AttributeError``.

    NOTE: only the exact spelling ``"Re:"`` is recognised; ``"RE:"`` or
    ``"re:"`` subjects are treated as roots.

    Args:
        mail_dict: dict mutated in place, message-id -> mail object.
        email_list: iterable of tuples whose second item is the mail object
            (supports ``mail["subject"]`` and ``mail["message-id"]``). Order
            matters: a root must precede its replies.

    Returns:
        dict mapping the message-id of each root that received at least one
        reply to the list of its replies' message-ids (in arrival order).
        Roots that share a subject share the same list object.
    """
    replies_by_subject = {}   # root subject -> message-ids of replies
    root_by_subject = {}      # root subject -> message-id of latest root seen
    subject_by_root = {}      # root message-id -> its subject
    roots_with_replies = set()

    for entry in email_list:
        message = entry[1]
        message_id = message["message-id"]
        mail_dict[message_id] = message
        # A missing Subject header yields None; treat it as an empty subject.
        subject = (message["subject"] or "").replace(" ", "")

        if not subject.startswith("Re:"):
            root_by_subject[subject] = message_id
            subject_by_root[message_id] = subject
            replies_by_subject.setdefault(subject, [])
            continue

        root_subject = subject.replace("Re:", "")
        if root_subject in root_by_subject:
            replies_by_subject[root_subject].append(message_id)
            roots_with_replies.add(root_by_subject[root_subject])

    return {
        root_id: replies_by_subject[subject_by_root[root_id]]
        for root_id in roots_with_replies
    }


def preprocess_recipients(recipient):
    """Normalise a comma-separated recipient header.

    All whitespace is removed and the string is split on commas.

    Args:
        recipient: raw header value, or ``None`` when the header is absent.

    Returns:
        ``None`` if ``recipient`` is ``None``; a list of addresses when there
        is more than one; otherwise the single address as a plain string.
    """
    if recipient is None:
        return None
    addresses = re.sub(r'\s+', '', recipient).split(',')
    return addresses if len(addresses) > 1 else addresses[0]


def obtain_base_features(mail):
    """Build the header-level feature record of one mail.

    Args:
        mail: mail object supporting ``mail['from']``, ``mail['date']`` and
            ``mail['message-id']`` lookups.

    Returns:
        A one-entry dict ``{message-id: {'from': ..., 'date': ...}}``.
    """
    header_features = {
        'from': mail['from'],
        # 'to': preprocess_recipients(mail['to']),
        'date': mail['date'],
    }
    return {mail['message-id']: header_features}

In [None]:
# Language code passed to FreelingAnalyzer, which uses it to build the paths of its data files.
lang = 'en'

In [None]:

def add_to_dict(key, feature_Dictionary):
    """Increment the counter stored under ``key`` in ``feature_Dictionary``.

    Unseen keys start at 1. The dictionary is mutated in place and nothing
    is returned.
    """
    feature_Dictionary[key] = feature_Dictionary.get(key, 0) + 1
    

In [None]:
class FreelingAnalyzer(object):
    """Thin wrapper that builds a FreeLing pipeline and counts word forms of a mail.

    NOTE: a later cell of this notebook defines another class with the same
    name; once that cell has run it replaces this definition.
    """

    def __init__(self, folder, lang):
        """Store the settings and build all FreeLing modules via ``setup()``.

        Args:
            folder: kept on the instance; not read by any method of this class.
            lang: language code (e.g. ``'en'``) used to build data-file paths.
        """
        self.folder = folder
        self.lang = lang
        self.tk = None    # tokenizer
        self.sp = None    # sentence splitter
        self.sid = None   # splitter session id
        self.mf = None    # morphological analyzer (maco)
        self.tg = None    # HMM tagger
        self.sen = None   # sense annotator
        # self.parser = None
        self.dep = None   # dependency parser
        self.setup()

    def setup(self):
        """Locate the FreeLing data files and instantiate every module.

        ``FREELINGDIR`` is read from the environment; when unset a
        platform-dependent default is written into ``os.environ`` and a
        notice is printed to stderr.

        Raises:
            SystemExit: (exit code 1) when ``<FREELINGDIR>/share/freeling``
                does not exist.
        """
        # Check whether we know where to find FreeLing data files
        if "FREELINGDIR" not in os.environ:
            if sys.platform == "win32" or sys.platform == "win64":
                os.environ["FREELINGDIR"] = "C:\\Program Files"
            else:
                os.environ["FREELINGDIR"] = "/usr/local"
            print(
                "FREELINGDIR environment variable not defined, trying ",
                os.environ["FREELINGDIR"],
                file=sys.stderr,
            )

        if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
            print(
                "Folder",
                os.environ["FREELINGDIR"] + "/share/freeling",
                "not found.\n" +
                "Please set FREELINGDIR environment variable to FreeLing installation directory",
                file=sys.stderr,
            )
            sys.exit(1)

        # Location of FreeLing configuration files.
        DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

        # Init locales
        pyfreeling.util_init_locale("default")

        # create options set for maco analyzer.
        # Default values are Ok, except for data files.
        LANG = self.lang
        op = pyfreeling.maco_options(LANG)
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat",
            "",
            DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat",
            DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat",
        )

        # create analyzers
        self.tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
        self.sid = self.sp.open_session()
        self.mf = pyfreeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False, # UserMap
            True,  # NumbersDetection
            True,  # PunctuationDetection
            True,  # DatesDetection
            True,  # DictionarySearch
            True,  # AffixAnalysis
            False, # CompoundAnalysis
            True,  # RetokContractions
            True,  # MultiwordsDetection
            True,  # NERecognition
            True,  # QuantitiesDetection
            True   # ProbabilityAssignment
        )
        # default: all created submodules are used

        # create tagger, sense annotator, and parsers
        self.tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        self.sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
        # NOTE(review): the file name hard-codes "en" although the directory
        # follows LANG - confirm before using another language.
        self.dep = pyfreeling.dep_lstm(
            DATA + LANG + "/dep_lstm/params-en.dat")

    def obtain_tokens(self, text, feature_dict):
        """Count every word form of the mail body into ``feature_dict``.

        The whole payload is tokenized and split in one go, so words from
        *all* lines are counted (previously only the last non-blank line was,
        and a body without any non-blank line raised ``UnboundLocalError``).

        Args:
            text: mail object exposing ``get_payload()`` returning a string.
            feature_dict: dict mutated in place, word form -> occurrence count.

        Returns:
            The sentence container returned by the splitter (empty when the
            payload holds no words).
        """
        lw = self.tk.tokenize(text.get_payload())
        # flush=True makes the splitter emit its buffered trailing words as a
        # final sentence instead of keeping them in the session for the next call.
        ls = self.sp.split(self.sid, lw, True)
        for s in ls:
            for w in s.get_words():
                key = w.get_form()
                feature_dict[key] = feature_dict.get(key, 0) + 1
        return ls

    def close(self):
        """Close the splitter session opened in ``setup()``."""
        self.sp.close_session(self.sid)

In [None]:
# Start from empty containers so that re-running this cell does not accumulate duplicates.
mail_dict = {}
email_list = []

# Parse every file found anywhere below basedir; raw_parse appends (timestamp, message) tuples.
for directory, subdirectory, filenames in os.walk(basedir):
    for filename in filenames:
        raw_parse(os.path.join(directory, filename), email_list)
# Chronological order: obtain_raw_threads only attaches a reply to a root it has already seen.
email_list.sort(key=lambda x: x[0])

# mail_dict = obtain_base_features(email_list)

# Fills mail_dict (message-id -> message) and returns root message-id -> list of reply message-ids.
pureThreads = obtain_raw_threads(mail_dict, email_list)

In [None]:
# Header-level features for the first two mails only.
# The outer key is the whole (timestamp, message) tuple; the value is the
# {message-id: {...}} dict returned by obtain_base_features.
mailsWithFeatures = {}
for mail in email_list[:2]:
    actualEmail = mail[1]  # second tuple item is the parsed message
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)

In [None]:
class FreelingAnalyzer(object):
    """FreeLing pipeline wrapper producing token, lemma and PoS counts for a mail.

    All ``obtain_*`` methods share one private helper that tokenizes the mail
    body, splits it into sentences, optionally runs further analyzers and
    counts one key per word.
    """

    def __init__(self, folder, lang):
        """Store the settings and build all FreeLing modules via ``setup()``.

        Args:
            folder: kept on the instance; not read by any method of this class.
            lang: language code (e.g. ``'en'``) used to build data-file paths.
        """
        self.folder = folder
        self.lang = lang
        self.tk = None    # tokenizer
        self.sp = None    # sentence splitter
        self.sid = None   # splitter session id
        self.mf = None    # morphological analyzer (maco)
        self.tg = None    # HMM tagger
        self.sen = None   # sense annotator
        # self.parser = None
        self.dep = None   # dependency parser
        self.setup()

    def setup(self):
        """Locate the FreeLing data files and instantiate every module.

        ``FREELINGDIR`` is read from the environment; when unset a
        platform-dependent default is written into ``os.environ`` and a
        notice is printed to stderr.

        Raises:
            SystemExit: (exit code 1) when ``<FREELINGDIR>/share/freeling``
                does not exist.
        """
        # Check whether we know where to find FreeLing data files
        if "FREELINGDIR" not in os.environ:
            if sys.platform == "win32" or sys.platform == "win64":
                os.environ["FREELINGDIR"] = "C:\\Program Files"
            else:
                os.environ["FREELINGDIR"] = "/usr/local"
            print(
                "FREELINGDIR environment variable not defined, trying ",
                os.environ["FREELINGDIR"],
                file=sys.stderr,
            )

        if not os.path.exists(os.environ["FREELINGDIR"] + "/share/freeling"):
            print(
                "Folder",
                os.environ["FREELINGDIR"] + "/share/freeling",
                "not found.\n" +
                "Please set FREELINGDIR environment variable to FreeLing installation directory",
                file=sys.stderr,
            )
            sys.exit(1)

        # Location of FreeLing configuration files.
        DATA = os.environ["FREELINGDIR"] + "/share/freeling/"

        # Init locales
        pyfreeling.util_init_locale("default")

        # create options set for maco analyzer.
        # Default values are Ok, except for data files.
        LANG = self.lang
        op = pyfreeling.maco_options(LANG)
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat",
            "",
            DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat",
            DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat",
        )

        # create analyzers
        self.tk = pyfreeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = pyfreeling.splitter(DATA + LANG + "/splitter.dat")
        self.sid = self.sp.open_session()
        self.mf = pyfreeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False, # UserMap
            True,  # NumbersDetection
            True,  # PunctuationDetection
            True,  # DatesDetection
            True,  # DictionarySearch
            True,  # AffixAnalysis
            False, # CompoundAnalysis
            True,  # RetokContractions
            True,  # MultiwordsDetection
            True,  # NERecognition
            True,  # QuantitiesDetection
            True   # ProbabilityAssignment
        )
        # default: all created submodules are used

        # create tagger, sense annotator, and parsers
        self.tg = pyfreeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        self.sen = pyfreeling.senses(DATA + LANG + "/senses.dat")
        # NOTE(review): the file name hard-codes "en" although the directory
        # follows LANG - confirm before using another language.
        self.dep = pyfreeling.dep_lstm(
            DATA + LANG + "/dep_lstm/params-en.dat")

    def _count_word_keys(self, text, make_key, analyzers=(), counts=None):
        """Tokenize, split and analyze the mail body, counting one key per word.

        Args:
            text: mail object exposing ``get_payload()`` returning a string.
            make_key: callable turning a word object into the dict key.
            analyzers: objects with an ``analyze(sentences)`` method, applied
                in order to the splitter output.
            counts: dict to update in place; a new dict is created when None.

        Returns:
            The dict of key -> occurrence count.
        """
        if counts is None:
            counts = {}
        words = self.tk.tokenize(text.get_payload())
        # flush=True forces the splitter to emit its buffered trailing words,
        # so they are counted here and do not leak into the next call that
        # shares this session.
        sentences = self.sp.split(self.sid, words, True)
        for analyzer in analyzers:
            sentences = analyzer.analyze(sentences)
        for sentence in sentences:
            for word in sentence.get_words():
                key = make_key(word)
                counts[key] = counts.get(key, 0) + 1
        return counts

    def obtain_tokens_alt(self, text, feature_dict):
        """Count word forms of ``text`` into the caller-supplied ``feature_dict``.

        Returns:
            None; ``feature_dict`` is mutated in place.
        """
        self._count_word_keys(text, lambda w: w.get_form(), counts=feature_dict)

    def obtain_tokens(self, text):
        """Return a dict mapping each word form of ``text`` to its count."""
        return self._count_word_keys(text, lambda w: f'{w.get_form()}')

    def obtain_lemmas(self, text):
        """Return counts keyed ``<form>_Lemma_<lemma>`` after morphological analysis."""
        return self._count_word_keys(
            text,
            lambda w: f'{w.get_form()}_Lemma_{w.get_lemma()}',
            analyzers=(self.mf,),
        )

    def obtain_pos(self, text):
        """Return counts keyed ``<form>_PoS_<tag>``.

        The morphological analyzer is run before the tagger so that the tagger
        has candidate analyses to choose from.
        """
        return self._count_word_keys(
            text,
            lambda w: f'{w.get_form()}_PoS_{w.get_tag()}',
            analyzers=(self.mf, self.tg),
        )

    def close(self):
        """Close the splitter session opened in ``setup()``."""
        self.sp.close_session(self.sid)

In [None]:
%%time
# Build the FreeLing pipeline once (timed by %%time); the cells below reuse this instance.
anal = FreelingAnalyzer(basedir, lang)

In [None]:
# Print the body of the mail currently held in `actualEmail` (set by the most
# recently run feature loop), one whitespace-stripped line at a time.
for lin in io.StringIO(actualEmail.get_payload()):
    print(lin.strip())

In [None]:
# Scratch dict for trying the analyzer by hand; the calls below are currently disabled.
results = {}
#anal.obtain_tokens(actualEmail, results)
#anal.obtain_tokens(actualEmail)

In [None]:
#results

In [None]:
# Feature switch: when True, token counts are added to each mail's record.
token = True
# Rebuilt from scratch on every run of this cell; only the first two mails are processed.
mailsWithFeatures = {}
for mail in email_list[:2]:
    actualEmail = mail[1]  # second tuple item is the parsed message
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    if token:
        # The record is nested: (timestamp, message) tuple -> message-id -> feature name.
        mailsWithFeatures[mail][actualEmail['message-id']]['tokens'] = anal.obtain_tokens(actualEmail)

In [None]:
#mailsWithFeatures

In [None]:
# Feature switches: each True flag adds the matching counts to every mail's record.
token = True
lemma = True
# Rebuilt from scratch on every run of this cell; only the first two mails are processed.
mailsWithFeatures = {}
for mail in email_list[:2]:
    actualEmail = mail[1]  # second tuple item is the parsed message
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    if token:
        mailsWithFeatures[mail][actualEmail['message-id']]['tokens'] = anal.obtain_tokens(actualEmail)
    if lemma:
        mailsWithFeatures[mail][actualEmail['message-id']]['lemmas'] = anal.obtain_lemmas(actualEmail)

In [None]:
# Display the base, token and lemma features collected by the previous cell.
mailsWithFeatures

In [None]:
# Feature switches: only part-of-speech counts are enabled in this run.
token = False
lemma = False
pos = True
# Rebuilt from scratch on every run of this cell; only the first two mails are processed.
mailsWithFeatures = {}
for mail in email_list[:2]:
    actualEmail = mail[1]  # second tuple item is the parsed message
    mailsWithFeatures[mail] = obtain_base_features(actualEmail)
    if token:
        mailsWithFeatures[mail][actualEmail['message-id']]['tokens'] = anal.obtain_tokens(actualEmail)
    if lemma:
        mailsWithFeatures[mail][actualEmail['message-id']]['lemmas'] = anal.obtain_lemmas(actualEmail)
    if pos:
        mailsWithFeatures[mail][actualEmail['message-id']]['pos'] = anal.obtain_pos(actualEmail)

In [None]:
# Display the base and part-of-speech features collected by the previous cell.
mailsWithFeatures