# Create elementary objects

In [1]:
from melusine.core.pipeline import MelusinePipeline

from melusine.nlp_tools.normalizer import Normalizer
from melusine.nlp_tools.tokenizer import RegexTokenizer
from melusine.nlp_tools.phraser import Phraser
from melusine.nlp_tools.text_processor import make_tokenizer
from melusine.nlp_tools.text_flagger import DeterministicTextFlagger
from melusine.nlp_tools.token_flagger import FlashtextTokenFlagger
from melusine.nlp_tools.lemmatizer import DummyLemmatizer
from melusine.nlp_tools.embedding import Embedding

from melusine import load_email_data

Using pandas backend for Data transformations


In [2]:
# Elementary processing objects reused by the pipelines built in later cells.
# Normalizer configured with the NFKD unicode form and lowercasing enabled.
n = Normalizer(form="NFKD", lowercase=True)
# Regex tokenizer; "le" and "les" are passed as stopwords.
t = RegexTokenizer(tokenizer_regex=r"\w+(?:[\?\-\"_]\w+)*", stopwords=["le", "les"])
# Text flagger configured with one rule: 10 consecutive digits -> "flag_phone".
textf = DeterministicTextFlagger(text_flags = {r"\d{10}": "flag_phone"})
# Token flagger configured with one rule: tokens "joe" / "bob" -> "flag_name".
tokenf = FlashtextTokenFlagger(token_flags = {"flag_name": ["joe", "bob"]})
# Lemmatizer step built with its default arguments.
d = DummyLemmatizer()

In [40]:
from melusine import config

In [None]:
# NOTE(review): `Segmenter` is not imported by any cell visible in this notebook,
# so this line raises NameError on a fresh kernel — TODO add the missing import.
# The regex is read from the "segment" entry of the melusine config.
segmenter = Segmenter(regex_seg=config["segment"])

In [3]:
e = Embedding(min_count=2)

# Melusine Pipeline

In [4]:
# Load the sample emails bundled with melusine (type="full").
df = load_email_data(type="full")
# The pipeline steps read and write the "text" column, so seed it from "body".
df["text"] = df["body"]
df.head(2)

Unnamed: 0,body,header,date,from,to,attachment,sexe,age,label,is_begin_by_transfer,...,min__44,min__45,min__49,min__52,min__54,min__56,min__58,attachment_type__0,attachment_type__1,text
0,\n \n \n \n Bonjour \n Je suis client chez...,Devis habitation,2018-05-24 11:36:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,[],F,35,habitation,True,...,0,0,0,0,0,0,0,0,1,\n \n \n \n Bonjour \n Je suis client chez...
1,"\n \n \n \n Bonsoir madame, \n \n Je vous...",Immatriculation voiture,2018-05-24 19:37:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",M,32,vehicule,True,...,0,0,0,0,0,0,0,1,0,"\n \n \n \n Bonsoir madame, \n \n Je vous..."


## Assemble Pipeline

In [5]:
# Second text flagger, configured with one rule: regex "je" -> "JE".
textf2 = DeterministicTextFlagger(text_flags = {r"je": "JE"})
# Phraser step that reads from and writes back to the "tokens" column.
gensim_phraser = Phraser(threshold=2, min_count=2, input_columns=["tokens"], output_columns=["tokens"])

# Chain the elementary objects as named steps, in the order listed below.
# verbose=True is forwarded to the pipeline constructor.
m_pipe = MelusinePipeline([
        ("normalizer", n),
        ("text_flagger", textf),
        ("text_flagger2", textf2),
        ("tokenizer", t),
        ("lemmatizer", d),
        ("gensim_phraser", gensim_phraser),
        ("token_flagger", tokenf),
        ("w2v", e),
],
    verbose=True
)

## Execute pipeline

In [6]:
df = m_pipe.fit_transform(df)

[Pipeline] ........ (step 1 of 8) Processing normalizer, total=   0.0s
[Pipeline] ...... (step 2 of 8) Processing text_flagger, total=   0.0s
[Pipeline] ..... (step 3 of 8) Processing text_flagger2, total=   0.0s
[Pipeline] ......... (step 4 of 8) Processing tokenizer, total=   0.0s
[Pipeline] ........ (step 5 of 8) Processing lemmatizer, total=   0.0s
[Pipeline] .... (step 6 of 8) Processing gensim_phraser, total=   0.0s
[Pipeline] ..... (step 7 of 8) Processing token_flagger, total=   0.0s
[Pipeline] ............... (step 8 of 8) Processing w2v, total=   0.1s


In [7]:
df.iloc[0]["tokens"]

['bonjour_JE',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

In [9]:
m_pipe.save("my_pipeline")

In [10]:
m_pipe_reloaded = MelusinePipeline.load("my_pipeline")

In [11]:
m_pipe_reloaded

MelusinePipeline(steps=[('normalizer',
                         Normalizer(input_columns=['text'],
                                    output_columns=['text'])),
                        ('text_flagger',
                         DeterministicTextFlagger(input_columns=['text'],
                                                  output_columns=['text'],
                                                  text_flags={'\\d{10}': 'flag_phone'})),
                        ('text_flagger2',
                         DeterministicTextFlagger(input_columns=['text'],
                                                  output_columns=['text'],
                                                  text_flags={'je': 'JE'})),
                        ('tokenizer'...
                                         output_columns=('tokens',))),
                        ('gensim_phraser',
                         Phraser(input_columns=['tokens'],
                                 output_columns=['tokens'])),
               

In [12]:
# Start again from the raw emails to exercise the reloaded pipeline.
df = load_email_data(type="full")
df["text"] = df["body"]
# transform only (no fit) with the pipeline loaded from "my_pipeline".
df = m_pipe_reloaded.transform(df)
df.iloc[0]["tokens"]

['bonjour_JE',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

In [13]:
import numpy as np
# This expression is a 2-tuple: the value stored under the "date" key of the
# "w2v" step's `embeddings_`, followed by the `np.ndarray` class itself.
# NOTE(review): an `isinstance(..., np.ndarray)` check may have been intended — confirm.
m_pipe_reloaded.named_steps["w2v"].embeddings_["date"], np.ndarray

(array([ 9.0266630e-04,  6.4310022e-03, -6.7403787e-03,  7.1425955e-03,
         1.0331177e-02,  1.9011955e-03,  3.9349636e-03,  6.4286543e-03,
        -2.9777391e-03, -8.4869433e-03,  8.1091013e-04, -5.0247193e-04,
        -7.2138435e-03,  2.3347558e-03,  4.8175771e-03, -8.1246467e-03,
         2.5347041e-03,  6.5802303e-03, -8.8761132e-03,  1.1852063e-03,
        -8.1245080e-03, -7.6027690e-03,  7.6509570e-03, -7.9380795e-03,
        -1.8591663e-03, -7.8568542e-03, -8.5925832e-03, -8.1224414e-03,
        -1.2418941e-03,  2.2142131e-03, -6.9472692e-03, -7.7581573e-03,
        -2.5859969e-03, -2.4417292e-03, -6.8943468e-03, -2.9456376e-03,
         3.4222791e-03,  1.8526295e-03,  5.4572974e-03,  2.1080021e-03,
        -5.4242183e-04,  1.3953992e-03,  8.9178765e-03,  2.0206973e-03,
         3.1961922e-03,  4.3547744e-04, -9.5608933e-03,  4.6774661e-03,
         5.7900925e-03,  4.0709460e-04,  1.0159505e-02, -6.6816816e-03,
         3.5485327e-03, -7.7318372e-03, -9.7941346e-03,  2.79994

# Pipeline Visualization

In [14]:
from sklearn import set_config

set_config(display='diagram')
m_pipe_reloaded

# Simplified Pipeline creation

In [15]:
# Build a tokenization pipeline in a single call; the keyword values are the
# same ones used for the hand-built Normalizer / RegexTokenizer / flaggers above.
easy_pipe = make_tokenizer(
        form = "NFKD",
        lowercase = True,
        tokenizer_regex = r"\w+(?:[\?\-\"_]\w+)*",
        stopwords = ["le", "les"],
        text_flags = {r"\d{10}": "flag_phone"},
        token_flags = {"flag_name": ["joe", "bob"]},  
)

In [16]:
easy_pipe

In [17]:
# Start again from the raw emails and run them through `easy_pipe`.
df = load_email_data(type="full")
df["text"] = df["body"]
df = easy_pipe.transform(df)
# Show the tokens produced for the first email.
df.iloc[0]["tokens"]

['bonjour',
 'je',
 'suis',
 'client',
 'chez',
 'vous',
 'pouvez',
 'vous',
 'm',
 'etablir',
 'un',
 'devis',
 'pour',
 'mon',
 'fils',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye',
 'de',
 'mon',
 'iphone']

In [None]:
easy_pipe.save("my_easy_pipeline")

In [None]:
easy_pipe_reloaded = MelusinePipeline.load("my_easy_pipeline")

In [None]:
# Start again from the raw emails and run them through the pipeline
# loaded from "my_easy_pipeline".
df = load_email_data(type="full")
df["text"] = df["body"]
df = easy_pipe_reloaded.transform(df)
# Show the tokens produced for the first email.
df.iloc[0]["tokens"]

# Pipeline composition

In [19]:
# First sub-pipeline: the text-level steps (normalizer, both text flaggers, tokenizer).
p1 = MelusinePipeline([
        ("normalizer", n),
        ("text_flagger", textf),
        ("text_flagger2", textf2),
        ("tokenizer", t),
],
    verbose=True
)
# Second sub-pipeline: the token-level steps (lemmatizer, phraser, token flagger).
p2 = MelusinePipeline([
        ("lemmatizer", d),
        ("gensim_phraser", gensim_phraser),
        ("token_flagger", tokenf),
],
    verbose=True
)

In [20]:
p3 = MelusinePipeline([("text_pipe", p1), ("token_pipe", p2)])

In [21]:
p3.transform(df).head(2)

Unnamed: 0,body,header,date,from,to,attachment,sexe,age,label,is_begin_by_transfer,...,min__45,min__49,min__52,min__54,min__56,min__58,attachment_type__0,attachment_type__1,text,tokens
0,\n \n \n \n Bonjour \n Je suis client chez...,Devis habitation,2018-05-24 11:36:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,[],F,35,habitation,True,...,0,0,0,0,0,0,0,1,\n \n \n \n bonjour \n JE suis client chez...,"[bonjour_JE, sui, client, chez, vou, pouvez, v..."
1,"\n \n \n \n Bonsoir madame, \n \n Je vous...",Immatriculation voiture,2018-05-24 19:37:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",M,32,vehicule,True,...,0,0,0,0,0,0,1,0,"\n \n \n \n bonsoir madame, \n \n JE vous...","[bonsoir, madame_JE, vou, informe, que, la_nou..."


In [None]:
p3

In [22]:
p3.save("pipeline_compo")

In [None]:
p3_reloaded = MelusinePipeline.load("pipeline_compo")
p3_reloaded

In [None]:
# Start again from the raw emails and run them through the composed pipeline
# reloaded from "pipeline_compo", so that this cell actually validates the
# save/load round trip of the composition (not of the earlier flat pipeline).
df = load_email_data(type="full")
df["text"] = df["body"]
df = p3_reloaded.transform(df)
# Show the tokens produced for the first email.
df.iloc[0]["tokens"]

# Change Pipeline Execution Backend

In [23]:
from melusine.backend.active_backend import switch_backend, backend

In [None]:
# NOTE(review): scratch call — `apply_transform`, `data`, `func`, `input_colums`
# and `out` are not defined by any cell visible in this notebook, so this raises
# NameError on a fresh kernel. TODO remove or replace with a runnable example.
apply_transform(data, func, input_colums, out)

In [24]:
backend._backend

<melusine.backend.pandas_backend.PandasBackend at 0x7ffe5f4e17c0>

In [33]:
# switch_backend("pandas", progress_bar=True, workers=5)

In [26]:
backend._backend

<melusine.backend.dict_backend.DictBackend at 0x7ffe608e0190>

In [27]:
ddd = df.iloc[0].to_dict()

In [None]:
type(m_pipe.transform(ddd))

In [32]:
m_pipe.transform(df)

normalize: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 23350.34it/s]
flag_text: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 11249.31it/s]
flag_text: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 26968.68it/s]
tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 11101.92it/s]
lemmatize: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 19418.07it/s]
flag_tokens: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 4077.

Unnamed: 0,body,header,date,from,to,attachment,sexe,age,label,is_begin_by_transfer,...,min__45,min__49,min__52,min__54,min__56,min__58,attachment_type__0,attachment_type__1,text,tokens
0,\n \n \n \n Bonjour \n Je suis client chez...,Devis habitation,2018-05-24 11:36:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,[],F,35,habitation,True,...,0,0,0,0,0,0,0,1,\n \n \n \n bonjour \n JE suis client chez...,"[bonjour_JE, sui, client, chez, vou, pouvez, v..."
1,"\n \n \n \n Bonsoir madame, \n \n Je vous...",Immatriculation voiture,2018-05-24 19:37:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",M,32,vehicule,True,...,0,0,0,0,0,0,1,0,"\n \n \n \n bonsoir madame, \n \n JE vous...","[bonsoir, madame_JE, vou, informe, que, la_nou..."
2,"\n \n \n Bonjours, \n \n Suite a notre con...",Re: Envoi d'un document de la Société Imaginaire,2018-05-25 06:45:00,Monsieur Dupont <monsieurdupont@extensiona.com>,demandes@societeimaginaire.fr,[],M,66,compte,False,...,1,0,0,0,0,0,0,1,"\n \n \n bonjours, \n \n suite a notre con...","[bonjour_suite, a_notre, conversation, telepho..."
3,"\n \n \n \n \n Bonjour, \n \n \n Je fai...",Re: Votre adhésion à la Société Imaginaire,2018-05-25 10:15:00,Monsieur Dupont <monsieurdupont@extensiond.com>,demandes@societeimaginaire.fr,"[""fichedepaie.png""]",M,50,adhesion,False,...,0,0,0,0,0,0,1,0,"\n \n \n \n \n bonjour, \n \n \n JE fai...","[bonjour_JE, fai, suite_a, votre, mail, j_ai, ..."
4,"\n \n \n Bonjour, \n Voici ci joint mon bul...",Bulletin de salaire,2018-05-25 17:30:00,Monsieur Dupont <monsieurdupont@extensiona.com>,demandes@societeimaginaire.fr,"[""pj.pdf""]",M,15,adhesion,False,...,0,0,0,0,0,0,1,0,"\n \n \n bonjour, \n voici ci joint mon bul...","[bonjour_voici, ci, joint, mon_bulletin, de_sa..."
5,"Madame, Monsieur, \n \n Je vous avais contact...",Modification et extension de ma maison,2018-05-31 10:28:00,Monsieur Dupont <monsieurdupont@extensiona.com>,demandes@societeimaginaire.fr,[],F,22,habitation,False,...,0,0,0,0,0,0,0,1,"madame, monsieur, \n \n JE vous avais contact...","[madame_monsieur, JE_vou, avai, contacte, car,..."
6,"\n \n \n \n Bonjour, \n \n J'emménage dan...",Assurance d'un nouveau logement,2018-05-30 15:56:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",F,28,resiliation,True,...,0,0,0,0,1,0,1,0,"\n \n \n \n bonjour, \n \n j'emmenage dan...","[bonjour, j, emmenage, dan, un, nouveau, studi..."
7,"\n \n \n \n \n Bonjour, \n \n \n \n Je...",Assurance véhicules,2018-05-31 14:02:00,Monsieur Dupont <monsieurdupont@extensiona.com>,demandes@societeimaginaire.fr,"[""image001.png""]",M,39,vehicule,False,...,0,0,0,0,0,0,1,0,"\n \n \n \n \n bonjour, \n \n \n \n JE...","[bonjour_JE, me, permet, de, venir, ver, vou, ..."
8,"\n \n \n Bonjour, \n \n Voici la copie du ...",Re: Virement,2018-05-31 17:10:00,Monsieur Dupont <monsieurdupont@extensione.com>,demandes@societeimaginaire.fr,"[""pj.pdf""]",M,38,autres,False,...,0,0,0,0,0,0,1,0,"\n \n \n bonjour, \n \n voici la copie du ...","[bonjour_voici, la, copie, du, virement, effec..."
9,\n \n \n \n \n \n \n \n BONJOUR \n \n...,Prêt véhicule,2018-05-31 08:54:00,Monsieur Dupont <monsieurdupont@extensionb.com>,demandes@societeimaginaire.fr,"[""pj.pdf""]",M,30,vehicule,False,...,0,0,0,1,0,0,1,0,\n \n \n \n \n \n \n \n bonjour \n \n...,"[bonjour, ci-joint, pret, vehicule, cordialeme..."


# Help users understand the Framework

In [34]:
from melusine.core.melusine_transformer import MelusineTransformer

In [36]:
class NoFilename(MelusineTransformer):
    """Bare-bones ``MelusineTransformer`` subclass with no-op persistence.

    The constructor only forwards the column configuration to the parent
    class; ``save`` and ``load`` do nothing.
    """

    FILENAME = "wesh"

    def __init__(self, input_columns=("text",), output_columns=("text",)):
        super().__init__(input_columns, output_columns)

    def save(self):
        """Do nothing and return ``None``."""
        return None

    def load(self):
        """Do nothing and return ``None``."""
        return None


nofilename = NoFilename()

In [37]:
class NoFunc(MelusineTransformer):
    """``MelusineTransformer`` subclass that defines neither ``func`` nor ``transform``.

    The constructor only forwards the column configuration to the parent
    class; ``save`` and ``load`` do nothing.
    """

    FILENAME = "nofunc"

    def __init__(self, input_columns=("text",), output_columns=("text",)):
        super().__init__(input_columns, output_columns)

    def save(self):
        """Do nothing and return ``None``."""
        return None

    def load(self):
        """Do nothing and return ``None``."""
        return None


nofunc = NoFunc()

In [38]:
print(nofunc.transform(ddd))

Instance of <class '__main__.NoFunc'> does not have a func attribute
You should either specify a func attribute or define your own transform method


# Regex definition

In [None]:
# === Important note ===

# Line breaks are replaced by the pattern " ; " when the request is received.
# This is clearly an undesirable transformation today, but it is anchored in
# the existing code and changing it would take a few days of work plus an
# impact study.


# === Start pattern ===
# Look for a start of line or a ";".
start_pattern = r"""(?:^|;)"""

# === Leading symbols ===
# Emails with many consecutive line breaks produce patterns like " ; ; ; ; ; ; ".
# Some forwarded messages / replies carry symbols at the start of lines (> and/or |).
# Ex:
# Merci
# > De foo@maif.fr A bar@gmail.com
# > Voici le document
# All of these symbols are skipped.
ignore_characters = r"""(?:[>| ;]*)"""

# === Transition keywords ===
# Some keywords found in replies and forwarded emails are used for segmentation.
# These keywords are followed by the ":" symbol.
# Ex:
# De : XX A : XX Sujet : Blah Blah
# Written as a raw string so that every backslash is a regex escape (\b, \s).
meta_transition_words = r"""(?:\b(?:[Ee]nvoy[ée](?: par)?|[Dd]e|[Oo]bjet|[Cc]c|Date|[AÀàa]|[Dd]estinataire|[Ss]ent|[Tt]o|[Ss]ub?jec?t|[Ff]rom|[Cc]opie [àa])\b\s{,4}:)"""

# === Metadata ===
# Keywords are followed by free-text fields that must be identified.
# A line break (";") surrounded by up to 4 whitespace characters follows the keyword.
# NOTE(review): the original comment says a line break is "accepted" here, which
# may mean the ";" was meant to be optional (";?") — confirm before relaxing it.
# The free-text field is limited to 100 characters (a ".*" is very expensive to evaluate).
# The free-text field stops when a ";" (line break) or "|" is found.
# Raw string with SINGLE backslashes: doubling them inside a raw string would make
# the regex look for a literal backslash and the pattern would never match.
meta_content_pattern = r"""(?:\s{,4};\s{,4}[^;]{,100}[;|]\s{,4})"""


# === Metadata pattern ===
meta_pattern = fr"(?:{ignore_characters}{meta_transition_words}{meta_content_pattern})"

# === Full pattern ===
# A start pattern followed by one or more repetitions of meta_pattern.
regex = fr"""({start_pattern}{meta_pattern}+)"""

In [None]:
print(regex)

In [None]:
# Reference pattern printed for comparison. A raw string keeps "\s" as a regex
# escape without relying on Python's deprecated handling of unknown string
# escapes; the printed text is unchanged.
print(r"""(?:(?:^|;)(?:(?:[>| ;]*(?:Envoy[ée]|De|Objet|Cc|Envoy[ée] par|Date|A|À|Destinataire|Sent|To|Subject|Sujet|From|Copie [àa])\s{,4}:\s{,4};?\s{,4}[^;]{,100}[;|]\s{,4}))+)""")

In [None]:
print("""| |\n| |\n| |-------- Message transféré --------\n| |\n| |Sujet :\n| | [INTERNET] Dossier F210306856A -\n| | Date :| | Mon, 13 Sep 2021 13:45:39 +0200 | | De :| | gestionsinistre@maif.fr | | Pour :| | anomalies-vol-siv@interieur.gouv.fr | | | |Bonjour, |""")

In [None]:
import re

class RegexEngine:
    """Run catalog-driven regex detections through the active melusine backend."""

    def __init__(self, regex_catalog):
        """Store the catalog: a mapping from detection key to a list of regex strings."""
        self.regex_catalog = regex_catalog

    @staticmethod
    def match_text(text, regex):
        """Return True when ``regex`` is found anywhere in ``text``, else False."""
        return re.search(regex, text) is not None

    def detect(self, df, detection_key, input_column, output_column):
        """Apply the detection registered under ``detection_key``.

        All patterns listed under the key are joined with "|" into a single
        alternation, and the resulting matcher is handed to
        ``backend.apply_transform`` with ``input_column`` as the only input
        column and ``output_column`` as the only output column.

        Raises ``KeyError`` if ``detection_key`` is not in the catalog.
        Returns whatever ``backend.apply_transform`` returns.
        """
        combined_pattern = "|".join(self.regex_catalog[detection_key])

        return backend.apply_transform(
            data=df,
            input_columns=(input_column,),
            output_columns=(output_column,),
            func=lambda x: self.match_text(x, regex=combined_pattern),
        )


In [None]:
# Detection catalog: each key names a detection and lists the regex
# alternatives that trigger it.
catalog = dict(
    insatisfaction=[r"pas content", r"furieux"],
    remerciement=[r"merci", r"thanks"],
)

In [None]:
import pandas as pd
df = pd.DataFrame({"text": ["je suis furieux today", "wesh"]})

In [None]:
r = RegexEngine(catalog)

In [None]:
r.detect(df, "insatisfaction", "text", "output")

In [None]:
# Switch the active backend to the dict backend before running the detection
# on a single record. `switch_backend` is the helper imported from
# melusine.backend.active_backend earlier in this notebook; the previous
# `use(...)` call referenced a name that no visible cell defines.
switch_backend("dict")

In [None]:
r.detect(df.iloc[0].to_dict(), "insatisfaction", "text", "output")

In [None]:
def f(x, n=2):
    """Return the pair ``(x * n, x * 10)``."""
    scaled = x * n
    tenfold = x * 10
    return scaled, tenfold

In [None]:
import pandas as pd
# Two identical rows, each holding the single column "a" with value 3.
dd_dict = dict(a=3)
dd = pd.DataFrame([dd_dict] * 2)

In [None]:
args = None

In [None]:
dd["a"].apply(f, args=args)

In [None]:
# Apply f to column "a", then expand each returned tuple into its own columns.
dd[["c", "d"]] = dd["a"].apply(f).apply(pd.Series)

In [None]:
import tqdm

In [None]:
tqdm.__version__

In [None]:
dd

In [None]:
# One record with an input value and the outputs expected from the helper
# functions, duplicated into a two-row DataFrame.
dictx = dict(
    input=2,
    expected_output1=6,
    expected_output2=20,
    expected_output_kwargs1=10,
    expected_output_kwargs2=20,
)

df = pd.DataFrame([dictx, dictx])


In [None]:
def f_single_input_multi_output(x, n=3):
    """Return the pair ``(x * n, x * 10)`` for a single input value."""
    first_output = x * n
    second_output = x * 10
    return first_output, second_output


In [None]:
result = df["input"].apply(f_single_input_multi_output)


In [None]:
result

In [None]:
# Expand each tuple of the Series into its own column.
result = result.apply(pd.Series)


In [None]:
result

In [None]:
df[['output1', 'output2']] = result

In [None]:
df

In [None]:
list(('output1', 'output2'))

In [45]:
# List of regex strings displayed as the cell result; the entries look for
# reply / forward header markers (e.g. "De :", "Objet :", "Forwarded message").
# NOTE(review): "En date de.+?[ée]crit" appears twice (entries 20 and 23) — confirm
# whether the duplicate is intentional.
# NOTE(review): in the first entry, "+-\/" inside the character class forms a range
# from "+" to "/" rather than three literal characters — confirm this is intended.
[
          "De\\s*:\\s*[^<]*?<?[a-zA-Z0-9._%+-\/=]+\\@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}>?\\s[;\nAÀ:](?:.{,80}\n){,3}Objet.+\n",
          "[- ]*?Mail transf[ée]r[ée].*?[;|\n]",
          "[- ]*?gestionsocietaire@maif.fr a [ée]crit.*?[;|\n]",
          "Courriel original.+?Objet\\s*:.+?[;|\n]",
          "Transf[ée]r[ée] par.+?Objet\\s*:.+?[;|\n]",
          "Message transmis.+?Objet\\s*:.+?[;|\n]",
          "Message transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]",
          "Message transf[ée]r[ée].+?Pour\\s*:.+?[;|\n]",
          "D[ée]but du message transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]",
          "D[ée]but du message r[ée]exp[ée]di[ée].+?Objet\\s*:.+?[;|\n]",
          "D[ée]but du message transf[ée]r[ée].+?Destinataire\\s*:.+?[;|\n]",
          "mail transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]",
          "Forwarded message.+?To\\s*:.+?[;|\n]",
          "Message d'origine.+?Objet\\s*:.+?[;|\n]",
          "Mail original.+?Objet\\s*:.+?[;|\n]",
          "Original Message.+?Subject\\s*:.+?[;|\n]",
          "Message original.+?Objet\\s*:.+?[;|\n]",
          "Exp[ée]diteur.+?Objet\\s*:.+?[;|\n]",
          "(?:>?[;|\n]?\\s*(?:Envoy[ée]|De|Objet|Cc|Envoy[ée] par|Date|A|À|Destinataire|Sent|To|Subject|From|Copie [àa])+?\\s*:\\s*(?:.*?)\\s*[;|\n]\\s*)+",
          "En date de.+?[ée]crit",
          ">?\\s*\\bLe[^;\n]{0,30}[;|\n]{0,1}[^;\n]{0,30}a[^;\n]{0,30};{0,1}[^;\n]{0,30}[ée]crit\\s*:?",
          ">?\\s*Message d[eu].+?Objet\\s*:.+?[;|\n]",
          "En date de.+?[ée]crit"
]

['De\\s*:\\s*[^<]*?<?[a-zA-Z0-9._%+-\\/=]+\\@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}>?\\s[;\nAÀ:](?:.{,80}\n){,3}Objet.+\n',
 '[- ]*?Mail transf[ée]r[ée].*?[;|\n]',
 '[- ]*?gestionsocietaire@maif.fr a [ée]crit.*?[;|\n]',
 'Courriel original.+?Objet\\s*:.+?[;|\n]',
 'Transf[ée]r[ée] par.+?Objet\\s*:.+?[;|\n]',
 'Message transmis.+?Objet\\s*:.+?[;|\n]',
 'Message transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]',
 'Message transf[ée]r[ée].+?Pour\\s*:.+?[;|\n]',
 'D[ée]but du message transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]',
 'D[ée]but du message r[ée]exp[ée]di[ée].+?Objet\\s*:.+?[;|\n]',
 'D[ée]but du message transf[ée]r[ée].+?Destinataire\\s*:.+?[;|\n]',
 'mail transf[ée]r[ée].+?Objet\\s*:.+?[;|\n]',
 'Forwarded message.+?To\\s*:.+?[;|\n]',
 "Message d'origine.+?Objet\\s*:.+?[;|\n]",
 'Mail original.+?Objet\\s*:.+?[;|\n]',
 'Original Message.+?Subject\\s*:.+?[;|\n]',
 'Message original.+?Objet\\s*:.+?[;|\n]',
 'Exp[ée]diteur.+?Objet\\s*:.+?[;|\n]',
 '(?:>?[;|\n]?\\s*(?:Envoy[ée]|De|Objet|Cc|Envoy[ée] par|Date