# Create elementary objects

In [1]:
from melusine import config

In [2]:
from melusine.core.pipeline import MelusinePipeline

from melusine.nlp_tools.normalizer import Normalizer
from melusine.nlp_tools.tokenizer import RegexTokenizer
from melusine.nlp_tools.phraser import Phraser
from melusine.nlp_tools.text_flagger import DeterministicTextFlagger
from melusine.nlp_tools.token_flagger import FlashtextTokenFlagger
from melusine.nlp_tools.lemmatizer import DummyLemmatizer
from melusine.nlp_tools.embedding import Embedding

from melusine import load_email_data

Using pandas backend for Data transformations


# Instanciate transformers (explicitely or from config)

## Normalizer

In [3]:
# Explicit
# normalizer = Normalizer(form="NFKD", lowercase=True)

# From config but overriding some parameters
# normalizer = Normalizer.from_config("normalizer", form="NFD")

# From config
normalizer = Normalizer.from_config("normalizer", form="NFD")

# Text Flagger

In [4]:
text_flagger = DeterministicTextFlagger(text_flags = {r"\d{10}": "flag_phone"})

# Tokenizer

In [5]:
tokenizer = RegexTokenizer(tokenizer_regex=r"\w+(?:[\?\-\"_]\w+)*", stopwords=["le", "les"])

## Token Flagger

In [6]:
token_flagger = FlashtextTokenFlagger(token_flags = {"flag_name": ["joe", "bob"]})

## Lemmatizer

In [7]:
lemmatizer = DummyLemmatizer()

## Embedding

In [8]:
embedding = Embedding(min_count=2)

## Phraser

In [9]:
phraser = Phraser(threshold=2, min_count=2, input_columns=["tokens"], output_columns=["tokens"])

## Transformers can be used as stand-alone or inserted in a pipeline

In [10]:
text_flagger.flag_text("Appellez moi au 0612345678")

'Appellez moi au flag_phone'

# Melusine Pipeline

In [11]:
# Load data
df = load_email_data(type="full")
df["text"] = df["body"]
df.head(2)

Unnamed: 0,body,header,date,from,to,attachment,sexe,age,label,is_begin_by_transfer,...,min__44,min__45,min__49,min__52,min__54,min__56,min__58,attachment_type__0,attachment_type__1,text
0,\n \n \n \n Bonjour \n Je suis client chez...,Devis habitation,2018-05-24 11:36:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,[],F,35,habitation,True,...,0,0,0,0,0,0,0,0,1,\n \n \n \n Bonjour \n Je suis client chez...
1,"\n \n \n \n Bonsoir madame, \n \n Je vous...",Immatriculation voiture,2018-05-24 19:37:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",M,32,vehicule,True,...,0,0,0,0,0,0,0,1,0,"\n \n \n \n Bonsoir madame, \n \n Je vous..."


## Assemble Pipeline

In [12]:
m_pipe = MelusinePipeline([
        ("normalizer", normalizer),
        ("text_flagger", text_flagger),
        ("tokenizer", tokenizer),
        ("lemmatizer", lemmatizer),
        ("gensim_phraser", phraser),
        ("token_flagger", token_flagger),
        ("w2v", embedding),
],
    verbose=True
)

## Execute pipeline

In [13]:
# df.iloc[0]

In [14]:
df = m_pipe.fit_transform(df)

[Pipeline] ........ (step 1 of 7) Processing normalizer, total=   0.0s
[Pipeline] ...... (step 2 of 7) Processing text_flagger, total=   0.0s
[Pipeline] ......... (step 3 of 7) Processing tokenizer, total=   0.0s
[Pipeline] ........ (step 4 of 7) Processing lemmatizer, total=   0.0s
[Pipeline] .... (step 5 of 7) Processing gensim_phraser, total=   0.0s
[Pipeline] ..... (step 6 of 7) Processing token_flagger, total=   0.0s
[Pipeline] ............... (step 7 of 7) Processing w2v, total=   0.0s


In [15]:
# Tokenized text
df.iloc[0]["tokens"]

['bonjour_je',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

## Save and Load a MelusinePipeline

In [16]:
m_pipe.save("my_pipeline")

In [17]:
m_pipe_reloaded = MelusinePipeline.load("my_pipeline")

In [18]:
m_pipe_reloaded

MelusinePipeline(steps=[('normalizer',
                         Normalizer(input_columns=['text'],
                                    output_columns=['text'])),
                        ('text_flagger',
                         DeterministicTextFlagger(input_columns=['text'],
                                                  output_columns=['text'],
                                                  text_flags={'\\d{10}': 'flag_phone'})),
                        ('tokenizer',
                         RegexTokenizer(input_columns=['text'],
                                        output_columns=['tokens'],
                                        stopwords={'le', 'les'})),
                        ('lemmatizer',
                         DummyLemma...
                                         output_columns=('tokens',))),
                        ('gensim_phraser',
                         Phraser(input_columns=['tokens'],
                                 output_columns=['tokens'])),
         

In [19]:
df = load_email_data(type="full")
df["text"] = df["body"]
df = m_pipe_reloaded.transform(df)
df.iloc[0]["tokens"]

['bonjour_je',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

## Make sure that transformers have been fitted
Test the word embedding

In [20]:
m_pipe_reloaded.named_steps["w2v"].embeddings_.most_similar("date")

[('souhaite', 0.37198781967163086),
 ('au_nom', 0.3559889793395996),
 ('32', 0.31578245759010315),
 ('une', 0.27164793014526367),
 ('monsieurdupont', 0.2551308572292328),
 ('acte', 0.23880381882190704),
 ('prie_de', 0.2232523262500763),
 ('si', 0.2162163406610489),
 ('notre', 0.2139136791229248),
 ('je', 0.2138250172138214)]

# Pipeline Visualization

In [21]:
from sklearn import set_config

set_config(display='diagram')
m_pipe_reloaded

# Pipeline composition (3 layers !)

In [22]:
p1 = MelusinePipeline([
        ("normalizer", normalizer),
        ("text_flagger", text_flagger),
        ("tokenizer", tokenizer),
],
    verbose=True
)
p2 = MelusinePipeline([
        ("lemmatizer", lemmatizer),
        ("gensim_phraser", phraser),
],
    verbose=True
)
p3 = MelusinePipeline([
        ("token_flagger", token_flagger),
],
    verbose=True
)

In [23]:
p12 = MelusinePipeline([("text_pipe", p1), ("token_pipe", p2)])
p123 = MelusinePipeline([("p12", p12), ("p3", p3)])

In [24]:
p123.transform(df).head(2)

Unnamed: 0,body,header,date,from,to,attachment,sexe,age,label,is_begin_by_transfer,...,min__45,min__49,min__52,min__54,min__56,min__58,attachment_type__0,attachment_type__1,text,tokens
0,\n \n \n \n Bonjour \n Je suis client chez...,Devis habitation,2018-05-24 11:36:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,[],F,35,habitation,True,...,0,0,0,0,0,0,0,1,\n \n \n \n bonjour \n je suis client chez...,"[bonjour_je, sui, client, chez, vou, pouvez, v..."
1,"\n \n \n \n Bonsoir madame, \n \n Je vous...",Immatriculation voiture,2018-05-24 19:37:00,Dupont <monsieurdupont@extensiona.com>,conseiller@Societeimaginaire.fr,"[""pj.pdf""]",M,32,vehicule,True,...,0,0,0,0,0,0,1,0,"\n \n \n \n bonsoir madame, \n \n je vous...","[bonsoir, madame_je, vou, informe, que, la_nou..."


In [25]:
p123

In [50]:
p123.save("pipeline_compo")

In [27]:
p123_reloaded = MelusinePipeline.load("pipeline_compo")
p123_reloaded

In [28]:
df = load_email_data(type="full")
df["text"] = df["body"]
df = p123.transform(df)
print("stupid" in df.columns)
df.iloc[0]["tokens"]

False


['bonjour_je',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

# Change Pipeline Execution Backend

In [29]:
from melusine.backend.active_backend import switch_backend, backend

In [30]:
backend._backend

<melusine.backend.pandas_backend.PandasBackend at 0x7f9223781690>

In [31]:
switch_backend("dict")

Using dict backend for Data transformations


In [32]:
backend._backend

<melusine.backend.dict_backend.DictBackend at 0x7f91f02728d0>

In [33]:
ddd = df.iloc[0].to_dict()

In [34]:
type(m_pipe.transform(ddd))

dict

In [35]:
switch_backend("pandas")

Using pandas backend for Data transformations


# Help users understand the Framework

In [36]:
from melusine.core.melusine_transformer import MelusineTransformer

In [37]:
class NoFilename(MelusineTransformer):
    FILENAME = "wesh"

    def __init__(self, input_columns=("text",), output_columns=("text",)):
        super().__init__(input_columns, output_columns)
        
    def load(self):
        pass
    
    def save(self):
        pass    
        
nofilename = NoFilename()

In [38]:
class NoFunc(MelusineTransformer):
    FILENAME = "nofunc"
    def __init__(self, input_columns=("text",), output_columns=("text",)):
        super().__init__(input_columns, output_columns)
        
    def load(self):
        pass

    def save(self):
        pass    
        
nofunc = NoFunc()

In [39]:
print(nofunc.transform(ddd))

Instance of <class '__main__.NoFunc'> does not have a func attribute
You should either specify a func attribute or define your own transform method


# Regex definition

In [40]:
# === Info importante ===

# Les sauts de lignes sont remplacés par le pattern " ; " à la réception de la requête
# C'est clairement une transformation indésirable aujourd'hui 
# mais elle est encrée dans les codes et il faudrait prendre quelques jours pour modifier ça et étudier les impacts !


# === Start pattern ===
# On cherche un début de ligne ou un ";"
start_pattern = r"""(?:^|;)"""

# === Symboles de début de ligne ===
# Les emails avec des multiples retours à la ligne génèrent des paterns de " ; ; ; ; ; ; "
# Certains messages transférés / réponses ont des symboles en début de lignes (> et/ou |)
# Ex:
# Merci
# > De foo@maif.fr A bar@gmail.com
# > Voici le document
# On ignore tous ces symboles
ignore_characters = """(?:[>| ;]*)"""

# === Keywords de transition ===
# Certains mot-clés présents dans les réponses et emails transférés sont utilisés pour la segmentation
# Ces mot clés sont suivis du symbole ":"
# Ex:
# De : XX A : XX Sujet : Blah Blah
meta_transition_words = """(?:\\b(?:[Ee]nvoy[ée](?: par)?|[Dd]e|[Oo]bjet|[Cc]c|Date|[AÀàa]|[Dd]estinataire|[Ss]ent|[Tt]o|[Ss]ub?jec?t|[Ff]rom|[Cc]opie [àa])\\b\\s{,4}:)"""

# === Méta-données ===
# Les mots-clés sont suivi de champs de textes libre qu'il faut identifier
# On accepte un retour à la ligne à la suite du mot clé ("\s{,4};\s{,4}")
# On limite le champs de texte libre à 150 charactères (car un .* est très couteux en temps de calcul)
# Le champ de texte libre s'arrête lorsqu'on détecte un ";" (retour à la ligne)
meta_content_pattern = r"""(?:\\s{,4};\\s{,4}[^;]{,100}[;|]\\s{,4})"""


# === Meta data pattern ===
meta_pattern = fr"(?:{ignore_characters}{meta_transition_words}{meta_content_pattern})"

# === Full pattern ===
# On cherche un start pattern suivi de répétitions de meta_pattern
regex = fr"""({start_pattern}{meta_pattern}+)"""

In [41]:
print(regex)

((?:^|;)(?:(?:[>| ;]*)(?:\b(?:[Ee]nvoy[ée](?: par)?|[Dd]e|[Oo]bjet|[Cc]c|Date|[AÀàa]|[Dd]estinataire|[Ss]ent|[Tt]o|[Ss]ub?jec?t|[Ff]rom|[Cc]opie [àa])\b\s{,4}:)(?:\\s{,4};\\s{,4}[^;]{,100}[;|]\\s{,4}))+)


In [42]:
print("""(?:(?:^|;)(?:(?:[>| ;]*(?:Envoy[ée]|De|Objet|Cc|Envoy[ée] par|Date|A|À|Destinataire|Sent|To|Subject|Sujet|From|Copie [àa])\s{,4}:\s{,4};?\s{,4}[^;]{,100}[;|]\s{,4}))+)""")

(?:(?:^|;)(?:(?:[>| ;]*(?:Envoy[ée]|De|Objet|Cc|Envoy[ée] par|Date|A|À|Destinataire|Sent|To|Subject|Sujet|From|Copie [àa])\s{,4}:\s{,4};?\s{,4}[^;]{,100}[;|]\s{,4}))+)


In [43]:
print("""| |\n| |\n| |-------- Message transféré --------\n| |\n| |Sujet :\n| | [INTERNET] Dossier F210306856A -\n| | Date :| | Mon, 13 Sep 2021 13:45:39 +0200 | | De :| | gestionsinistre@maif.fr | | Pour :| | anomalies-vol-siv@interieur.gouv.fr | | | |Bonjour, |""")

| |
| |
| |-------- Message transféré --------
| |
| |Sujet :
| | [INTERNET] Dossier F210306856A -
| | Date :| | Mon, 13 Sep 2021 13:45:39 +0200 | | De :| | gestionsinistre@maif.fr | | Pour :| | anomalies-vol-siv@interieur.gouv.fr | | | |Bonjour, |


In [44]:
import re

class RegexEngine:
    def __init__(self, regex_catalog):
        self.regex_catalog = regex_catalog

    @staticmethod
    def match_text(text, regex):
        return bool(re.search(regex, text))
        
    def detect(self, df, detection_key, input_column, output_column):
        
        r_list = self.regex_catalog[detection_key]
        regex = "|".join(r_list)
        
        f = lambda x: self.match_text(x, regex=regex)
        
        df = backend.apply_transform(
            data=df,
            input_columns=(input_column,),
            output_columns=(output_column,),
            func=f,
        )
        
        return df


In [45]:
catalog = {
    "insatisfaction": [
        r"pas content",
        r"furieux"
    ],
    "remerciement": [
        r"merci",
        r"thanks"
    ],    
}

In [46]:
import pandas as pd
df = pd.DataFrame({"text": ["je suis furieux today", "wesh"]})

In [47]:
r = RegexEngine(catalog)

In [48]:
r.detect(df, "insatisfaction", "text", "output")

Unnamed: 0,text,output
0,je suis furieux today,True
1,wesh,False


In [49]:
use("dict")

NameError: name 'use' is not defined

In [None]:
r.detect(df.iloc[0].to_dict(), "insatisfaction", "text", "output")

In [None]:
from abc import ABC, abstractmethod

In [None]:
class Base(ABC):
    
    def __init__(self):
        self.__name__ = type(self).__name__
        
    @staticmethod
    @property
    @abstractmethod
    def a():
        return "YO"

In [None]:
class Child(Base):
    _a = 3
    
    @staticmethod
    @property
    def a():
        return "Wesh"
    
    @classmethod
    def print_cls(cls):
        return cls.a
    
    def print_cls2(self):
        return self.a    

In [None]:
c = Child()

In [None]:
c.a()

In [None]:
Child.print_cls()

In [None]:
c.print_cls()

In [None]:
Child.__name__

In [None]:
c.__name__