# Text Processor object

In [1]:
from melusine.nlp_tools.normalizer import Normalizer
from melusine.nlp_tools.tokenizer import RegexTokenizer
from melusine.nlp_tools.phraser import Phraser
from melusine.nlp_tools.text_processor import TextProcessor, create_pipeline
from melusine.nlp_tools.text_flagger import DeterministicTextFlagger
from melusine.nlp_tools.token_flagger import FlashtextTokenFlagger
from melusine.nlp_tools.lemmatizer import DummyLemmatizer
from melusine.nlp_tools.pipeline import MelusinePipeline

from melusine import load_email_data

In [2]:
# Individual components of the text processor.
# These short names are reused by later cells, so they are kept as-is.
n = Normalizer(lowercase=True, form="NFKD")
t = RegexTokenizer(
    stopwords=["le", "les"],
    tokenizer_regex=r"\w+(?:[\?\-\"_]\w+)*",
)
# Text-level flag: the regex matches any run of 10 digits.
textf = DeterministicTextFlagger(text_flags={r"\d{10}": "flag_phone"})
# Token-level flag: the listed names map to the "flag_name" token.
tokenf = FlashtextTokenFlagger(token_flags={"flag_name": ["joe", "bob"]})
d = DummyLemmatizer()

# Bundle every component into a single processor object.
pp = TextProcessor(
    normalizer=n,
    tokenizer=t,
    text_flagger=textf,
    token_flagger=tokenf,
    lemmatizer=d,
)

In [3]:
pp.process("Appelle bob, sont numéro est le 0611111111 ! Il a les billets")

['appelle',
 'flag_name',
 'sont',
 'numero',
 'est',
 'flag_phone',
 'il',
 'a',
 'billet']

In [4]:
# pp.save("my_text_processor")

In [5]:
# ppp = TextProcessor.load("my_text_processor")

In [6]:
# ppp.process("les écureuils sont présents !!!")

# Melusine Pipeline

## Add a custom scikit-learn transformer to the pipeline

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class Stupid(BaseEstimator, TransformerMixin):
    """Minimal custom transformer that adds a constant ``stupid`` column.

    It exists only to demonstrate that a user-defined scikit-learn
    transformer can be inserted as a step of the pipeline.
    """

    def fit(self, df, y=None):
        """Do nothing: this transformer has no state to learn.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data (unused).
        y : optional
            Ignored; present for scikit-learn API compatibility.

        Returns
        -------
        Stupid
            The transformer itself, so calls can be chained.
        """
        return self

    def transform(self, df):
        """Return a copy of ``df`` with a ``stupid`` column set to ``True``.

        The column is added to a new DataFrame instead of being written into
        the input, so the caller's frame is not modified as a side effect.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data.

        Returns
        -------
        pandas.DataFrame
            New DataFrame holding every input column plus ``stupid``.
        """
        # `assign` builds a new frame; an existing "stupid" column is overwritten.
        return df.assign(stupid=True)

In [6]:
# Extra text flagger (regex "je" -> "JE"), the custom transformer, and a phraser.
textf2 = DeterministicTextFlagger(text_flags={r"je": "JE"})
stupid = Stupid()
gensim_phraser = Phraser(min_count=2, threshold=2)

# Steps are given as (name, transformer) pairs, in the order listed here.
m_pipe = MelusinePipeline(
    [
        ("normalizer", n),
        ("text_flagger", textf),
        ("text_flagger2", textf2),
        ("tokenizer", t),
        ("lemmatizer", d),
        ("stupid", stupid),
        ("gensim_phraser", gensim_phraser),
        ("token_flagger", tokenf),
    ]
)

In [10]:
# Load the sample emails and copy the body into the "text" column
# (presumably the column the pipeline steps read -- TODO confirm).
df = load_email_data(type="full")
df["text"] = df["body"]

# Fit the pipeline on the emails and apply it in one go.
df = m_pipe.fit_transform(df)

# Check that the custom step added its column, then show the first email's tokens.
print("stupid" in df.columns)
df["tokens"].iloc[0]

True


['bonjour_JE',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

In [7]:
m_pipe.save("my_pipeline")

In [12]:
m_pipe_reloaded = MelusinePipeline.load("my_pipeline")

{'memory': None, 'verbose': None}


In [13]:
m_pipe_reloaded

MelusinePipeline(steps=[('normalizer', Normalizer()),
                        ('text_flagger',
                         DeterministicTextFlagger(text_flags={'\\d{10}': 'flag_phone'})),
                        ('text_flagger2',
                         DeterministicTextFlagger(text_flags={'je': 'JE'})),
                        ('tokenizer', RegexTokenizer(stopwords={'le', 'les'})),
                        ('lemmatizer', DummyLemmatizer()), ('stupid', Stupid()),
                        ('gensim_phraser', Phraser()),
                        ('token_flagger',
                         FlashtextTokenFlagger(flashtext_separators=['-', '_',
                                                                     '/'],
                                               token_flags={'flag_name': ['joe',
                                                                          'bob']}))])

In [14]:
# Start again from freshly loaded emails so the reloaded pipeline is
# exercised on untouched data.
df = load_email_data(type="full")
df["text"] = df["body"]

# The reloaded pipeline is only applied here, not refitted.
df = m_pipe_reloaded.transform(df)

# Check that the custom step added its column, then show the first email's tokens.
print("stupid" in df.columns)
df["tokens"].iloc[0]

True


['bonjour_JE',
 'sui',
 'client',
 'chez',
 'vou',
 'pouvez',
 'vou',
 'm',
 'etablir',
 'un_devi',
 'pour',
 'mon',
 'fil',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye_de',
 'mon_iphone']

In [19]:
from sklearn import set_config

set_config(display='diagram')
m_pipe_reloaded

# Simplified Pipeline creation

In [14]:
# Build a pipeline directly from keyword arguments instead of
# instantiating each component by hand.
easy_pipe = create_pipeline(
    form="NFKD",
    lowercase=True,
    tokenizer_regex=r"\w+(?:[\?\-\"_]\w+)*",
    stopwords=["le", "les"],
    text_flags={r"\d{10}": "flag_phone"},
    token_flags={"flag_name": ["joe", "bob"]},
)

In [15]:
easy_pipe

MelusinePipeline(steps=[('normalizer', Normalizer()),
                        ('text_flagger',
                         DeterministicTextFlagger(text_flags={'\\d{10}': 'flag_phone'})),
                        ('tokenizer', RegexTokenizer(stopwords={'le', 'les'})),
                        ('token_flagger',
                         FlashtextTokenFlagger(token_flags={'flag_name': ['joe',
                                                                          'bob']}))])

In [16]:
# Reload the sample emails and copy the body into the "text" column.
df = load_email_data(type="full")
df["text"] = df["body"]

# Apply the simplified pipeline and show the first email's tokens.
df = easy_pipe.transform(df)
df["tokens"].iloc[0]

['bonjour',
 'je',
 'suis',
 'client',
 'chez',
 'vous',
 'pouvez',
 'vous',
 'm',
 'etablir',
 'un',
 'devis',
 'pour',
 'mon',
 'fils',
 'qui',
 'souhaite',
 'louer',
 'lappartement',
 'suivant',
 '25',
 'rue',
 'du',
 'rueimaginaire',
 '77000',
 'merci',
 'envoye',
 'de',
 'mon',
 'iphone']

In [17]:
easy_pipe.save("my_easy_pipeline")

In [18]:
easy_pipe_reloaded = MelusinePipeline.load("my_easy_pipeline")

{'memory': False, 'verbose': False}


In [9]:
# Reload the sample emails and copy the body into the "text" column.
df = load_email_data(type="full")
df["text"] = df["body"]

# Requires `easy_pipe_reloaded` from the cell that loads "my_easy_pipeline".
df = easy_pipe_reloaded.transform(df)
df["tokens"].iloc[0]

NameError: name 'easy_pipe_reloaded' is not defined

In [21]:
x = 1
f"{3:0{len(str(x))}d}"

'3'

In [11]:
def lowercase(text):
    """Return ``text`` converted to lowercase through its ``lower()`` method."""
    lowered = text.lower()
    return lowered

In [13]:
%%timeit
df["body"].apply(lowercase)

264 µs ± 20.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit
df["body"].str.lower()

234 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
from triton_api import predict_triton

def MelusineModel(BaseMelusineClass):
    
    def server_inference(X):
        predict_triton(X)
        
    def predict
        