# Import

In [None]:
import logging
import sys

import pandas as pd
from melusine.nlp_tools.embedding import Embedding

In [None]:
from melusine.models.models_v2.transformers_model import TransformerMelusineModel
from melusine.models.models_v2.cnn_model import CnnMelusineModel
from melusine.models.models_v2.transformers_model import TransformerMelusineModel
from melusine.models.models_v2.trainer import MelusineTrainer

# Setup logging

In [None]:
# Route log records to stdout at INFO level so they appear in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Only let WARNING and above through for the "gensim" logger.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

# Load data

In [None]:
# Read the preprocessed emails (UTF-8 file, ';' as column separator).
df_emails_clean = pd.read_csv(
    "../tutorial/data/emails_preprocessed.csv",
    sep=";",
    encoding="utf-8",
)

# Artificially increase df size by duplication: stack 100 copies of the
# frame and rebuild a clean 0..n-1 index.
n_copies = 100
df_emails_clean = pd.concat([df_emails_clean] * n_copies, ignore_index=True)

# Cast the body column to str so every row holds a string.
df_emails_clean["clean_body"] = df_emails_clean["clean_body"].astype(str)

# Metadata preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from melusine.prepare_email.metadata_engineering import MetaExtension
from melusine.prepare_email.metadata_engineering import MetaDate
from melusine.prepare_email.metadata_engineering import MetaAttachmentType
from melusine.prepare_email.metadata_engineering import Dummifier

# Pipeline to extract dummified metadata: the three Meta* transformers run
# first, then the Dummifier is applied as the last step.
metadata_steps = [
    ("MetaExtension", MetaExtension()),
    ("MetaDate", MetaDate()),
    ("MetaAttachmentType", MetaAttachmentType()),
    ("Dummifier", Dummifier()),
]
MetadataPipeline = Pipeline(steps=metadata_steps)

# Fit the pipeline on the emails and keep the transformed metadata.
df_meta = MetadataPipeline.fit_transform(df_emails_clean)

In [None]:
# Features: the cleaned body text placed side by side with the metadata columns.
feature_parts = [df_emails_clean["clean_body"], df_meta]
X = pd.concat(feature_parts, axis=1)

# Target: the "label" column of the emails.
y = df_emails_clean["label"]

# Tokenizer 

In [None]:
from melusine.nlp_tools.tokenizer import WordLevelTokenizer

In [None]:
# Tokenizer built with its default arguments; it is used below to tokenize
# the email bodies and is also handed to the Melusine models.
tokenizer = WordLevelTokenizer()

In [None]:
# Tokenize every cleaned body and store the result in a new "tokens" column.
body_texts = df_emails_clean["clean_body"]
df_emails_clean["tokens"] = body_texts.apply(tokenizer.tokenize)

# Train word embeddings

In [None]:
from gensim.models import Word2Vec

# Word2Vec hyper-parameters
vector_size = 50  # passed to Word2Vec as `size`
min_count = 2  # passed to Word2Vec as `min_count`
epochs = 2  # number of training epochs

# Tokenized emails used both to build the vocabulary and to train the vectors.
token_corpus = df_emails_clean["tokens"]

embedding = Word2Vec(size=vector_size, min_count=min_count)

# The vocabulary is built first; `corpus_count` is read from the model
# afterwards and passed to `train` as the number of examples.
embedding.build_vocab(token_corpus)
embedding.train(
    token_corpus,
    epochs=epochs,
    total_examples=embedding.corpus_count,
)

# Classification using a CnnMelusineModel and a custom network architecture

In [None]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import GlobalMaxPooling1D

In [None]:
# Define a custom CNN architecture
def custom_cnn_archi(input_net):
    """Apply the custom convolutional stack to ``input_net``.

    The stack is: Conv1D (200 filters, kernel size 2, "same" padding, linear
    activation) -> SpatialDropout1D(0.15) -> BatchNormalization ->
    LeakyReLU(alpha=0.05) -> GlobalMaxPooling1D.

    Args:
        input_net: Keras tensor fed to the convolution layer.

    Returns:
        The Keras tensor produced by the global max pooling layer.
    """
    conv_out = Conv1D(
        filters=200,
        kernel_size=2,
        strides=1,
        padding="same",
        activation="linear",
    )(input_net)
    dropped = SpatialDropout1D(rate=0.15)(conv_out)
    normalized = BatchNormalization()(dropped)
    # The non-linearity is applied as a separate layer after normalization.
    activated = LeakyReLU(alpha=0.05)(normalized)
    return GlobalMaxPooling1D()(activated)

In [None]:
# Define a custom meta architecture
def custom_meta_archi(nb_meta):
    """Build the metadata branch: its input layer and its hidden representation.

    Args:
        nb_meta: Size of the metadata input vector (used as the input shape).

    Returns:
        A ``(meta_input, meta_net)`` tuple: the float32 Keras ``Input`` and the
        tensor obtained after Dense(150, linear) -> Dropout(0.2) ->
        LeakyReLU(alpha=0.05).
    """
    meta_input = Input(shape=(nb_meta,), dtype="float32")

    hidden = Dense(units=150, activation="linear")(meta_input)
    hidden = Dropout(rate=0.2)(hidden)
    meta_features = LeakyReLU(alpha=0.05)(hidden)

    return meta_input, meta_features

In [None]:
# Define a custom dense architecture
def custom_dense_archi(input_net):
    """Apply the custom fully connected stack to ``input_net``.

    The stack is: Dense(200, linear) -> Dropout(0.2) -> LeakyReLU(alpha=0.05).

    Args:
        input_net: Keras tensor fed to the dense layer.

    Returns:
        The Keras tensor produced by the LeakyReLU layer.
    """
    projected = Dense(units=200, activation="linear")(input_net)
    regularized = Dropout(rate=0.2)(projected)
    return LeakyReLU(alpha=0.05)(regularized)

In [None]:
# Define an output layer
def custom_output_layer(input_net, n_targets):
    """Build the output (classification) layer of the network.

    Args:
        input_net: Keras tensor produced by the preceding layers.
        n_targets: Number of units of the output layer.

    Returns:
        The Keras tensor produced by a Dense layer with ``n_targets`` units
        and a softmax activation.
    """
    # Softmax instead of a linear activation, so that this head is consistent
    # with the output layer of MyCustomMelusineModel.create_network in this
    # notebook, which is trained by the same MelusineTrainer.
    # NOTE(review): assumes the training loss expects class probabilities
    # (not raw logits) -- confirm against the CnnMelusineModel compile settings.
    return Dense(n_targets, activation="softmax")(input_net)

In [None]:
# Metadata columns handed to the model as `meta_input_list`.
meta_columns = ["extension", "dayofweek", "hour", "min", "attachment_type"]

# CNN model wired with the custom architecture functions defined above and
# the word vectors (`embedding.wv`) of the Word2Vec model.
model = CnnMelusineModel(
    text_column="clean_body",
    tokenizer=tokenizer,
    seq_max=128,
    pretrained_embedding=embedding.wv,
    meta_input_list=meta_columns,
    cnn_archi=custom_cnn_archi,
    meta_archi=custom_meta_archi,
    dense_archi=custom_dense_archi,
    output_archi=custom_output_layer,
)

# Melusine Trainer

In [None]:
# Wrap the model in a MelusineTrainer configured with 2 epochs and a batch size of 256.
trainer = MelusineTrainer(model, epochs=2, batch_size=256)

In [None]:
# Train on the features X (body text + metadata) and the labels y.
trainer.train(X, y)

In [None]:
# Show the summary of the object stored in `model.model`
# (presumably the underlying Keras network -- confirm in CnnMelusineModel).
model.model.summary()

# Classification using a custom MelusineModel class

The methodology presented above makes the CnnMelusineModel very flexible in terms of architecture.  
If this is not enough for you, MelusineModel classes are designed to be easily customized by inheritance.  

You just need to define a custom class that inherits from a MelusineModel class 
(BaseMelusineModel, CnnMelusineModel or TransformerMelusineModel).  
Then you can override:  
* the create_network method to define a custom network
* the fit method to define a custom data preparation methodology

In the example below, a custom class is defined to implement an RNN model.  
The class is then simply fed to the MelusineTrainer for model training.

In [None]:
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Model


class MyCustomMelusineModel(CnnMelusineModel):
    """CnnMelusineModel subclass whose network is a stacked bidirectional GRU.

    Only ``create_network`` is overridden; everything else is inherited from
    CnnMelusineModel. The network built here has a single (text) input.
    """

    def __init__(
        self,
        text_column,
        tokenizer,
        seq_max,
        pretrained_embedding,
        **kwargs
    ):
        """Forward every argument unchanged to the CnnMelusineModel constructor."""
        super().__init__(
            text_column=text_column,
            tokenizer=tokenizer,
            seq_max=seq_max,
            pretrained_embedding=pretrained_embedding,
            **kwargs
        )

    def create_network(self) -> None:
        """
        Create the neural network using Keras and store it in ``self.model``.

        Reads ``self.seq_max``, ``self.pretrained_embedding``,
        ``self.trainable`` and ``self.n_targets`` (the last two are not set in
        this class; presumably provided by the parent class -- confirm).
        """
        # Text input: integer sequences of length seq_max
        token_ids = Input(shape=(self.seq_max,), dtype="int32")

        # Embedding layer obtained from the pretrained embedding
        embedding_layer = self.pretrained_embedding.get_keras_embedding(
            train_embeddings=self.trainable
        )
        embedded = embedding_layer(token_ids)

        # RNN layers: two bidirectional GRUs, each followed by spatial dropout,
        # then pooled over the sequence dimension
        recurrent = Bidirectional(GRU(80, return_sequences=True))(embedded)
        recurrent = SpatialDropout1D(0.15)(recurrent)
        recurrent = Bidirectional(GRU(40, return_sequences=True))(recurrent)
        recurrent = SpatialDropout1D(0.15)(recurrent)
        pooled = GlobalMaxPooling1D()(recurrent)

        # Dense layers with LeakyReLU applied as a separate layer
        hidden = Dense(250, activation="linear")(pooled)
        hidden = LeakyReLU(alpha=0.05)(hidden)
        hidden = Dense(150, activation="linear")(hidden)
        hidden = Dropout(0.15)(hidden)
        hidden = LeakyReLU(alpha=0.05)(hidden)

        # Output layer: softmax over n_targets units
        probabilities = Dense(self.n_targets, activation="softmax")(hidden)

        # Build model (the text input is the only input)
        self.model = Model(inputs=[token_ids], outputs=probabilities)


In [None]:
# Instantiate the custom RNN model on the cleaned body text, reusing the
# tokenizer and the Word2Vec word vectors defined earlier.
model = MyCustomMelusineModel(
    text_column="clean_body",
    tokenizer=tokenizer,
    seq_max=128,
    pretrained_embedding=embedding.wv,
)

In [None]:
# New trainer for the custom model, again with 2 epochs and a batch size of 256.
trainer = MelusineTrainer(model, epochs=2, batch_size=256)

In [None]:
# Train the custom model on the same features X and labels y.
trainer.train(X, y)