# Import

In [None]:
import logging
import sys

import pandas as pd
from melusine.nlp_tools.embedding import Embedding

# Setup logging

In [None]:
# Route root-logger records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Raise the threshold of the "gensim" logger so that only WARNING and above
# records from it are emitted.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

# Load data

In [None]:
# Read the preprocessed emails (semicolon-separated, UTF-8 encoded CSV).
df_emails_clean = pd.read_csv(
    '../tutorial/data/emails_preprocessed.csv',
    sep=';',
    encoding='utf-8',
)

# Artificially inflate the dataset: stack 100 copies of the frame and
# rebuild a fresh 0..n-1 index.
df_emails_clean = pd.concat(
    [df_emails_clean for _ in range(100)],
    ignore_index=True,
)

# Cast the cleaned body column to str.
clean_body_as_str = df_emails_clean['clean_body'].astype(str)
df_emails_clean['clean_body'] = clean_body_as_str

# Metadata preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from melusine.prepare_email.metadata_engineering import MetaExtension
from melusine.prepare_email.metadata_engineering import MetaDate
from melusine.prepare_email.metadata_engineering import MetaAttachmentType
from melusine.prepare_email.metadata_engineering import Dummifier

# Metadata extraction steps, applied in this order: MetaExtension, MetaDate,
# MetaAttachmentType, and finally Dummifier to produce dummified metadata.
metadata_steps = [
    ('MetaExtension', MetaExtension()),
    ('MetaDate', MetaDate()),
    ('MetaAttachmentType', MetaAttachmentType()),
    ('Dummifier', Dummifier()),
]
MetadataPipeline = Pipeline(steps=metadata_steps)

# Fit the pipeline on the emails and keep the transformed metadata.
df_meta = MetadataPipeline.fit_transform(df_emails_clean)

In [None]:
# Model inputs: the cleaned body text placed side by side with the metadata
# columns produced by the metadata pipeline.
body_text = df_emails_clean['clean_body']
X = pd.concat([body_text, df_meta], axis='columns')

# Targets: the 'label' column of the emails.
y = df_emails_clean['label']

# Tokenizer 

In [None]:
from melusine.nlp_tools.tokenizer import WordLevelTokenizer

In [None]:
# Build a WordLevelTokenizer with its default arguments. The same instance is
# used below to create the 'tokens' column and is handed to CnnMelusineModel.
tokenizer = WordLevelTokenizer()

In [None]:
# Tokenize every cleaned body and store the result in a new 'tokens' column.
clean_bodies = df_emails_clean['clean_body']
df_emails_clean['tokens'] = clean_bodies.apply(tokenizer.tokenize)

# Train word embeddings

In [None]:
from gensim.models import Word2Vec

# Word2Vec hyper-parameters.
vector_size = 50  # dimensionality of the learned word vectors
min_count = 2     # words seen fewer than this many times are ignored
epochs = 2        # number of passes over the corpus during training

# Gensim 4.0 renamed the `size` constructor argument to `vector_size`, so
# `Word2Vec(size=...)` raises a TypeError on Gensim >= 4.0. Try the current
# keyword first and fall back to the legacy one so the notebook runs with
# either major version. No corpus is passed to the constructor, so the
# retry is cheap.
try:
    embedding = Word2Vec(
        vector_size=vector_size,
        min_count=min_count,
    )
except TypeError:
    # Gensim 3.x: the dimensionality keyword is still called `size`.
    embedding = Word2Vec(
        size=vector_size,
        min_count=min_count,
    )

# Build the vocabulary from the tokenized emails, then train on the same
# corpus for `epochs` passes.
embedding.build_vocab(df_emails_clean['tokens'])
embedding.train(
    df_emails_clean['tokens'],
    total_examples=embedding.corpus_count,
    epochs=epochs,
)

# Classification using a CnnMelusineModel

In [None]:
from melusine.models.models_v2.cnn_model import CnnMelusineModel

In [None]:
# Metadata inputs handed to the CNN model alongside the text column.
# NOTE(review): the original cell also carried a commented-out
# `meta_input_list=None` alternative, presumably to train without metadata;
# confirm against the CnnMelusineModel API before relying on it.
cnn_meta_inputs = ['extension', 'dayofweek', 'hour', 'min', 'attachment_type']

# CNN classifier reading the 'clean_body' column, reusing the tokenizer and
# the word vectors trained above.
model = CnnMelusineModel(
    pretrained_embedding=embedding.wv,
    tokenizer=tokenizer,
    text_column="clean_body",
    seq_max=128,
    meta_input_list=cnn_meta_inputs,
)

# Melusine Trainer

In [None]:
from melusine.models.models_v2.trainer import MelusineTrainer

In [None]:
# Wrap the CNN model in a trainer configured for 2 epochs with batches of 256.
trainer = MelusineTrainer(
    model,
    epochs=2,
    batch_size=256,
)

In [None]:
# Train the model held by the trainer on the inputs X (clean body text plus
# metadata columns) and the labels y.
trainer.train(X, y)

# Classification using a Transformer model

In [None]:
from melusine.models.models_v2.transformers_model import TransformersMelusineModel

In [None]:
# Metadata inputs handed to the Transformer model alongside the text column.
transformer_meta_inputs = ['extension', 'dayofweek', 'hour', 'min', 'attachment_type']

# Transformer classifier reading the 'clean_body' column, built from the
# "camembert-base" model name/path. This rebinds `model`, replacing the CNN
# model created earlier.
model = TransformersMelusineModel(
    model_name_or_path="camembert-base",
    text_column="clean_body",
    seq_max=128,
    meta_input_list=transformer_meta_inputs,
)

In [None]:
# Wrap the current `model` in a new trainer configured for 2 epochs with
# batches of 256. This rebinds `trainer`.
trainer = MelusineTrainer(
    model,
    epochs=2,
    batch_size=256,
)

In [None]:
# Train the model held by the current trainer on the same inputs X and
# labels y that were built earlier in the notebook.
trainer.train(X, y)