
Commit

Merge pull request #8 from BenoitLebreton/tokenizer
Accept: replace by our own Tokenizer
sachasamama committed Jun 11, 2019
2 parents 503aff8 + b2dfbb7 commit 5aef139
Showing 8 changed files with 95 additions and 55 deletions.
6 changes: 4 additions & 2 deletions melusine/config/conf.json
@@ -18,7 +18,7 @@
"aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait",
"avions", "aviez", "avaient", "eut", "e没mes", "e没tes", "eurent", "aie",
"aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "e没t",
"eussions", "eussiez", "eussent"],
"eussions", "eussiez", "eussent", "nan"],
"names": ["abel", "absolon", "achille", "adam", "adelaide", "adele", "adeline",
"adolphe", "adrien", "adrienne", "agathe", "agnes", "aime", "aimee", "alain",
"albert", "albertine", "alexandre", "alexandrie", "alexis", "alfred", "alice",
@@ -107,6 +107,8 @@

"regex": {

"tokenizer" : "\\w+(?:[\\?\\-\"_]\\w+)*",

"manage_transfer_reply" : {

"begin_transfer" : "^[;\\s]*[-\\s]*Transf茅r茅 par",
@@ -143,7 +145,7 @@
"Exp茅diteur.+?Objet\\s*:.+?[;|\n]",
"(?:>?[;|\n]?\\s*(?:Envoy茅|De|脌|Objet|Cc|Envoy茅 par|Date|A|Destinataire|Sent|To|Subject|From|Copie 脿)+?\\s*:\\s*(?:.*?)\\s*[;|\n]\\s*)+",
"En date de.+?茅crit",
">?\\s*Le[^;\n]+?[;|\n]{0,1}[^;\n]+?a[^;\n]+?;{0,1}[^;\n]+?茅crit\\s*:?",
">?\\s*Le[^;\n]{0,30}[;|\n]{0,1}[^;\n]{0,30}a[^;\n]{0,30};{0,1}[^;\n]{0,30}茅crit\\s*:?",
">?\\s*Message de.+?Objet\\s*:.+?[;|\n]",
">?\\s*Message du.+?Objet\\s*:.+?[;|\n]",
"En date de.+?茅crit"
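
As an aside (not part of the commit), the new "tokenizer" entry under "regex" is the pattern that melusine/nlp_tools/tokenizer.py now loads from config. A minimal sketch of what it extracts, using a made-up sentence:

import re

# The pattern from conf.json, as Python sees it after JSON decoding.
TOKENIZER_PATTERN = r'\w+(?:[\?\-"_]\w+)*'

# Made-up example: hyphenated words and underscore-joined flags stay single tokens.
text = 'Bonjour, merci de votre retour sur le porte-monnaie flag_name_'
print(re.findall(TOKENIZER_PATTERN, text, re.M | re.DOTALL))
# ['Bonjour', 'merci', 'de', 'votre', 'retour', 'sur', 'le', 'porte-monnaie', 'flag_name_']
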
2 changes: 0 additions & 2 deletions melusine/config/config.py
@@ -79,12 +79,10 @@ def get_config_file(self):
Load the list of names from the names.csv file.
"""
path = self.config['PATH']['template_config']

if path == self.path_default_conf_json_:
config_file = self.load_config_file(path=None)
else:
config_file = self.load_config_file(path=path)

name_file_path = self.config['PATH']['default_name_file']

if name_file_path == self.path_default_names_csv_:
87 changes: 55 additions & 32 deletions melusine/models/train.py
@@ -1,9 +1,9 @@
from melusine.nlp_tools.tokenizer import Tokenizer
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.utils import np_utils
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam

@@ -76,22 +76,25 @@ class NeuralModel(BaseEstimator, ClassifierMixin):
"""

def __init__(self,
pretrained_embedding=None,
architecture_function=None,
pretrained_embedding=None,
text_input_column='clean_text',
meta_input_list=['extension', 'dayofweek', 'hour', 'min'],
vocab_size=25000,
seq_size=100,
embedding_dim=200,
loss='categorical_crossentropy',
batch_size=4096,
n_epochs=15,
**kwargs):
self.architecture_function = architecture_function
self.pretrained_embedding = pretrained_embedding
self.tokenizer = Tokenizer(input_column = text_input_column)
self.text_input_column = text_input_column
self.meta_input_list = meta_input_list
self.vocab_size = vocab_size
self.seq_size = seq_size
self.embedding_dim = embedding_dim
self.loss = loss
self.batch_size = batch_size
self.n_epochs = n_epochs
@@ -146,10 +149,13 @@ def fit(self, X, y, **kwargs):
self : object
Returns the instance
"""
self._fit_tokenizer(X)
self._create_word_indexes_from_tokens()
self._get_embedding_matrix()

X = self.tokenizer.transform(X)
if self.pretrained_embedding:
self._get_embedding_matrix()
else:
self._create_vocabulary_from_tokens(X)
self._generate_random_embedding_matrix()
X_seq = self._prepare_sequences(X)
X_meta, nb_meta_features = self._get_meta(X)
y_categorical = np_utils.to_categorical(y)
@@ -200,6 +206,7 @@ def predict_proba(self, X, **kwargs):
-------
np.array
"""
X = self.tokenizer.transform(X)
X_seq = self._prepare_sequences(X)
X_meta, nb_meta_features = self._get_meta(X)
if nb_meta_features == 0:
@@ -208,46 +215,62 @@ def predict_proba(self, X, **kwargs):
X_input = [X_seq, X_meta]
return self.model.predict(X_input, **kwargs)

def _fit_tokenizer(self, X):
"""Fit a Tokenizer instance from Keras on a clean body."""
self.tokenizer = Tokenizer(num_words=self.vocab_size,
oov_token='UNK')
self.tokenizer.fit_on_texts(X[self.text_input_column])
pass

def _create_word_indexes_from_tokens(self):
def _create_vocabulary_from_tokens(self, X):
"""Create a word indexes dictionary from tokens."""
c = Counter(self.tokenizer.word_counts)
self.tokenizer.word_index = {t[0]: i + 1 for i, t
in enumerate(c.most_common(len(c)))}
self.tokenizer.word_index['UNK'] = 0
tokens = X['tokens']
c = Counter([token for text in X.tokens for token in text])
self.vocabulary = [t[0] for t in c.most_common(self.vocab_size)]
pass

def _get_embedding_matrix(self):
"""Prepares the embedding matrix to be used as an input for
the neural network model."""
the neural network model.
The vocabulary of the NN is that of the pretrained embedding.
"""
pretrained_embedding = self.pretrained_embedding
vocab_size = self.vocab_size
self.vocabulary = pretrained_embedding.embedding.wv.index2word
vocab_size = len(self.vocabulary)
vector_dim = pretrained_embedding.embedding.vector_size
wv_dict = {word: vec for word, vec in
zip(pretrained_embedding.embedding.wv.index2word,
pretrained_embedding.embedding.wv.syn0)}
embedding_matrix = np.zeros((vocab_size+1, vector_dim))
for word, index in self.tokenizer.word_index.items():
if index >= vocab_size:
continue
embedding_vector = wv_dict.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
embedding_matrix = np.zeros((vocab_size + 2, vector_dim))
for index, word in enumerate(self.vocabulary):
if word not in ['PAD', 'UNK']:
embedding_matrix[index + 2, :] = pretrained_embedding.embedding.wv.get_vector(word)
embedding_matrix[1, :] = np.mean(embedding_matrix, axis=0)

self.vocabulary.insert(0, 'PAD')
self.vocabulary.insert(1, 'UNK')
self.embedding_matrix = embedding_matrix
pass

def _generate_random_embedding_matrix(self):
"""Prepares the embedding matrix to be used as an input for
the neural network model.
Used when no pretrained embedding is provided: the vocabulary built from the training tokens gets a randomly initialised matrix.
"""
vocab_size = len(self.vocabulary)
vector_dim = self.embedding_dim
embedding_matrix = np.random.uniform(low=-1, high=1, size=(vocab_size + 2, vector_dim))
embedding_matrix[0:2, :] = np.zeros((2, vector_dim))
self.vocabulary.insert(0, 'PAD')
self.vocabulary.insert(1, 'UNK')
self.embedding_matrix = embedding_matrix
pass

def tokens_to_indices(self, tokens):
"""
Input : list of tokens ["ma", "carte_verte", ...]
Output : list of indices [46, 359, ...]
"""
return [self.vocabulary.index(t) if t in self.vocabulary else 1 for t in tokens]

def _prepare_sequences(self, X):
"""Prepares the sequence to be used as input for the neural network
model."""
seqs = self.tokenizer.texts_to_sequences(X[self.text_input_column])
model.
The input column must contain already tokenized text (tokens).
The tokens must have been obtained with the same tokenizer as the
one used for the pre-trained embedding."""
seqs = X['tokens'].apply(self.tokens_to_indices)
X_seq = pad_sequences(seqs, maxlen=self.seq_size)

return X_seq

def _get_meta(self, X):
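
A minimal standalone sketch (not part of the commit) of the indexing convention the refactored NeuralModel relies on: index 0 is reserved for PAD and index 1 for UNK, so real tokens start at index 2 once those two entries are inserted at the head of the vocabulary. Padding and the keras call are approximated here, and the corpus is made up.

from collections import Counter

def build_vocabulary(token_lists, vocab_size):
    """Mirror of _create_vocabulary_from_tokens: keep the most frequent tokens."""
    counts = Counter(tok for tokens in token_lists for tok in tokens)
    return [tok for tok, _ in counts.most_common(vocab_size)]

def tokens_to_indices(tokens, vocabulary):
    """Mirror of NeuralModel.tokens_to_indices: unknown tokens map to 1 (UNK)."""
    return [vocabulary.index(t) if t in vocabulary else 1 for t in tokens]

def pad_left(indices, seq_size):
    """Rough stand-in for keras pad_sequences: pre-pad with 0 (PAD), keep the tail."""
    indices = indices[-seq_size:]
    return [0] * (seq_size - len(indices)) + indices

corpus = [["ma", "carte_verte", "est", "perdue"], ["ma", "carte_verte"]]  # made up
vocabulary = build_vocabulary(corpus, vocab_size=25000)
vocabulary.insert(0, "PAD")   # row 0 of the embedding matrix
vocabulary.insert(1, "UNK")   # row 1 of the embedding matrix

print(tokens_to_indices(["ma", "carte_verte", "inconnu"], vocabulary))  # [2, 3, 1]
print(pad_left([2, 3, 1], seq_size=6))                                  # [0, 0, 0, 2, 3, 1]
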
14 changes: 10 additions & 4 deletions melusine/nlp_tools/embedding.py
@@ -38,15 +38,17 @@ def __init__(self,
iter=15,
size=300,
window=5,
min_count=100):
min_count=100,
stop_removal = True):
self.logger = logging.getLogger('NLUtils.Embedding')
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
self.logger.addHandler(ch)
self.logger.debug('Create an Embedding instance.')
self.input_column = input_column
self.streamer = Streamer(column=self.input_column)
self.stop_removal = stop_removal
self.streamer = Streamer(column=self.input_column, stop_removal=stop_removal)
self.workers = workers
self.seed = seed
self.iter = iter
@@ -78,12 +80,16 @@ def train(self, X):
"""
self.logger.info('Start training for embedding')
self.streamer.to_stream(X)
self.embedding = Word2Vec(self.streamer.stream,
workers=self.workers,
self.embedding = Word2Vec(workers=self.workers,
seed=self.seed,
iter=self.iter,
size=self.size,
window=self.window,
min_count=self.min_count)
self.embedding.build_vocab(self.streamer.stream)
self.embedding.train(self.streamer.stream,
total_examples=self.embedding.corpus_count,
epochs=self.embedding.epochs)

self.logger.info('Done.')
pass
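
For context (not part of the commit), training now follows gensim's explicit three-step flow instead of passing the corpus to the Word2Vec constructor. A toy sketch using the gensim 3.x keyword names that appear above (iter and size were renamed in gensim 4.x); the stream is made up:

from gensim.models import Word2Vec

# Made-up stream of tokenized sentences; the real code iterates Streamer.stream.
stream = [["bonjour", "madame", "votre", "contrat"],
          ["bonjour", "monsieur", "votre", "carte_verte"]]

model = Word2Vec(workers=1, seed=42, iter=5, size=50, window=5, min_count=1)
model.build_vocab(stream)                      # build the vocabulary first
model.train(stream,                            # then train explicitly
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Results on such a tiny corpus are meaningless; this only shows the call sequence.
print(model.wv.most_similar("bonjour", topn=2))
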
2 changes: 1 addition & 1 deletion melusine/nlp_tools/phraser.py
@@ -200,7 +200,7 @@ def __init__(self,
self.threshold = threshold
self.min_count = min_count
self.input_column = input_column
self.streamer = Streamer(column=self.input_column)
self.streamer = Streamer(column=self.input_column, stop_removal=False)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
23 changes: 14 additions & 9 deletions melusine/nlp_tools/tokenizer.py
@@ -1,14 +1,14 @@
import logging
import nltk
import re
from sklearn.base import BaseEstimator, TransformerMixin
from melusine.config.config import ConfigJsonReader

conf_reader = ConfigJsonReader()
config = conf_reader.get_config_file()

stopwords = config["words_list"]["stopwords"] + config["words_list"]["names"]

regex_tokenize = "\w+(?:[\?\-\"_]\w+)*"
stopwords = config["words_list"]["stopwords"]
names_list = config["words_list"]["names"]
regex_tokenize = config["regex"]["tokenizer"]


class Tokenizer(BaseEstimator, TransformerMixin):
@@ -53,8 +53,9 @@ def __init__(self,
stop_removal=True,
n_jobs=20):
self.input_column = input_column
self.stopwords = stopwords
self.stopwords = set(stopwords)
self.stop_removal = stop_removal
self.names_list = set(names_list)
self.n_jobs = n_jobs
self.logger = logging.getLogger('emails_application.preprocessing.Preprocessing')
self.logger.debug('creating an instance of Preprocessing')
@@ -96,7 +97,7 @@ def transform(self, X):
"""
self.logger.debug('Start transform tokenizing')
X['tokens'] = X[[self.input_column]].apply(self.tokenize, axis=1)
X['tokens'] = X['tokens'].apply(lambda x: x[0])
X['tokens'] = X.apply(lambda x: x['tokens'][0], axis=1)
self.logger.info('X shape : %s' % str(X.shape))
self.logger.debug('Done.')
return X
@@ -115,17 +116,21 @@ def tokenize(self, row):
"""
text = row[self.input_column]
tokens = self._tokenize(text)
tokens = self._remove_stopwords(tokens)
return [tokens]

def _tokenize(self, text, pattern=regex_tokenize):
"""Returns list of tokens from text."""
return nltk.tokenize.regexp_tokenize(str(text), pattern=pattern)
if isinstance(text, str):
tokens = re.findall("\w+(?:[\?\-\"_]\w+)*", text, re.M+re.DOTALL)
tokens = self._remove_stopwords(tokens)
else:
tokens = []
return tokens

def _remove_stopwords(self, list):
""" Removes stopwords from list if stop_removal parameter
set to True."""
if self.stop_removal:
return [x for x in list if x not in self.stopwords]
return [tok if tok not in self.names_list else "flag_name_" for tok in list if tok not in self.stopwords]
else:
return list
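
A small standalone sketch (not part of the commit) of the new behaviour of _remove_stopwords: stopwords are dropped and any token found in the names list is replaced by the "flag_name_" placeholder. The word lists and sentence below are made up; the real lists come from conf.json.

import re

stopwords = {"le", "la", "de"}      # made-up excerpt of config["words_list"]["stopwords"]
names_list = {"alice", "adrien"}    # made-up excerpt of config["words_list"]["names"]

def tokenize(text):
    """Same pattern as Tokenizer._tokenize."""
    return re.findall(r'\w+(?:[\?\-"_]\w+)*', text, re.M | re.DOTALL)

def remove_stopwords(tokens, stop_removal=True):
    """Mirror of Tokenizer._remove_stopwords: drop stopwords, flag names."""
    if stop_removal:
        return [tok if tok not in names_list else "flag_name_"
                for tok in tokens if tok not in stopwords]
    return tokens

print(remove_stopwords(tokenize("alice a signale la perte de sa carte")))
# ['flag_name_', 'a', 'signale', 'perte', 'sa', 'carte']
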
3 changes: 3 additions & 0 deletions melusine/prepare_email/mail_segmenting.py
@@ -93,6 +93,9 @@ def structure_email(row):
structured_body = []
for message in row['structured_historic']:
structured_message = structure_message(message)
if len(structured_message["structured_text"]["text"]) == 0:
if structured_message["structured_text"]["header"] is None:
continue
structured_body.append(structured_message)

return structured_body
13 changes: 8 additions & 5 deletions melusine/utils/streamer.py
@@ -1,7 +1,9 @@
import nltk
from melusine.prepare_email.mail_segmenting import split_message_to_sentences
from melusine.utils.multiprocessing import apply_by_multiprocessing
from melusine.nlp_tools.tokenizer import Tokenizer
from melusine.config.config import ConfigJsonReader

conf_reader = ConfigJsonReader()

class Streamer():
"""Class to transform pd.Series into stream.
@@ -25,9 +27,12 @@ class Streamer():
"""

def __init__(self, column='clean_body', n_jobs=2):
def __init__(self, stop_removal=False, column='clean_body', n_jobs=1):
self.column_ = column
self.n_jobs = n_jobs
config = conf_reader.get_config_file()
stopwords = config["words_list"]["stopwords"] + config["words_list"]["names"]
self.tokenizer = Tokenizer(stopwords, stop_removal=stop_removal)

def to_stream(self, X):
"""Build a MailIterator object containing a stream of tokens from
@@ -62,7 +67,6 @@ def to_flattoks(self, X):
"""
tokenized_sentences_list = apply_by_multiprocessing(df=X[[self.column_]],
func=lambda x: self.to_list_of_tokenized_sentences(x[self.column_]),
#func=self.to_list_of_tokenized_sentences,
args=None,
workers=self.n_jobs,
progress_bar=False
@@ -85,8 +89,7 @@ def to_list_of_tokenized_sentences(self, text):
"""
#text = row[self.column_]
sentences_list = split_message_to_sentences(text)
tokenized_sentences_list = [nltk.regexp_tokenize(sentence,
pattern="\w+(?:[\?\-\'\"_]\w+)*")
tokenized_sentences_list = [self.tokenizer._tokenize(sentence)
for sentence in sentences_list
if sentence != ""]
return tokenized_sentences_list
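
A rough sketch (not part of the commit) of the sentence-level flow that to_list_of_tokenized_sentences now follows: split a message into sentences, then tokenize each one with the shared Tokenizer pattern rather than a local nltk.regexp_tokenize call. The sentence splitter and message below are simplified stand-ins.

import re

def split_message_to_sentences(text):
    """Simplified stand-in for melusine.prepare_email.mail_segmenting."""
    return [s.strip() for s in re.split(r"[.!?;\n]+", text) if s.strip()]

def tokenize_sentence(sentence):
    """Same pattern as Tokenizer._tokenize."""
    return re.findall(r'\w+(?:[\?\-"_]\w+)*', sentence, re.M | re.DOTALL)

message = "Bonjour. Ma carte_verte est perdue; merci de votre retour"   # made up
print([tokenize_sentence(s) for s in split_message_to_sentences(message) if s != ""])
# [['Bonjour'], ['Ma', 'carte_verte', 'est', 'perdue'], ['merci', 'de', 'votre', 'retour']]
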
