-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from milidris/master
Adding Guided Unsupervised Sentiment Analysis capabilities
- Loading branch information
Showing
3 changed files
with
596 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
import logging | ||
import re | ||
import numpy as np | ||
import pandas as pd | ||
from multiprocessing import Pool | ||
|
||
from sklearn.base import BaseEstimator, TransformerMixin | ||
from melusine.utils.multiprocessing import apply_by_multiprocessing | ||
|
||
|
||
class SentimentDetector(BaseEstimator, TransformerMixin):
    """Fit seed-word lexicons on a word embedding and score e-mails with them.

    For every seed word, a lexicon is built by taking the cosine similarity
    between the seed and every token of the embedding vocabulary. An e-mail's
    sentiment score is then obtained by aggregating, token by token, the
    lexicon scores (seed-wise aggregation) and finally aggregating over all
    tokens of the e-mail (e-mail-wise aggregation).

    Attributes
    ----------
    base_seed_words : list,
        Seed words provided by the user for computing the lexicons.
    seed_list : list,
        Same as base_seed_words but restricted to the seeds actually present
        in the embedding vocabulary (filled by `fit`).
    n_jobs : int,
        Number of CPUs used to rate the e-mails.
    progress_bar : bool,
        Whether to display a progress bar while rating the e-mails.
    root : bool,
        Whether to treat the seed words as prefixes.
    seed_dict : dict,
        Filled if root==True. Key: prefix, value: list of vocabulary tokens
        starting with that prefix.
    tokens_column : str,
        Name of the DataFrame column holding the token lists to score.
    normalize_scores : bool,
        Whether to normalize the lexicon scores (centered around 0, with a
        variance of 1).
    lexicon_dict : dict,
        Key: seed word, value: dict mapping every vocabulary token to its
        cosine similarity with the seed (filled by `fit`).
    normalized_lexicon_dict : dict,
        Filled only if normalize_scores==True. Same structure as
        lexicon_dict but with normalized similarity values.
    aggregation_function_seed_wise : function,
        Aggregates, for one token, the scores of all seed lexicons
        (default=np.max).
    aggregation_function_email_wise : function,
        Aggregates the token scores of one e-mail
        (default=np.percentile(., 60)).

    Examples
    --------
    """

    def __init__(self, base_seed_words, tokens_column, n_jobs=1,
                 progress_bar=False, root=False,
                 normalize_scores=False,
                 aggregation_function_seed_wise=np.max,
                 aggregation_function_email_wise=lambda x: np.percentile(x, 60)
                 ):
        """
        Parameters
        ----------
        base_seed_words : list,
            Seed words used to compute the lexicons.
        tokens_column : str,
            Name of the DataFrame column holding the token lists to score.
        n_jobs : int,
            Number of CPUs used to rate the e-mails.
        progress_bar : bool,
            Whether to display a progress bar while rating the e-mails.
        root : bool,
            Whether to treat the seed words as prefixes.
        normalize_scores : bool,
            Whether to normalize the lexicon scores (centered around 0, with
            a variance of 1).
        aggregation_function_seed_wise : function,
            Aggregates, for one token, the scores of all seed lexicons
            (default=np.max).
        aggregation_function_email_wise : function,
            Aggregates the token scores of one e-mail
            (default=np.percentile(., 60)).
        """
        self.n_jobs = n_jobs
        self.progress_bar = progress_bar

        self.base_seed_words = base_seed_words
        self.seed_dict = {word: [] for word in self.base_seed_words}
        self.seed_list = base_seed_words
        self.root = root
        self.tokens_column = tokens_column
        self.normalize_scores = normalize_scores

        # Filled by fit()
        self.lexicon_dict = {}
        self.normalized_lexicon_dict = {}

        self.aggregation_function_seed_wise = aggregation_function_seed_wise
        self.aggregation_function_email_wise = aggregation_function_email_wise

    def __getstate__(self):
        """Return the dict of attributes to be pickled.

        Overrides the default pickling behavior to avoid pickling the logger
        and to disable multiprocessing in the saved object.
        """
        d = self.__dict__.copy()
        # Disable multiprocessing when saving: worker pools and progress bars
        # do not survive unpickling in a different process.
        d['n_jobs'] = 1
        d['progress_bar'] = False
        if 'logger' in d:
            # Store only the logger name; the logger object is not picklable.
            d['logger'] = d['logger'].name
        return d

    def __setstate__(self, d):
        """Restore the instance from a pickled state dict.

        Overrides the default behavior to rebuild the logger from its name.
        """
        if 'logger' in d:
            d['logger'] = logging.getLogger(d['logger'])
        self.__dict__.update(d)

    def fit(self, embedding):
        """Compute the lexicons for a specific embedding.

        Parameters
        ----------
        embedding : Embedding Object,
            A Melusine Embedding object.

        Raises
        ------
        ValueError
            If none of the seed words belong to the embedding vocabulary.
        """
        if self.root:
            self.seed_dict, self.seed_list = self.compute_seeds_from_root(
                embedding, self.base_seed_words)

        # Keep only the seeds that exist in the embedding vocabulary.
        self.seed_list = [token for token in self.seed_list
                          if token in embedding.embedding.vocab.keys()]

        if not self.seed_list:
            raise ValueError(
                'None of the seed words are in the vocabulary associated with the Embedding')

        self.lexicon_dict = self.compute_lexicon(embedding, self.seed_list)

        if self.normalize_scores:
            self.normalize_lexicon()

    @staticmethod
    def compute_seeds_from_root(embedding, base_seed_words):
        """Expand the user seed words, treated as prefixes, into vocabulary tokens.

        Parameters
        ----------
        embedding : Embedding Object,
            A Melusine Embedding object.
        base_seed_words : list,
            Seed words (prefixes) provided by the user.

        Returns
        -------
        (seed_dict, seed_list) : (dict, list),
            Dict mapping each prefix to the vocabulary tokens starting with
            it, and the flat list of all tokens found.
        """
        words = list(embedding.embedding.vocab.keys())
        seed_dict = dict()
        seed_list = []

        for seed in base_seed_words:
            extended_seed_words = [token for token in words
                                   if token.startswith(seed)]
            seed_dict[seed] = extended_seed_words
            seed_list.extend(extended_seed_words)

        return seed_dict, seed_list

    @staticmethod
    def compute_lexicon(embedding, seed_list):
        """Compute the cosine-similarity lexicon of each seed over the vocabulary.

        Parameters
        ----------
        embedding : Embedding Object,
            A Melusine Embedding object on which the cosine similarity
            between words is computed.
        seed_list : list,
            Seed words for which lexicons are computed.

        Returns
        -------
        lexicon_dict : dict,
            Key: seed word, value: dict mapping every vocabulary token to its
            cosine similarity with the seed.
        """
        words = list(embedding.embedding.vocab.keys())
        lexicon_dict = {}

        for seed in seed_list:
            lexicon_dict[seed] = {}
            for word in words:
                lexicon_dict[seed][word] = embedding.embedding.similarity(seed, word)

        return lexicon_dict

    def predict(self, X):
        """Add a "sentiment_score" column with the polarity score of each e-mail.

        The object must have been fitted first.

        Parameters
        ----------
        X : DataFrame
            Input emails DataFrame.

        Returns
        -------
        X : DataFrame
            The input DataFrame with the new "sentiment_score" column.
        """
        X['sentiment_score'] = apply_by_multiprocessing(
            X, self.rate_email, workers=self.n_jobs,
            progress_bar=self.progress_bar)

        return X

    def normalize_lexicon(self):
        """Normalize the lexicon scores (centered around 0, with variance 1).

        Fills `normalized_lexicon_dict` from `lexicon_dict`.
        """
        lexicon_dict = self.lexicon_dict

        normalized_lexicon = dict()
        for seed in lexicon_dict.keys():
            values = list(lexicon_dict[seed].values())
            mean = np.mean(values)
            sd = np.std(values)
            lex_norm = {k: (v - mean) / sd for k, v in lexicon_dict[seed].items()}
            normalized_lexicon[seed] = lex_norm

        self.normalized_lexicon_dict = normalized_lexicon

    def rate_email(self, row):
        """Compute the polarity score of one e-mail towards the seed words.

        The object must have been fitted first.

        Parameters
        ----------
        row : row,
            A Pandas DataFrame row containing a tokenized document.

        Returns
        -------
        float
            Aggregated polarity score, or np.nan when no token of the e-mail
            belongs to the lexicon vocabulary.
        """
        # TODO make the aggregation function as an argument
        tokens_column = self.tokens_column
        seed_list = self.seed_list

        # BUGFIX: the original tested `self.normalize_lexicon` (a bound
        # method, hence always truthy), so the empty normalized lexicon was
        # used even when normalize_scores=False, raising a KeyError below.
        if self.normalize_scores:
            lexicon_dict = self.normalized_lexicon_dict
        else:
            lexicon_dict = self.lexicon_dict

        # Keep only tokens that belong to the lexicon vocabulary (all seed
        # lexicons share the same vocabulary, so checking one seed suffices).
        effective_tokens_list = [token for token in row[tokens_column]
                                 if token in lexicon_dict[seed_list[0]]]

        if effective_tokens_list:
            token_score_list = [
                self.aggregation_function_seed_wise(
                    [lexicon_dict[seed][token] for seed in seed_list]
                )
                for token in effective_tokens_list
            ]
            return self.aggregation_function_email_wise(token_score_list)
        else:
            return np.nan
Oops, something went wrong.