Merge pull request #66 from farthur/master

Add handling of attachments type (.pdf, .png, etc.) as meta data.
MAIF · Oct 22, 2020 · 4ec001a · 4ec001a
2 parents f0b6595 + 121fbb1
commit 4ec001a
Show file tree

Hide file tree

Showing 30 changed files with 1,809 additions and 952 deletions.
diff --git a/melusine/data/emails.csv b/melusine/data/emails.csv
diff --git a/melusine/prepare_email/metadata_engineering.py b/melusine/prepare_email/metadata_engineering.py
@@ -2,6 +2,7 @@
 import copy
 import pandas as pd
 from collections import Counter
+from  itertools import chain
 from sklearn import preprocessing
 from sklearn.base import BaseEstimator, TransformerMixin
 from melusine.utils.transformer_scheduler import TransformerScheduler
@@ -185,12 +186,12 @@ def get_dayofweek(row):
 
 
 class Dummifier(BaseEstimator, TransformerMixin):
-    """Transformer to dummifies categorial features.
+    """Transformer to dummifies categorial features and list of .
     Compatible with scikit-learn API.
     """
 
     def __init__(
-        self, columns_to_dummify=["extension", "dayofweek", "hour", "min"], copy=True,
+        self, columns_to_dummify=["extension", "dayofweek", "hour", "min", "attachment_type"], copy=True,
     ):
         self.columns_to_dummify = columns_to_dummify
         self.copy = copy
@@ -205,13 +206,19 @@ def fit(self, X, y=None):
             raise TypeError(
                 "You should not use fit on a dictionary object. Use a DataFrame"
             )
-
-        self.X_ = pd.get_dummies(
-            X, columns=self.columns_to_dummify, prefix_sep="__", dummy_na=False
+        
+        X_ = pd.get_dummies(
+            X, columns=[col for col in self.columns_to_dummify if col!="attachment_type"], prefix_sep="__", dummy_na=False
         )
 
         dummies_ = tuple([col + "__" for col in self.columns_to_dummify])
-        self.dummy_features = [c for c in self.X_.columns if c.startswith(dummies_)]
+
+        if ("attachment_type" in self.columns_to_dummify):
+            X_attachment = pd.get_dummies(X["attachment_type"].apply(pd.Series).stack().astype(int)).sum(level=0)
+            X_attachment = X_attachment.add_prefix('attachment_type__')
+            self.dummy_features = [c for c in pd.concat([X_,X_attachment],axis=1) if c.startswith(dummies_)]
+        else:
+            self.dummy_features = [c for c in X_ if c.startswith(dummies_)]
 
         return self
 
@@ -239,12 +246,101 @@ def transform(self, X, y=None):
                 X_ = X
 
         X_ = pd.get_dummies(
-            X_, columns=self.columns_to_dummify, prefix_sep="__", dummy_na=False,
+            X_, columns=[col for col in self.columns_to_dummify if col!="attachment_type"], prefix_sep="__", dummy_na=False
         )
 
+        if ("attachment_type" in self.columns_to_dummify):
+            X_attachment = pd.get_dummies(X_["attachment_type"].apply(pd.Series).stack().astype(int)).sum(level=0)
+            X_attachment = X_attachment.add_prefix('attachment_type__')
+            X_ = pd.concat([X_,X_attachment],axis=1)
+
+
         if return_dict:
             X_ = X_.T.reindex(self.dummy_features).T.fillna(0)
             return X_[self.dummy_features].to_dict(orient="records")[0]
         else:
             X_ = X_.T.reindex(self.dummy_features).T.fillna(0)
             return X_[self.dummy_features]
+
+class MetaAttachmentType(BaseEstimator, TransformerMixin):
+    """Transformer which creates 'type' feature extracted
+    from regex in metadata. It extracts types of attached files.
+
+    Compatible with scikit-learn API.
+    """
+
+    def __init__(self):
+        self.le_extension = preprocessing.LabelEncoder()
+
+    def fit(self, X, y=None):
+
+        if isinstance(X, dict):
+            raise TypeError(
+                "You should not use fit on a dictionary object. Use a DataFrame"
+            )
+
+        """ Fit LabelEncoder on encoded extensions."""
+        X["attachment_type"] = X.apply(self.get_attachment_type, axis=1)
+        self.top_attachment_type = self.get_top_attachment_type(X, n=100)
+        X["attachment_type"] = X.apply(
+            self.encode_type, args=(self.top_attachment_type,), axis=1
+        )
+        self.le_extension.fit(X["attachment_type"].sum())
+        return self
+
+    def transform(self, X):
+        """Encode extensions"""
+
+        if isinstance(X, dict):
+            apply_func = TransformerScheduler.apply_dict
+        else:
+            apply_func = TransformerScheduler.apply_pandas
+
+        X["attachment_type"] = apply_func(X, self.get_attachment_type)
+        X["attachment_type"] = apply_func(
+            X, self.encode_type, args_=(self.top_attachment_type,)
+        )
+        if isinstance(X["attachment_type"], list):
+            X["attachment_type"] = self.le_extension.transform([X["attachment_type"]])[0]
+        else:
+            X["attachment_type"] = [self.le_extension.transform(t) for t in X["attachment_type"]]
+        return X
+
+    @staticmethod
+    def get_attachment_type(row):
+        """Gets type from attachment."""
+        x = row["attachment"]
+        attached_types = []
+
+        try:
+            for file in x:
+                match = re.findall(r".*\.(.*)", file)
+                if match:
+                    attached_types.append(match[0])
+        except Exception:
+            return ""
+        return attached_types
+
+    @staticmethod
+    def get_top_attachment_type(X, n=100):
+        "Returns list of most common types of attachment."
+
+        type_counter = Counter(chain(*X["attachment_type"]))
+        type_counter = type_counter.most_common(n)
+        top_attachment_type = [x[0] for x in type_counter]
+        return top_attachment_type
+
+    @staticmethod
+    def encode_type(row, top_ext):
+        x = row["attachment_type"]
+        """Encode most common type of attachment and set the rest to 'other'."""
+        encode = []
+        if x:
+            for attachment in x:
+                if attachment in top_ext:
+                    encode.append(attachment)
+                else:
+                    encode.append("other")
+        else : #No attachments
+            encode.append("none") 
+        return encode
diff --git a/tests/integration_test/test_classification.py b/tests/integration_test/test_classification.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import copy
 import os
+import ast
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 
@@ -26,6 +27,7 @@
 
 from melusine.prepare_email.metadata_engineering import MetaExtension
 from melusine.prepare_email.metadata_engineering import MetaDate
+from melusine.prepare_email.metadata_engineering import MetaAttachmentType
 from melusine.prepare_email.metadata_engineering import Dummifier
 
 from melusine.models.neural_architectures import cnn_model
@@ -107,11 +109,13 @@ def test_classification():
         [
             ("MetaExtension", MetaExtension()),
             ("MetaDate", MetaDate()),
+            ("MetaAttachmentType", MetaAttachmentType()),
             ("Dummifier", Dummifier()),
         ]
     )
 
     # ============== Transform MetaData ==============
+    input_df['attachment'] = input_df['attachment'].apply(ast.literal_eval)
     df_meta = MetadataPipeline.fit_transform(input_df)
 
     # ============== Keywords Generator ==============
@@ -132,7 +136,7 @@ def test_classification():
         architecture_function=cnn_model,
         pretrained_embedding=pretrained_embedding,
         text_input_column="clean_body",
-        meta_input_list=["extension", "dayofweek", "hour", "min"],
+        meta_input_list=["extension", "dayofweek", "hour", "min", "attachment_type"],
         n_epochs=2,
     )