Skip to content

Commit

Permalink
Merge pull request #66 from farthur/master
Browse files Browse the repository at this point in the history
Add handling of attachments type (.pdf, .png, etc.) as meta data.
  • Loading branch information
farthur committed Oct 22, 2020
2 parents f0b6595 + 121fbb1 commit 4ec001a
Show file tree
Hide file tree
Showing 30 changed files with 1,809 additions and 952 deletions.
82 changes: 41 additions & 41 deletions melusine/data/emails.csv

Large diffs are not rendered by default.

110 changes: 103 additions & 7 deletions melusine/prepare_email/metadata_engineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import copy
import pandas as pd
from collections import Counter
from itertools import chain
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from melusine.utils.transformer_scheduler import TransformerScheduler
Expand Down Expand Up @@ -185,12 +186,12 @@ def get_dayofweek(row):


class Dummifier(BaseEstimator, TransformerMixin):
"""Transformer to dummifies categorial features.
"""Transformer to dummifies categorial features and list of .
Compatible with scikit-learn API.
"""

def __init__(
self, columns_to_dummify=["extension", "dayofweek", "hour", "min"], copy=True,
self, columns_to_dummify=["extension", "dayofweek", "hour", "min", "attachment_type"], copy=True,
):
self.columns_to_dummify = columns_to_dummify
self.copy = copy
Expand All @@ -205,13 +206,19 @@ def fit(self, X, y=None):
raise TypeError(
"You should not use fit on a dictionary object. Use a DataFrame"
)

self.X_ = pd.get_dummies(
X, columns=self.columns_to_dummify, prefix_sep="__", dummy_na=False
X_ = pd.get_dummies(
X, columns=[col for col in self.columns_to_dummify if col!="attachment_type"], prefix_sep="__", dummy_na=False
)

dummies_ = tuple([col + "__" for col in self.columns_to_dummify])
self.dummy_features = [c for c in self.X_.columns if c.startswith(dummies_)]

if ("attachment_type" in self.columns_to_dummify):
X_attachment = pd.get_dummies(X["attachment_type"].apply(pd.Series).stack().astype(int)).sum(level=0)
X_attachment = X_attachment.add_prefix('attachment_type__')
self.dummy_features = [c for c in pd.concat([X_,X_attachment],axis=1) if c.startswith(dummies_)]
else:
self.dummy_features = [c for c in X_ if c.startswith(dummies_)]

return self

Expand Down Expand Up @@ -239,12 +246,101 @@ def transform(self, X, y=None):
X_ = X

X_ = pd.get_dummies(
X_, columns=self.columns_to_dummify, prefix_sep="__", dummy_na=False,
X_, columns=[col for col in self.columns_to_dummify if col!="attachment_type"], prefix_sep="__", dummy_na=False
)

if ("attachment_type" in self.columns_to_dummify):
X_attachment = pd.get_dummies(X_["attachment_type"].apply(pd.Series).stack().astype(int)).sum(level=0)
X_attachment = X_attachment.add_prefix('attachment_type__')
X_ = pd.concat([X_,X_attachment],axis=1)


if return_dict:
X_ = X_.T.reindex(self.dummy_features).T.fillna(0)
return X_[self.dummy_features].to_dict(orient="records")[0]
else:
X_ = X_.T.reindex(self.dummy_features).T.fillna(0)
return X_[self.dummy_features]

class MetaAttachmentType(BaseEstimator, TransformerMixin):
"""Transformer which creates 'type' feature extracted
from regex in metadata. It extracts types of attached files.
Compatible with scikit-learn API.
"""

def __init__(self):
self.le_extension = preprocessing.LabelEncoder()

def fit(self, X, y=None):

if isinstance(X, dict):
raise TypeError(
"You should not use fit on a dictionary object. Use a DataFrame"
)

""" Fit LabelEncoder on encoded extensions."""
X["attachment_type"] = X.apply(self.get_attachment_type, axis=1)
self.top_attachment_type = self.get_top_attachment_type(X, n=100)
X["attachment_type"] = X.apply(
self.encode_type, args=(self.top_attachment_type,), axis=1
)
self.le_extension.fit(X["attachment_type"].sum())
return self

def transform(self, X):
"""Encode extensions"""

if isinstance(X, dict):
apply_func = TransformerScheduler.apply_dict
else:
apply_func = TransformerScheduler.apply_pandas

X["attachment_type"] = apply_func(X, self.get_attachment_type)
X["attachment_type"] = apply_func(
X, self.encode_type, args_=(self.top_attachment_type,)
)
if isinstance(X["attachment_type"], list):
X["attachment_type"] = self.le_extension.transform([X["attachment_type"]])[0]
else:
X["attachment_type"] = [self.le_extension.transform(t) for t in X["attachment_type"]]
return X

@staticmethod
def get_attachment_type(row):
"""Gets type from attachment."""
x = row["attachment"]
attached_types = []

try:
for file in x:
match = re.findall(r".*\.(.*)", file)
if match:
attached_types.append(match[0])
except Exception:
return ""
return attached_types

@staticmethod
def get_top_attachment_type(X, n=100):
"Returns list of most common types of attachment."

type_counter = Counter(chain(*X["attachment_type"]))
type_counter = type_counter.most_common(n)
top_attachment_type = [x[0] for x in type_counter]
return top_attachment_type

@staticmethod
def encode_type(row, top_ext):
x = row["attachment_type"]
"""Encode most common type of attachment and set the rest to 'other'."""
encode = []
if x:
for attachment in x:
if attachment in top_ext:
encode.append(attachment)
else:
encode.append("other")
else : #No attachments
encode.append("none")
return encode
6 changes: 5 additions & 1 deletion tests/integration_test/test_classification.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import copy
import os
import ast
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

Expand All @@ -26,6 +27,7 @@

from melusine.prepare_email.metadata_engineering import MetaExtension
from melusine.prepare_email.metadata_engineering import MetaDate
from melusine.prepare_email.metadata_engineering import MetaAttachmentType
from melusine.prepare_email.metadata_engineering import Dummifier

from melusine.models.neural_architectures import cnn_model
Expand Down Expand Up @@ -107,11 +109,13 @@ def test_classification():
[
("MetaExtension", MetaExtension()),
("MetaDate", MetaDate()),
("MetaAttachmentType", MetaAttachmentType()),
("Dummifier", Dummifier()),
]
)

# ============== Transform MetaData ==============
input_df['attachment'] = input_df['attachment'].apply(ast.literal_eval)
df_meta = MetadataPipeline.fit_transform(input_df)

# ============== Keywords Generator ==============
Expand All @@ -132,7 +136,7 @@ def test_classification():
architecture_function=cnn_model,
pretrained_embedding=pretrained_embedding,
text_input_column="clean_body",
meta_input_list=["extension", "dayofweek", "hour", "min"],
meta_input_list=["extension", "dayofweek", "hour", "min", "attachment_type"],
n_epochs=2,
)

Expand Down

0 comments on commit 4ec001a

Please sign in to comment.