In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
import features as cf
from util import *

In [5]:
def simple_extractors():
    # Extraigo dos atributos simples: 
    # 1) Longitud del mail.
    # 2) Cantidad de espacios en el mail.
    # 3) Tiene el mail contenido HTML?
    # 4) Tiene el mail imágenes?
    # 5) Cantidad de oraciones
    
    return [ ('body_length', cf.body_length), 
      ('count_spaces', cf.count_spaces), 
      ('has_html', cf.has_html), 
      ('has_image', cf.has_image), 
      ('number_of_sentences', cf.number_of_sentences) ]

In [6]:
def vectorizer_extractor(vectorizer_type, **kwargs):
    if vectorizer_type == "bow":
        vectorizer = CountVectorizer(stop_words='english', **kwargs)
    elif vectorizer_type == "tfidf":
        vectorizer = TfidfVectorizer(stop_words='english', **kwargs)
    elif vectorizer_type == "hashing_bow":
        vectorizer = HashingVectorizer(stop_words='english', **kwargs)
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'bow\', \'tfidf\' or \'hashing_bow\'')
    
    return vectorizer

In [7]:
def features_extractors(simple_features=True, subject_vectorizer='tfidf', body_vectorizer='tfidf', body_and_subject_vectorizer=None):
    extractors = []
    if simple_features:
        # Simple features extactor
        extractors = [('simple_features', cf.SimpleFeaturesExtractor(simple_extractors()))]
    
    if body_and_subject_vectorizer is not None:
        # Pipeline for pulling vectorizer features from the post's body
        extractors = extractors + \
            [('body_and_subject', Pipeline([
                ('selector', ColumnSelectorExtractor('body_and_subject')),
                (body_vectorizer, vectorizer_extractor(body_and_subject_vectorizer)),
            ]))]
    else:
        if subject_vectorizer is not None:
            # Pipeline for pulling vectorizer features from the post's subject
            extractors = extractors + \
                [('subject', Pipeline([
                    ('selector', ColumnSelectorExtractor('subject')),
                    (subject_vectorizer, vectorizer_extractor(subject_vectorizer)),
                ]))]

        if body_vectorizer is not None:
            # Pipeline for pulling vectorizer features from the post's body
            extractors = extractors + \
                [('body', Pipeline([
                    ('selector', ColumnSelectorExtractor('body')),
                    (body_vectorizer, vectorizer_extractor(body_vectorizer)),
                ]))]
    
    # Use FeatureUnion to combine the features
    return FeatureUnion(extractors)

In [8]:
train_set, test_set = load_data(merge_body_and_subject=True)

Loading data from dataset/ham_dev.json
Done in 3.470076s
Loaded 45000(465.272MB) mails
Parsing mails
Done in 17.655162s
Parsed 45000 mails
Loading data from dataset/spam_dev.json
Done in 1.959641s
Loaded 45000(200.517MB) mails
Parsing mails
Done in 17.517740s
Parsed 45000 mails
Generating Pandas DataFrame
Done in 60.748598s
Splitting into Training and Test Set
Done in 60.817210s
Train Set: 72000 samples - Ham: 35925(0.50%) Spam: 36075(0.50%)
Test Set:  18000 samples - Ham: 9075(0.50%) Spam: 8925(0.50%)


In [11]:
dt_pipeline_unified_body_and_subject = Pipeline([
  ('features_extractor', features_extractors(True, body_and_subject_vectorizer='tfidf')),
  ('tree_classifier', DecisionTreeClassifier())
])

result = cross_val_score(dt_pipeline_unified_body_and_subject, train_set, train_set['label'], cv=10, n_jobs=-1)
print "Mean : " + np.asarray(result).mean() + " STD: " + np.asarray(result).std()

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')

In [None]:
dt_pipeline_splitted_body_and_subject = Pipeline([
  ('features_extractor', features_extractors(True, 'tfidf', 'tfidf')),
  ('tree_classifier', DecisionTreeClassifier())
])

result = cross_val_score(dt_pipeline_splitted_body_and_subject, train_set, train_set['label'], cv=10, n_jobs=-1)
print "Mean : " + np.asarray(result).mean() + " STD: " + np.asarray(result).std()