In [2]:
%matplotlib inline

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [21]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [5]:
import features as cf
from util import *

## Extractores

### Extractores simples
Extraen atributos numéricos y booleanos simples de cada mail.

In [6]:
def simple_extractors():
    """Return the (name, function) pairs used as simple mail features.

    Every function is looked up on the ``features`` module (imported as
    ``cf``) under the same name it is registered with. According to the
    original notes, the five features are:

    1. ``body_length``         -- length of the mail.
    2. ``count_spaces``        -- number of spaces in the mail.
    3. ``has_html``            -- whether the mail has HTML content.
    4. ``has_image``           -- whether the mail has images.
    5. ``number_of_sentences`` -- number of sentences.

    Returns
    -------
    list of (str, callable)
        One tuple per feature, in the order listed above.
    """
    extractor_names = (
        'body_length',
        'count_spaces',
        'has_html',
        'has_image',
        'number_of_sentences',
    )
    return [(name, getattr(cf, name)) for name in extractor_names]

In [7]:
def vectorizer_extractor(vectorizer_type, **kwargs):
    """Create an (unfitted) scikit-learn text vectorizer of the requested type.

    Parameters
    ----------
    vectorizer_type : str
        ``'bow'`` for ``CountVectorizer``, ``'tfidf'`` for ``TfidfVectorizer``
        or ``'hashing_bow'`` for ``HashingVectorizer``.
    **kwargs
        Forwarded to the vectorizer constructor. ``stop_words`` defaults to
        ``'english'`` but may be overridden (e.g. ``stop_words=None``).

    Returns
    -------
    The vectorizer instance.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the supported names.
    """
    # Validate first so an unknown type fails before anything is constructed.
    if vectorizer_type not in ("bow", "tfidf", "hashing_bow"):
        raise ValueError('Invalid vectorizer_type. Expected \'bow\', \'tfidf\' or \'hashing_bow\'')

    # Passing stop_words explicitly next to **kwargs raised
    # "got multiple values for keyword argument" whenever the caller supplied
    # its own stop_words; treat 'english' as an overridable default instead.
    kwargs.setdefault('stop_words', 'english')

    if vectorizer_type == "bow":
        return CountVectorizer(**kwargs)
    if vectorizer_type == "tfidf":
        return TfidfVectorizer(**kwargs)
    return HashingVectorizer(**kwargs)

### Pipeline de extracción
Cadena de extracción de atributos.

En la celda inferior, se utiliza body_and_subject_vectorizer para determinar si el sujeto y el cuerpo del mail deben estar juntos en la creación de la matriz TF-IDF (la extracción se realiza sobre el texto concatenado) o separados (se realiza la extracción sobre los dos textos independientemente).

In [8]:
def features_extractors(simple_features=True, subject_vectorizer='tfidf', body_vectorizer='tfidf', body_and_subject_vectorizer=None):
    """Build the FeatureUnion that combines all feature extractors for a mail.

    Parameters
    ----------
    simple_features : bool
        When true, include ``cf.SimpleFeaturesExtractor`` configured with the
        extractors returned by ``simple_extractors()``.
    subject_vectorizer : str or None
        Vectorizer type (see ``vectorizer_extractor``) applied to the
        ``'subject'`` column; ``None`` skips it. Ignored when
        ``body_and_subject_vectorizer`` is given.
    body_vectorizer : str or None
        Vectorizer type applied to the ``'body'`` column; ``None`` skips it.
        Ignored when ``body_and_subject_vectorizer`` is given.
    body_and_subject_vectorizer : str or None
        When not ``None``, a single vectorizer of this type is applied to the
        ``'body_and_subject'`` column instead of the two separate ones.

    Returns
    -------
    FeatureUnion
        Union of the selected extractors, in the order: simple features,
        then either body_and_subject, or subject followed by body.
    """
    def column_pipeline(column, vectorizer_type):
        # One branch of the union: pick `column` with ColumnSelectorExtractor
        # (not defined in this notebook; presumably provided by
        # `from util import *` -- confirm), then vectorize it.
        # The vectorizer step is named after the type actually built.
        return (column, Pipeline([
            ('selector', ColumnSelectorExtractor(column)),
            (vectorizer_type, vectorizer_extractor(vectorizer_type)),
        ]))

    extractors = []
    if simple_features:
        # Simple features extractor
        extractors.append(
            ('simple_features', cf.SimpleFeaturesExtractor(simple_extractors())))

    if body_and_subject_vectorizer is not None:
        # Vectorizer features from the concatenated body and subject.
        # The step used to be named after `body_vectorizer`, which mislabelled
        # it when the two arguments differed and produced a None step name
        # when body_vectorizer was None.
        extractors.append(
            column_pipeline('body_and_subject', body_and_subject_vectorizer))
    else:
        if subject_vectorizer is not None:
            # Vectorizer features from the mail's subject
            extractors.append(column_pipeline('subject', subject_vectorizer))

        if body_vectorizer is not None:
            # Vectorizer features from the mail's body
            extractors.append(column_pipeline('body', body_vectorizer))

    # Use FeatureUnion to combine the features
    return FeatureUnion(extractors)

### Carga de datos

In [9]:
# Load the ham/spam mails and split them into a training and a test set
# (see the log printed below this cell).
# NOTE(review): merge_body_and_subject=True presumably adds the
# 'body_and_subject' column used by the unified pipelines -- confirm in util.
train_set, test_set = load_data(merge_body_and_subject=True)

Loading data from dataset/ham_dev.json
Done in 3.903567s
Loaded 45000(465.272MB) mails
Parsing mails
Done in 19.629904s
Parsed 45000 mails
Loading data from dataset/spam_dev.json
Done in 2.253287s
Loaded 45000(200.517MB) mails
Parsing mails
Done in 18.128129s
Parsed 45000 mails
Generating Pandas DataFrame
Done in 64.343704s
Splitting into Training and Test Set
Done in 64.416754s
Train Set: 72000 samples - Ham: 35942(0.50%) Spam: 36058(0.50%)
Test Set:  18000 samples - Ham: 9058(0.50%) Spam: 8942(0.50%)


### Experimentación con árboles de decisión

#### Cuerpos y sujetos concatenados

In [9]:
# Decision tree on the simple features plus a TF-IDF vectorizer over the
# concatenated body and subject, scored by cross-validation (cv=10).
dt_pipeline_unified_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(simple_features=True, body_and_subject_vectorizer='tfidf'),
    ),
    ('tree_classifier', DecisionTreeClassifier()),
])

result = cross_val_score(
    dt_pipeline_unified_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [10]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.978764 STD: 0.002119


#### Cuerpos y sujetos separados

In [11]:
# Decision tree on the simple features plus separate TF-IDF vectorizers for
# the subject and the body, scored by cross-validation (cv=10).
dt_pipeline_splitted_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(
            simple_features=True,
            subject_vectorizer='tfidf',
            body_vectorizer='tfidf',
        ),
    ),
    ('tree_classifier', DecisionTreeClassifier()),
])

result = cross_val_score(
    dt_pipeline_splitted_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [12]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.979306 STD: 0.002468


### Experimentación con SVM

#### Cuerpos y sujetos concatenados

In [13]:
# SVC with default parameters on the simple features plus a TF-IDF vectorizer
# over the concatenated body and subject, scored by cross-validation (cv=10).
svm_pipeline_unified_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(simple_features=True, body_and_subject_vectorizer='tfidf'),
    ),
    ('svm_classifier', SVC()),
])

result = cross_val_score(
    svm_pipeline_unified_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [14]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.610222 STD: 0.002689


#### Cuerpos y sujetos separados

In [15]:
# SVC with default parameters on the simple features plus separate TF-IDF
# vectorizers for the subject and the body, scored by cross-validation (cv=10).
svm_pipeline_splitted_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(
            simple_features=True,
            subject_vectorizer='tfidf',
            body_vectorizer='tfidf',
        ),
    ),
    ('svm_classifier', SVC()),
])

result = cross_val_score(
    svm_pipeline_splitted_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [16]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.608903 STD: 0.002846


## Experimentación con Naive Bayes

### Gaussian Naive Bayes

Gaussian Naive Bayes no funciona óptimamente con datos dispersos. Sumado a esta limitación del algoritmo, la conversión de nuestras matrices dispersas a matrices densas implica un consumo de memoria demasiado alto, imposibilitando la utilización de este algoritmo.

### Multinomial Naive Bayes

#### Cuerpos y sujetos concatenados

In [10]:
# Multinomial Naive Bayes on the simple features plus a TF-IDF vectorizer over
# the concatenated body and subject, scored by cross-validation (cv=10).
mnb_pipeline_unified_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(simple_features=True, body_and_subject_vectorizer='tfidf'),
    ),
    ('mnb_classifier', MultinomialNB()),
])

result = cross_val_score(
    mnb_pipeline_unified_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [11]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.912722 STD: 0.005642


#### Cuerpos y sujetos separados

In [12]:
# Multinomial Naive Bayes on the simple features plus separate TF-IDF
# vectorizers for the subject and the body, scored by cross-validation (cv=10).
mnb_pipeline_splitted_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(
            simple_features=True,
            subject_vectorizer='tfidf',
            body_vectorizer='tfidf',
        ),
    ),
    ('mnb_classifier', MultinomialNB()),
])

result = cross_val_score(
    mnb_pipeline_splitted_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [13]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.921708 STD: 0.005261


### BernoulliNB

#### Cuerpos y sujetos concatenados

In [14]:
# Bernoulli Naive Bayes on the simple features plus a TF-IDF vectorizer over
# the concatenated body and subject, scored by cross-validation (cv=10).
bnb_pipeline_unified_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(simple_features=True, body_and_subject_vectorizer='tfidf'),
    ),
    ('bnb_classifier', BernoulliNB()),
])

result = cross_val_score(
    bnb_pipeline_unified_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [15]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.948569 STD: 0.002133


#### Cuerpos y sujetos separados

In [16]:
# Bernoulli Naive Bayes on the simple features plus separate TF-IDF
# vectorizers for the subject and the body, scored by cross-validation (cv=10).
bnb_pipeline_splitted_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(
            simple_features=True,
            subject_vectorizer='tfidf',
            body_vectorizer='tfidf',
        ),
    ),
    ('bnb_classifier', BernoulliNB()),
])

result = cross_val_score(
    bnb_pipeline_splitted_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [18]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.951486 STD: 0.002426


## Experimentación con Random Forests

#### Cuerpos y sujetos concatenados

In [22]:
# Random forest with default parameters on the simple features plus a TF-IDF
# vectorizer over the concatenated body and subject, scored by
# cross-validation (cv=10).
rndf_pipeline_unified_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(simple_features=True, body_and_subject_vectorizer='tfidf'),
    ),
    ('rndf_classifier', RandomForestClassifier()),
])

result = cross_val_score(
    rndf_pipeline_unified_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [24]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.978708 STD: 0.001738


#### Cuerpos y sujetos separados

In [25]:
# Random forest with default parameters on the simple features plus separate
# TF-IDF vectorizers for the subject and the body, scored by
# cross-validation (cv=10).
rndf_pipeline_splitted_body_and_subject = Pipeline(steps=[
    (
        'features_extractor',
        features_extractors(
            simple_features=True,
            subject_vectorizer='tfidf',
            body_vectorizer='tfidf',
        ),
    ),
    ('rndf_classifier', RandomForestClassifier()),
])

result = cross_val_score(
    rndf_pipeline_splitted_body_and_subject,
    train_set,
    y=train_set['label'],
    cv=10,
    n_jobs=-1,
)

In [26]:
# Summarise the cross-validation scores currently held in `result`.
# Parenthesised single-argument form: same output on Python 2 and Python 3.
print("Mean: %f STD: %f" % (np.mean(result), np.std(result)))

Mean: 0.979264 STD: 0.002180


## Conclusiones