# Selección de Features

El objetivo de este notebook es decidir qué extractores de features usamos en el pipeline principal.

In [2]:
import sys
sys.path.insert(0, '..')

In [3]:
%matplotlib inline

In [4]:
import time
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score

In [6]:
import features as cf
from util import *



## Features

Decidimos definir los siguientes extractores de features:
1. Features sencillos, definidos en features.py, como por ejemplo la longitud del cuerpo del mail o si contiene imágenes
2. Vectorización (ya sea por medio del método Bag Of Words, del método TF-IDF, o del método Hashing Vectorizer) del Subject
3. Vectorización (idem anterior) del Body

In [7]:
def splitted_features_extractors_names(simple_features=True,
                        subject_vectorizer='bow',
                        body_vectorizer='bow'):
    """Build the name of a feature set with separate subject and body vectorizers.

    The name is made of up to three parts joined by underscores, always in
    this order: 'simple', 'subject_<vectorizer>', 'body_<vectorizer>'.

    Args:
        simple_features: when truthy, the name starts with 'simple'.
        subject_vectorizer: vectorizer label for the subject, or None to
            leave the subject part out.
        body_vectorizer: vectorizer label for the body, or None to leave the
            body part out.

    Returns:
        The composed name, or None when no part was selected.
    """
    name_parts = []

    if simple_features:
        name_parts.append('simple')
    if subject_vectorizer is not None:
        name_parts.append('subject_' + subject_vectorizer)
    if body_vectorizer is not None:
        name_parts.append('body_' + body_vectorizer)

    # An empty join means nothing was selected; report that as None.
    return '_'.join(name_parts) or None
    
def merged_features_extractors_names(simple_features=True, merged_vectorizer='bow'):
    """Build the name of a feature set with one vectorizer over subject and body.

    The name is made of up to two parts joined by an underscore, always in
    this order: 'simple', 'subject_and_body_<vectorizer>'.

    Args:
        simple_features: when truthy, the name starts with 'simple'.
        merged_vectorizer: vectorizer label for the merged subject and body,
            or None to leave that part out.

    Returns:
        The composed name, or None when no part was selected.
    """
    name_parts = []

    if simple_features:
        name_parts.append('simple')
    if merged_vectorizer is not None:
        name_parts.append('subject_and_body_' + merged_vectorizer)

    # An empty join means nothing was selected; report that as None.
    return '_'.join(name_parts) or None

## Experimentación

### Evaluación de features

Para cada posible extractor de features, definimos cuál usar (similar a un Grid Search, pero a mano).
Definimos también distintos clasificadores para utilizar.
Finalmente evaluamos todas las posibles combinaciones realizando un 10-Fold CV.

In [8]:
# Whether the 'simple' part is included in a feature-set name.
opt_simple_features = [True, False]

# Vectorizer labels to try; None leaves the corresponding part out of the name.
opt_vectorizers = [None, 'bow', 'tfidf', 'hashing_bow']

# Classifier labels, in the order they are appended to each feature-set name.
opt_classifier = [
    'dt',
    'random_forest',
    'bernoulli_nb',
    'multinomial_nb',
    'knn',
    'svm',
]

# Maps each classifier label used in model names to its scikit-learn class.
classifier_dict = {
    'dt': DecisionTreeClassifier,
    'random_forest': RandomForestClassifier,
    'bernoulli_nb': BernoulliNB,
    'multinomial_nb': MultinomialNB,
    'knn': KNeighborsClassifier,
    'svm': SVC,
}

def get_models_names():
    """List every '<feature set>_<classifier>' model name.

    Feature sets are enumerated from the module-level option lists
    (opt_simple_features, opt_vectorizers): first those that vectorize
    subject and body separately, then those that vectorize them merged.
    Each feature-set name is then combined with every label in
    opt_classifier, keeping the classifiers in their listed order.

    Returns:
        A list of model-name strings in the order described above.
    """
    features_names = []

    # Feature sets with independent subject and body vectorizers.
    for simple_features in opt_simple_features:
        for subject_vect in opt_vectorizers:
            for body_vect in opt_vectorizers:
                features_names.append(
                    splitted_features_extractors_names(simple_features, subject_vect, body_vect))

    # Feature sets with a single vectorizer over subject and body together.
    for simple_features in opt_simple_features:
        for subject_and_body_vect in opt_vectorizers:
            features_names.append(
                merged_features_extractors_names(simple_features, subject_and_body_vect))

    # The name builders return None when nothing is selected; skip those.
    # Building the result in one pass avoids re-copying the list on every
    # addition, which the repeated `list + [item]` form did.
    return [features_name + '_' + classifier
            for features_name in features_names
            if features_name is not None
            for classifier in opt_classifier]

In [9]:
# Build the full list of model names once; the next cell looks names up in it.
models_names = get_models_names()

In [20]:
# Show the zero-based position of one specific model in the list, as "position/total".
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print('%d/%d' % (models_names.index('simple_subject_hashing_bow_body_bow_random_forest'), len(models_names)))

79/228
