# Selección de Features

El objetivo de este notebook es decidir que extractores de features usamos en el pipeline principal 

In [2]:
%matplotlib inline

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score

In [5]:
import features as cf
from util import *



## Features

Decidimos definir los siguientes extractores de features:
1. Features sencillos, definidos en features.py, como por ejemplo longitud del cuerpo del mail, o si contiene imagenes
2. Vectorización (ya sea por medio del método Bag Of Words, del método TF-IDF, o del método Hashing Vectorizer) del Subject
3. Vectorización (idem anterior) del Body

In [6]:
def vectorizer_extractor(vectorizer_type='bow', **kwargs):
    if vectorizer_type == "bow":
        vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), **kwargs)
    elif vectorizer_type == "tfidf":
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), **kwargs)
    elif vectorizer_type == "hashing_bow":
        vectorizer = HashingVectorizer(stop_words='english', ngram_range=(1, 2), **kwargs)
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'bow\', \'tfidf\' or \'hashing_bow\'')
    
    return vectorizer

def column_extractor(column_name, vectorizer_type='bow', **vect_kwargs):
    if not vectorizer_type is None:
        return Pipeline([
            ('selector', ColumnSelectorExtractor(column_name)),
            (vectorizer_type, vectorizer_extractor(vectorizer_type, **vect_kwargs))])
    else:
        return None
    
def subject_and_body_merged_extractor(vectorizer_type='bow', **vect_kwargs):
    if not vectorizer_type is None:
        return Pipeline([
            ('selector', SubjectAndBodyMergerExtractor()),
            (vectorizer_type, vectorizer_extractor(vectorizer_type, **vect_kwargs))])
    else:
        return None
    
def splitted_features_extractors(simple_features=True,
                        subject_vectorizer='bow',
                        body_vectorizer='bow',
                        **vect_kwargs):
    if simple_features:
        # Some simple handmade features
        simple_extractors = [ ('body_length', cf.body_length),
                             ('count_spaces', cf.count_spaces),
                             ('has_html', cf.has_html), 
                             ('has_image', cf.has_image), 
                             ('number_of_sentences', cf.number_of_sentences) ]
        
        extractors = [('simple', cf.SimpleFeaturesExtractor(simple_extractors))]
        features_name = 'simple_'
    else:
        extractors = []
        features_name = ''
        
    if subject_vectorizer is not None:
        extractors = extractors + [('subject', column_extractor('subject',
                                           vectorizer_type=subject_vectorizer,
                                           **vect_kwargs))]        
        features_name = features_name + 'subject_' + subject_vectorizer + '_'

    if body_vectorizer is not None:
        extractors = extractors + [('body', column_extractor('body',
                                        vectorizer_type=body_vectorizer,
                                        **vect_kwargs))]
        features_name = features_name + 'body_' + body_vectorizer + '_'
        
    features_name = features_name[:-1]

    if len(extractors) == 0:
        return None
    if len(extractors) == 1:
        return extractors[0][1], features_name
    else:
        return FeatureUnion(extractors), features_name
    
def merged_features_extractors(simple_features=True,
                        merged_vectorizer='bow',                        
                        **vect_kwargs):
    if simple_features:
        # Some simple handmade features
        simple_extractors = [ ('body_length', cf.body_length),
                             ('count_spaces', cf.count_spaces),
                             ('has_html', cf.has_html), 
                             ('has_image', cf.has_image), 
                             ('number_of_sentences', cf.number_of_sentences) ]
        
        extractors = [('simple', cf.SimpleFeaturesExtractor(simple_extractors))]
        features_name = 'simple_'
    else:
        extractors = []
        features_name = ''
        
    if merged_vectorizer is not None:
        extractors = extractors + [('subject_and_body', subject_and_body_merged_extractor(
                                           vectorizer_type=merged_vectorizer,
                                           **vect_kwargs))]        
        features_name = features_name + 'subject_and_body_' + merged_vectorizer + '_'

    features_name = features_name[:-1]

    if len(extractors) == 0:
        return None
    if len(extractors) == 1:
        return extractors[0][1], features_name
    else:
        return FeatureUnion(extractors), features_name

## Experimentación

In [18]:
def load_train_set():
    train_set = joblib.load('dataset/train_set.pkl').sample(10000)
    return train_set.drop('label', axis=1), train_set['label']

In [19]:
X, y = load_train_set()

### Evaluación de features

Para cada posible extractor de features, definimos cual usar (similar a un Grid Search, pero a mano).
Definimos también distintos clasificadores para utilizar.
Finalmente evaluamos todas las posibles combinaciones realizando un 10-Fold CV.

In [20]:
opt_simple_features = [True, False]
opt_vectorizers = [None, 'bow', 'tfidf', 'hashing_bow']
opt_classifier = ['dt',
                  'random_forest', 
                  'bernoulli_nb',
                  'multinomial_nb',
                  'knn',
                  'svm']

classifier_dict = {'dt': DecisionTreeClassifier,
                  'random_forest': RandomForestClassifier, 
                  'bernoulli_nb': BernoulliNB,
                  'multinomial_nb': MultinomialNB, 
                  'knn': KNeighborsClassifier, 
                  'svm': SVC}

def get_models_builders():
    """Construye todos los modelos para evaluar"""
    
    models_builders = []
    for simple_features in opt_simple_features:
        for subject_vect in opt_vectorizers:
            for body_vect in opt_vectorizers:
                extractor_tuple = splitted_features_extractors(simple_features, subject_vect, body_vect)
                
                if extractor_tuple is None:
                    continue

                extractors, features_name = extractor_tuple     

                for classifier in opt_classifier: 
                    builder = lambda: Pipeline([
                      ('features_extractor', extractors),
                      ('classifier', classifier_dict[classifier]())
                    ])
                    models_builders.append((features_name, classifier, builder))
                    
    for simple_features in opt_simple_features:
        for subject_and_body_vect in opt_vectorizers:
            extractor_tuple = merged_features_extractors(simple_features, subject_and_body_vect)
            
            if extractor_tuple is None:
                continue

            extractors, features_name = extractor_tuple     

            for classifier in opt_classifier:
                builder = lambda: Pipeline([
                      ('features_extractor', extractors),
                      ('classifier', classifier_dict[classifier]())
                    ])
                models_builders.append((features_name, classifier, builder))

    return models_builders

def score_models(models_builders, X, y, cv=10, n_jobs=1):
    """
    Evalua con K-Fold CV todos los modelos usando X e y como datos.
    Guarda los puntajes y los tiempos de evaluacion en archivos Pickle
    """
    
    scores = {}
    times = {}

    for features_name, classifier_name, model_builder in models_builders:
        print 'Running %d-Fold CV for model %s' % (cv, features_name + '_' +  classifier_name) 
        
        model = model_builder()
        
        t0 = time.time()
        score = cross_val_score(model, X, y, cv=cv, n_jobs=n_jobs)
        duration = time.time() - t0
        print "Done in %fs" % duration
        
        if not features_name in scores:
            scores[features_name] = {}
            times[features_name] = {}
            
        scores[features_name][classifier_name] = score
        times[features_name][classifier_name] = duration
                            
        print 'CV Scores: ', score
        print 'Mean: ', np.mean(score), 'Std: ', np.std(score)
        print ''
    
    # Las filas van a ser la elección de features y las columnas los clasificadores
    scores = pd.DataFrame.from_dict(scores).transpose()
    times = pd.DataFrame.from_dict(times).transpose()
    
    if not os.path.exists('features_extraction'):
        os.makedirs('features_extraction')

    joblib.dump(scores, 'features_extraction/features_cv_scores.pkl', compress=True)
    joblib.dump(times, 'features_extraction/features_cv_times.pkl', compress=True)
    
    return scores, times

def load_models():
    """Carga de archivos Pickle los puntajes y los tiempos calculados previamente"""
    
    scores = joblib.load('features_extraction/features_cv_scores.pkl')
    times = joblib.load('features_extraction/features_cv_times.pkl')
    
    return scores, times

In [21]:
def summary(scores, times):
    """Imprime un resumen con distintas estadisticas sobre los datos"""
    # Calculo en cada celda el promedio de los cross validation scores
    scores_mean = scores.applymap(np.mean)
    
     # Estadistica: Promedio sobre todos los clasificadores para cada eleccion de features
    mean = scores_mean.mean(axis=1)
    print 'Max of Mean Score Across Classifiers: %f for Extractor \'%s\'' % (mean.max(), mean.idxmax())
    
    # Estadistica: UCB sobre todos los clasificadores para cada eleccion de features
    # Inspirado por Optimización Bayesiana
    ucb = scores_mean.mean(axis=1) + 0.5 * scores_mean.std(axis=1)
    print 'Max of UCB Score Across Classifiers: %f for Extractor \'%s\'' % (ucb.max(), ucb.idxmax())
    
    # Estadistica: Mejor clasificador para cada eleccion de features
    best_clf = scores_mean.max(axis=1)
    best_clf_name = scores_mean.idxmax(axis=1)
    print 'Max of Score Using Best Classifier: %f for Extractor \'%s\' and Classifier \'%s\'' % \
        (best_clf.max(), best_clf.idxmax(), best_clf_name[best_clf.idxmax()])
        
    # Estadistica: Promedio sobre todos los clasificadores para cada eleccion de features
    mean = times.mean(axis=1)
    print 'Min of Mean Time Across Classifiers: %f for Extractor \'%s\'' % (mean.min(), mean.idxmin())
    
    # Estadistica: Mejor clasificador para cada eleccion de features
    fastest_clf = times.max(axis=1)
    fastest_clf_name = times.idxmax(axis=1)
    print 'Min of Time Using Fastest Classifier: %f for Extractor \'%s\' and Classifier \'%s\'' % \
        (fastest_clf.max(), fastest_clf.idxmax(), fastest_clf_name[fastest_clf.idxmax()])

In [None]:
scores, times = score_models(get_models_builders(), X, y, n_jobs=4) 

Running 10-Fold CV for model simple_dt
Done in 301.404000s
CV Scores:  [ 0.50649351  0.50649351  0.50649351  0.50649351  0.507       0.507
  0.50650651  0.50650651  0.50650651  0.50650651]
Mean:  0.5066000052 Std:  0.000200081883421

Running 10-Fold CV for model simple_random_forest
Done in 303.783000s
CV Scores:  [ 0.50649351  0.50649351  0.50649351  0.50649351  0.507       0.507
  0.50650651  0.50650651  0.50650651  0.50650651]
Mean:  0.5066000052 Std:  0.000200081883421

Running 10-Fold CV for model simple_bernoulli_nb
Done in 307.475000s
CV Scores:  [ 0.50649351  0.50649351  0.50649351  0.50649351  0.507       0.507
  0.50650651  0.50650651  0.50650651  0.50650651]
Mean:  0.5066000052 Std:  0.000200081883421

Running 10-Fold CV for model simple_multinomial_nb
Done in 306.021000s
CV Scores:  [ 0.50649351  0.50649351  0.50649351  0.50649351  0.507       0.507
  0.50650651  0.50650651  0.50650651  0.50650651]
Mean:  0.5066000052 Std:  0.000200081883421

Running 10-Fold CV for model si

In [None]:
summary(scores, times)

In [None]:
simple_extractors = [ ('body_length', cf.body_length),
                             ('count_spaces', cf.count_spaces),
                             ('has_html', cf.has_html), 
                             ('has_image', cf.has_image), 
                             ('number_of_sentences', cf.number_of_sentences) ]
X = train_set.drop('label', axis=1)
y = train_set['label']

In [None]:
%timeit Pipeline([('extractor', cf.SimpleFeaturesExtractor(simple_extractors)), ('clf', DecisionTreeClassifier())]).fit(X, y)

In [None]:
%timeit Pipeline([('extractor', column_extractor('body', 'bow')), ('clf', DecisionTreeClassifier())]).fit(X, y)

In [None]:
%timeit cf.SimpleFeaturesExtractor(simple_extractors).fit_transform(X, y)

In [None]:
%timeit column_extractor('body', 'bow').fit_transform(X, y)

In [None]:
X_transformed = cf.SimpleFeaturesExtractor(simple_extractors).fit_transform(X, y)

In [None]:
%timeit DecisionTreeClassifier().fit(X_transformed, y)

In [None]:
X_transformed = column_extractor('body', 'bow').fit_transform(X, y)

In [None]:
%timeit DecisionTreeClassifier().fit(X_transformed, y)