# Selección de Features

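El objetivo de este notebook es evaluar distintas combinaciones de features y clasificadores para la detección de spam, y seleccionar el conjunto de features con mejor desempeño.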

In [1]:
%matplotlib inline

In [2]:
import itertools

import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score

In [4]:
import features as cf
from util import *



## Features

Las siguientes features componen el conjunto de features simples a utilizar:

In [5]:
def simple_extractors():
    """Return the (name, function) pairs for the simple hand-made features.

    Five attributes are extracted, in this order:
      1) body_length: length of the mail.
      2) count_spaces: number of spaces in the mail.
      3) has_html: does the mail have HTML content?
      4) has_image: does the mail have images?
      5) number_of_sentences: number of sentences.

    Each function is looked up by name on the ``cf`` (``features``) module.
    """
    feature_names = (
        'body_length',
        'count_spaces',
        'has_html',
        'has_image',
        'number_of_sentences',
    )
    return [(feature_name, getattr(cf, feature_name))
            for feature_name in feature_names]

In [6]:
def vectorizer_extractor(vectorizer_type, **kwargs):
    """Instantiate the text vectorizer identified by ``vectorizer_type``.

    Parameters:
        vectorizer_type: 'bow' (CountVectorizer), 'tfidf' (TfidfVectorizer)
            or 'hashing_bow' (HashingVectorizer).
        **kwargs: extra keyword arguments forwarded to the vectorizer
            constructor.

    Returns:
        The new vectorizer, always configured with English stop words and
        an n-gram range of (1, 2).

    Raises:
        ValueError: if ``vectorizer_type`` is not one of the three names.
    """
    # Choose the class first so the shared options are written only once.
    if vectorizer_type == "bow":
        vectorizer_class = CountVectorizer
    elif vectorizer_type == "tfidf":
        vectorizer_class = TfidfVectorizer
    elif vectorizer_type == "hashing_bow":
        vectorizer_class = HashingVectorizer
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'bow\', \'tfidf\' or \'hashing_bow\'')

    return vectorizer_class(stop_words='english', ngram_range=(1, 2), **kwargs)

In [7]:
def column_extractors(column_name, sentiment_analysis=True, vectorizer_type='tfidf', **vect_kwargs):
    """Build the feature-extraction pipeline for a single column.

    Parameters:
        column_name: name of the column handed to ColumnSelectorExtractor.
        sentiment_analysis: when true, add a sentiment-statistics step
            (cf.SentimentsStats followed by DictVectorizer).
        vectorizer_type: 'bow', 'tfidf' or 'hashing_bow' to add a text
            vectorizer step, or None to skip it.
        **vect_kwargs: forwarded to vectorizer_extractor.

    Returns:
        A Pipeline that selects the column and then applies the requested
        extractor (a FeatureUnion when both are requested), or None when no
        extractor was requested.

    Raises:
        ValueError: propagated from vectorizer_extractor for an unknown
            ``vectorizer_type``.
    """
    extractors = []
    if sentiment_analysis:
        # Step for pulling sentiment features from the column
        extractors.append(('sentiment_analysis', Pipeline([
            ('stats', cf.SentimentsStats()),
            ('vect', DictVectorizer()),  # list of dicts -> feature matrix
        ])))

    if vectorizer_type is not None:
        # Step for pulling vectorizer features from the column.
        # Bug fix: the original assigned the extended list to a misspelled
        # name (`extractor`), so the vectorizer step was silently dropped.
        extractors.append(
            (vectorizer_type, vectorizer_extractor(vectorizer_type, **vect_kwargs)))

    if not extractors:
        return None
    if len(extractors) == 1:
        # A single step needs no FeatureUnion wrapper.
        extractor_name, extractor = extractors[0]
    else:
        extractor_name, extractor = 'extractors', FeatureUnion(extractors)

    return Pipeline([
        ('selector', ColumnSelectorExtractor(column_name)),
        (extractor_name, extractor)])

In [8]:
def features_extractors(simple_features=True,
                        subject_sentiment_analysis=False,
                        body_sentiment_analysis=False,
                        subject_vectorizer='tfidf',
                        body_vectorizer='tfidf',
                        **vect_kwargs):
    """Assemble the complete feature extractor for a mail.

    Parameters:
        simple_features: include the hand-made features returned by
            simple_extractors().
        subject_sentiment_analysis: add sentiment features for 'subject'.
        body_sentiment_analysis: add sentiment features for 'body'.
        subject_vectorizer: vectorizer type for 'subject', or None.
        body_vectorizer: vectorizer type for 'body', or None.
        **vect_kwargs: forwarded to both column pipelines.

    Returns:
        None when nothing was requested, the single extractor when exactly
        one was requested, otherwise a FeatureUnion of all of them.
    """
    extractors = []
    if simple_features:
        # Some simple handmade features
        extractors.append(('simple', cf.SimpleFeaturesExtractor(simple_extractors())))

    subject_extractors = column_extractors('subject',
                                           sentiment_analysis=subject_sentiment_analysis,
                                           vectorizer_type=subject_vectorizer,
                                           **vect_kwargs)
    # Bug fix: the body pipeline used to be built from the *subject* options,
    # so body_sentiment_analysis and body_vectorizer were ignored.
    body_extractors = column_extractors('body',
                                        sentiment_analysis=body_sentiment_analysis,
                                        vectorizer_type=body_vectorizer,
                                        **vect_kwargs)

    if subject_extractors is not None:
        extractors.append(('subject', subject_extractors))
    if body_extractors is not None:
        extractors.append(('body', body_extractors))

    if not extractors:
        return None
    if len(extractors) == 1:
        # A lone extractor is returned bare, without a FeatureUnion wrapper.
        return extractors[0][1]
    return FeatureUnion(extractors)

## Experimentación

In [9]:
# Load the mails and split them into a training set and a test set.
# NOTE(review): load_data is not defined in this notebook; presumably it comes
# from `from util import *`, and the argument presumably caps the total number
# of mails (the log below reports 16000 train + 4000 test samples) - confirm.
train_set, test_set = load_data(20000)

Loading data from dataset/ham_dev.json
Done in 2.487000s
Loaded 45000(465.272MB) mails
Parsing mails
Done in 1.057000s
Parsed 10000 mails
Loading data from dataset/spam_dev.json
Done in 1.555000s
Loaded 45000(200.517MB) mails
Parsing mails
Done in 2.203000s
Parsed 10000 mails
Generating Pandas DataFrame
Done in 6.250000s
Splitting into Training and Test Set
Done in 6.257000s
Train Set: 16000 samples - Ham: 8009(0.50%) Spam: 7991(0.50%)
Test Set:  4000 samples - Ham: 1991(0.50%) Spam: 2009(0.50%)


### Evaluación de features

Para cada paso posible del pipeline se listan las opciones a evaluar (similar a un Grid Search, pero estático).

In [10]:
# Candidate settings for every stage of the pipeline. The model-building cell
# walks all combinations of these lists.
opt_simple_features = [False, True]
opt_subject_sentiment_analysis = [False, True]
opt_body_sentiment_analysis = [False, True]

# None means "no vectorizer for this column".
opt_subject_vect = [None, 'bow', 'tfidf', 'hashing_bow']
opt_body_vect = [None, 'bow', 'tfidf', 'hashing_bow']

# Short classifier names, in the order the models are built.
opt_classifier = ['dt', 'random_forest', 'bernoulli_nb', 'multinomial_nb', 'knn', 'svm']

# Short name -> estimator class (instantiated with default arguments later).
classifier_dict = dict(
    dt=DecisionTreeClassifier,
    random_forest=RandomForestClassifier,
    bernoulli_nb=BernoulliNB,
    multinomial_nb=MultinomialNB,
    knn=KNeighborsClassifier,
    svm=SVC,
)

In [11]:
# Holds one (features_name, classifier_name, pipeline) triple per candidate model.
models = []

In [12]:
# Build one (features_name, classifier_name, pipeline) triple for every
# combination of feature options and classifier. Combinations that select no
# feature at all are skipped.
print('Building models')
t0 = time.time()

# Rebind the list so that re-running this cell does not duplicate entries.
models = []

# itertools.product varies the last iterable fastest, which is the same order
# the equivalent nested for-loops would produce.
option_grid = itertools.product(opt_simple_features,
                                opt_subject_sentiment_analysis,
                                opt_body_sentiment_analysis,
                                opt_subject_vect,
                                opt_body_vect,
                                opt_classifier)

for (simple_features, subject_sentiment_analysis, body_sentiment_analysis,
     subject_vect, body_vect, classifier) in option_grid:
    # The name encodes which feature groups are active, joined by '_'.
    name_parts = []
    if simple_features:
        name_parts.append('simple')

    if subject_sentiment_analysis and body_sentiment_analysis:
        name_parts.append('all_sentiment')
    elif subject_sentiment_analysis:
        name_parts.append('subject_sentiment')
    elif body_sentiment_analysis:
        name_parts.append('body_sentiment')

    if subject_vect is not None:
        name_parts.append('subject_' + subject_vect)
    if body_vect is not None:
        name_parts.append('body_' + body_vect)

    features_name = '_'.join(name_parts)

    # A fresh extractor per model, so pipelines never share estimator objects.
    extractors = features_extractors(simple_features=simple_features,
                                     subject_sentiment_analysis=subject_sentiment_analysis,
                                     body_sentiment_analysis=body_sentiment_analysis,
                                     subject_vectorizer=subject_vect,
                                     body_vectorizer=body_vect)
    if extractors is None:
        continue

    model = Pipeline([
        ('features_extractor', extractors),
        # The step keeps the name 'tree_classifier' for every classifier type
        # so existing references to the step name stay valid.
        ('tree_classifier', classifier_dict[classifier]()),
    ])

    models.append((features_name, classifier, model))

duration = time.time() - t0
print("Done in %fs" % duration)

Building models
Done in 0.029000s


En principio corrimos la búsqueda; ahora cargamos los resultados desde un Pickle para analizarlos.

In [31]:
# The 10-fold cross-validation search below was run once and its results were
# dumped to disk; it is kept commented out and the cached results are loaded
# instead.
# NOTE(review): `joblib` is not imported in any visible cell - presumably it
# arrives through `from util import *`; confirm.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load .pkl files that were produced by this notebook.
# scores = {}
# for features_name, classifier_name, model in models:
#     print 'Running 10-Fold CV for model', features_name + '_' +  classifier_name                     
#                         
#     t0 = time.time()
#     score = cross_val_score(model, train_set, train_set['label'], cv=10, n_jobs=8)
#     duration = time.time() - t0
#     print "Done in %fs" % duration
#     if not features_name in scores:
#         scores[features_name] = {}
#         
#     scores[features_name][classifier_name] = score
#                         
#     print 'CV Scores: ', score
#     print 'Mean: ', np.mean(score), 'Std: ', np.std(score)
#     print ''
# joblib.dump(scores, 'features_cv_scores.pkl', compress=True)
# joblib.dump(models, 'features_models.pkl', compress=True)
scores = joblib.load('features_cv_scores.pkl')
# Replaces the `models` list built earlier with the pickled one.
models = joblib.load('features_models.pkl')
scores

{'all_sentiment': {'bernoulli_nb': array([ 0.61648969,  0.64875   ,  0.63875   ,  0.611875  ,  0.6325    ,
          0.630625  ,  0.6125    ,  0.6475    ,  0.63      ,  0.63101939]),
  'dt': array([ 0.69768894,  0.720625  ,  0.711875  ,  0.705625  ,  0.695625  ,
          0.704375  ,  0.703125  ,  0.72125   ,  0.708125  ,  0.69355847]),
  'knn': array([ 0.69331668,  0.72375   ,  0.726875  ,  0.695625  ,  0.7075    ,
          0.714375  ,  0.693125  ,  0.725625  ,  0.708125  ,  0.71357098]),
  'multinomial_nb': array([ 0.59900062,  0.6375    ,  0.62375   ,  0.60625   ,  0.614375  ,
          0.604375  ,  0.5975    ,  0.624375  ,  0.62      ,  0.60537836]),
  'random_forest': array([ 0.70830731,  0.734375  ,  0.728125  ,  0.715625  ,  0.72      ,
          0.7275    ,  0.7075    ,  0.735     ,  0.719375  ,  0.71794872]),
  'svm': array([ 0.60961899,  0.638125  ,  0.630625  ,  0.605     ,  0.621875  ,
          0.618125  ,  0.598125  ,  0.628125  ,  0.62625   ,  0.61538462])},
 'all_senti

In [32]:
# Tabulate the nested dict: one column per feature set, one row per
# classifier, each cell holding that model's array of CV scores.
scores_df = pd.DataFrame.from_dict(scores)
scores_df

Unnamed: 0,all_sentiment,all_sentiment_body_bow,all_sentiment_body_hashing_bow,all_sentiment_body_tfidf,all_sentiment_subject_bow,all_sentiment_subject_bow_body_bow,all_sentiment_subject_bow_body_hashing_bow,all_sentiment_subject_bow_body_tfidf,all_sentiment_subject_hashing_bow,all_sentiment_subject_hashing_bow_body_bow,...,subject_sentiment_subject_bow_body_hashing_bow,subject_sentiment_subject_bow_body_tfidf,subject_sentiment_subject_hashing_bow,subject_sentiment_subject_hashing_bow_body_bow,subject_sentiment_subject_hashing_bow_body_hashing_bow,subject_sentiment_subject_hashing_bow_body_tfidf,subject_sentiment_subject_tfidf,subject_sentiment_subject_tfidf_body_bow,subject_sentiment_subject_tfidf_body_hashing_bow,subject_sentiment_subject_tfidf_body_tfidf
bernoulli_nb,"[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...",...,"[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0...","[0.616489693941, 0.64875, 0.63875, 0.611875, 0..."
dt,"[0.69768894441, 0.720625, 0.711875, 0.705625, ...","[0.693316677077, 0.72, 0.711875, 0.705625, 0.6...","[0.699562773267, 0.7175, 0.71375, 0.7, 0.69187...","[0.69768894441, 0.71875, 0.715, 0.706875, 0.70...","[0.694565896315, 0.719375, 0.711875, 0.705, 0....","[0.703310430981, 0.71875, 0.71125, 0.70375, 0....","[0.699562773267, 0.71625, 0.701875, 0.705, 0.6...","[0.69768894441, 0.723125, 0.705, 0.7075, 0.695...","[0.698313554029, 0.718125, 0.708125, 0.7, 0.69...","[0.699562773267, 0.718125, 0.71, 0.706875, 0.6...",...,"[0.701436602124, 0.7175, 0.71125, 0.7025, 0.69...","[0.690818238601, 0.7225, 0.71125, 0.70625, 0.6...","[0.695815115553, 0.725625, 0.715, 0.709375, 0....","[0.69768894441, 0.72, 0.71125, 0.708125, 0.693...","[0.693316677077, 0.723125, 0.711875, 0.70375, ...","[0.700187382886, 0.723125, 0.708125, 0.713125,...","[0.695815115553, 0.72, 0.70875, 0.708125, 0.69...","[0.693316677077, 0.72125, 0.7075, 0.703125, 0....","[0.698938163648, 0.724375, 0.710625, 0.71125, ...","[0.699562773267, 0.71375, 0.706875, 0.70625, 0..."
knn,"[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...",...,"[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ...","[0.693316677077, 0.72375, 0.726875, 0.695625, ..."
multinomial_nb,"[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...",...,"[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61...","[0.59900062461, 0.6375, 0.62375, 0.60625, 0.61..."
random_forest,"[0.708307307933, 0.734375, 0.728125, 0.715625,...","[0.717676452217, 0.726875, 0.7225, 0.715, 0.71...","[0.701436602124, 0.723125, 0.726875, 0.70375, ...","[0.712054965646, 0.73125, 0.7175, 0.71125, 0.7...","[0.698938163648, 0.728125, 0.723125, 0.713125,...","[0.704559650219, 0.75, 0.7175, 0.708125, 0.708...","[0.712054965646, 0.735, 0.7275, 0.713125, 0.71...","[0.7039350406, 0.7375, 0.726875, 0.70625, 0.71...","[0.698938163648, 0.73, 0.725625, 0.708125, 0.7...","[0.705808869457, 0.739375, 0.719375, 0.701875,...",...,"[0.708931917552, 0.74125, 0.715625, 0.710625, ...","[0.700187382886, 0.728125, 0.704375, 0.704375,...","[0.702685821362, 0.738125, 0.718125, 0.706875,...","[0.696439725172, 0.726875, 0.7175, 0.709375, 0...","[0.700187382886, 0.7325, 0.726875, 0.724375, 0...","[0.716427232979, 0.734375, 0.72625, 0.713125, ...","[0.698313554029, 0.74125, 0.7125, 0.711875, 0....","[0.7039350406, 0.740625, 0.72, 0.715625, 0.706...","[0.703310430981, 0.7325, 0.724375, 0.71125, 0....","[0.710805746408, 0.731875, 0.725, 0.7025, 0.70..."
svm,"[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....",...,"[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0....","[0.609618988132, 0.638125, 0.630625, 0.605, 0...."


In [33]:
# Collapse every cell's array of CV scores into a (mean, std) tuple.
# The outer apply walks the columns (feature sets); the inner apply walks the
# entries (classifiers) of that column.
feature_mean_and_std_df = scores_df.apply(lambda feature_df: 
                                          feature_df.apply(lambda model_cv: (np.mean(model_cv), np.std(model_cv))))
feature_mean_and_std_df

Unnamed: 0,all_sentiment,all_sentiment_body_bow,all_sentiment_body_hashing_bow,all_sentiment_body_tfidf,all_sentiment_subject_bow,all_sentiment_subject_bow_body_bow,all_sentiment_subject_bow_body_hashing_bow,all_sentiment_subject_bow_body_tfidf,all_sentiment_subject_hashing_bow,all_sentiment_subject_hashing_bow_body_bow,...,subject_sentiment_subject_bow_body_hashing_bow,subject_sentiment_subject_bow_body_tfidf,subject_sentiment_subject_hashing_bow,subject_sentiment_subject_hashing_bow_body_bow,subject_sentiment_subject_hashing_bow_body_hashing_bow,subject_sentiment_subject_hashing_bow_body_tfidf,subject_sentiment_subject_tfidf,subject_sentiment_subject_tfidf_body_bow,subject_sentiment_subject_tfidf_body_hashing_bow,subject_sentiment_subject_tfidf_body_tfidf
bernoulli_nb,"(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)",...,"(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)","(0.630000908106, 0.0124873886848)"
dt,"(0.706187241846, 0.00909772769978)","(0.705687905982, 0.0102282904778)","(0.704874859253, 0.0103297916625)","(0.70787493728, 0.00962829354664)","(0.704875210645, 0.0109116596959)","(0.705186851416, 0.0101746025257)","(0.704937359253, 0.0101645410439)","(0.705500132715, 0.0100765151683)","(0.705250015503, 0.00882550757115)","(0.707562437427, 0.00824362972583)",...,"(0.705124859399, 0.00963719276682)","(0.705875366699, 0.0103195714001)","(0.708187984351, 0.00923449195107)","(0.707562632715, 0.00990701225079)","(0.706624897852, 0.0116791296732)","(0.707312320215, 0.00928450139293)","(0.705500484351, 0.00982628051349)","(0.704187905982, 0.0107628014031)","(0.706999976465, 0.0106172608663)","(0.704874976514, 0.00804233759145)"
knn,"(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)",...,"(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)","(0.710188765894, 0.0123653147942)"
multinomial_nb,"(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)",...,"(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)","(0.613250398609, 0.0122871549412)"
random_forest,"(0.721375602588, 0.00922871324202)","(0.721812047973, 0.00979383890928)","(0.715626461963, 0.010376083771)","(0.717812751099, 0.00948809329327)","(0.719751266334, 0.0100636161091)","(0.719187828687, 0.0147542352857)","(0.720312516577, 0.00823028693842)","(0.719376344898, 0.00975854754374)","(0.717876031812, 0.0134196034565)","(0.718251110523, 0.0104899739411)",...,"(0.721438493506, 0.0111476351371)","(0.71618814104, 0.0117438440032)","(0.718000797583, 0.0121661434165)","(0.714500679834, 0.00949219658501)","(0.721001344605, 0.013907676397)","(0.721562516919, 0.00900106214975)","(0.716876266285, 0.0123148481772)","(0.716000367725, 0.0104405639066)","(0.71843896211, 0.0115324709462)","(0.717500446436, 0.0103743162519)"
svm,"(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)",...,"(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)","(0.619125360352, 0.0117407345912)"


In [34]:
# For each feature set, average the CV means of all classifiers, then report
# the best average and the feature set that achieved it.
# `feature_df` is one column: a Series of (cv_mean, cv_std) tuples.
mean = feature_mean_and_std_df.apply(lambda feature_df: np.mean([cv_mean for cv_mean, cv_std in feature_df]))
print (mean.max(), mean.idxmax())

(0.86927118110772039, 'simple_all_sentiment_subject_bow')


In [35]:
# Optimistic score per feature set: mean of the classifiers' CV means plus
# alpha times the standard deviation of those means.
# NOTE(review): the spread term is the std *across classifiers*; the per-fold
# cv_std stored in each tuple is unpacked but never used - confirm this is
# the intended statistic.
alpha = 0.5  # weight of the spread term
ucb = feature_mean_and_std_df.apply(lambda feature_df: 
                                    np.mean([cv_mean for cv_mean, cv_std in feature_df]) +
                                   alpha * np.std([cv_mean for cv_mean, cv_std in feature_df]))
print (ucb.max(), ucb.idxmax())

(0.91944430673287858, 'simple_all_sentiment_subject_bow')
