# Aprendizaje Automático - TP 1

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [4]:
import simple_features as sf
from util import *

## Features

Acá definimos distintos tipos de features para hacer las pruebas.

In [5]:
def simple_features():
    """Build the baseline feature extractor and its descriptive name.

    Two simple attributes are extracted from each mail:
      1) 'length'       -- the length of the mail (built-in ``len``).
      2) 'count_spaces' -- the number of spaces in the mail
                           (``sf.count_spaces``).

    Returns:
        A ``(feature_extractor, name)`` tuple, where ``feature_extractor`` is
        an ``sf.SimpleFeaturesExtractor`` built from the (name, callable)
        pairs above and ``name`` is the string "SimpleFeaturesExtractor".
    """
    named_extractors = [
        ('length', len),
        ('count_spaces', sf.count_spaces),
    ]
    return sf.SimpleFeaturesExtractor(named_extractors), "SimpleFeaturesExtractor"

In [6]:
def vectorizer_features(vectorizer_type, **kwargs):
    """Create a scikit-learn text vectorizer and its descriptive name.

    Args:
        vectorizer_type: One of 'count' (CountVectorizer), 'tfidf'
            (TfidfVectorizer) or 'hashing' (HashingVectorizer).
        **kwargs: Extra keyword arguments forwarded to the vectorizer
            constructor. ``stop_words`` defaults to 'english' but may be
            overridden by the caller (e.g. ``stop_words=None``).

    Returns:
        A ``(vectorizer, name)`` tuple, where ``name`` is the vectorizer
        description followed by "SparseVectorizer".

    Raises:
        ValueError: If ``vectorizer_type`` is not one of the supported values.
    """
    # Apply the default through setdefault instead of passing it explicitly
    # next to **kwargs: an explicit stop_words=... would collide with a
    # caller-supplied one and raise TypeError (multiple values for keyword).
    kwargs.setdefault('stop_words', 'english')

    if vectorizer_type == "count":
        vectorizer = CountVectorizer(**kwargs)
        vectorizer_descr = "BagOfWords"
    elif vectorizer_type == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
        vectorizer_descr = "TfIdf"
    elif vectorizer_type == "hashing":
        vectorizer = HashingVectorizer(**kwargs)
        vectorizer_descr = "HashingBagOfWords"
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'count\', \'tfidf\' or \'hashing\'')

    return vectorizer, "%sSparseVectorizer" % vectorizer_descr

## Experimentación

### Carga de datos

Cargamos el dataset y lo separamos en muestras (`data`) y etiquetas (`labels`).

In [7]:
# Load the dataset and unpack it into `data` and `labels`.
# NOTE(review): load_data is presumably provided by `from util import *` — confirm.
data, labels = load_data()

Dataset: 90000 samples(665.789MB) - Ham: 45000(50.00%) Spam: 45000(50.00%)


### Pruebas de Features

Acá probamos distintos tipos de features. Mediante la función `run_ml_pipeline` entrenamos, benchmarkeamos y grabamos en disco los resultados y modelos de cada combinación que decidimos probar.

#### Features del Baseline Example

In [8]:
# Baseline run: the simple features (length, count_spaces) with a default
# DecisionTreeClassifier. Per the recorded output, run_ml_pipeline extracts
# the features, runs 10-fold cross validation and saves the trained
# extractor and model.
# NOTE(review): DecisionTreeClassifier() is created without random_state, so
# CV scores may vary between runs — consider seeding it.
# NOTE(review): n_jobs=8 presumably sets the number of parallel workers — confirm in util.
run_ml_pipeline(
    simple_features(), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

Running ML Pipeline for SimpleFeaturesExtractor-DecisionTree(20160903-203024)
Extracting features from the dataset using a SimpleFeaturesExtractor
Done in 0.605000s
Set: 90000 samples 2 features

Running 10-Fold Cross Validation for DecisionTree
Done in 1.360000s
CV Score: mean 0.687478 std 0.019115
Saving Trained Extractor and Model SimpleFeaturesExtractor-DecisionTree


#### Features Vectorizer de SkLearn

In [9]:
# Bag-of-words run: CountVectorizer features with a default
# DecisionTreeClassifier, evaluated with 10-fold cross validation.
# NOTE(review): the classifier is unseeded (no random_state), so scores may
# vary between runs.
run_ml_pipeline(
    vectorizer_features('count'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

Running ML Pipeline for BagOfWordsSparseVectorizer-DecisionTree(20160903-203026)
Extracting features from the dataset using a BagOfWordsSparseVectorizer
Done in 87.154000s
Set: 90000 samples 7460706 features

Running 10-Fold Cross Validation for DecisionTree
Done in 1699.139000s
CV Score: mean 0.993822 std 0.004529
Saving Trained Extractor and Model BagOfWordsSparseVectorizer-DecisionTree


In [10]:
# TF-IDF run: TfidfVectorizer features with a default
# DecisionTreeClassifier, evaluated with 10-fold cross validation.
# NOTE(review): the classifier is unseeded (no random_state), so scores may
# vary between runs.
run_ml_pipeline(
    vectorizer_features('tfidf'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

Running ML Pipeline for TfIdfSparseVectorizer-DecisionTree(20160903-210408)
Extracting features from the dataset using a TfIdfSparseVectorizer
Done in 96.903000s
Set: 90000 samples 7460706 features

Running 10-Fold Cross Validation for DecisionTree
Done in 1613.365000s
CV Score: mean 0.992622 std 0.005582
Saving Trained Extractor and Model TfIdfSparseVectorizer-DecisionTree


In [11]:
# Hashing run: HashingVectorizer features with a default
# DecisionTreeClassifier, evaluated with 10-fold cross validation.
# NOTE(review): the classifier is unseeded (no random_state), so scores may
# vary between runs.
run_ml_pipeline(
    vectorizer_features('hashing'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

Running ML Pipeline for HashingBagOfWordsSparseVectorizer-DecisionTree(20160903-213641)
Extracting features from the dataset using a HashingBagOfWordsSparseVectorizer
Done in 54.131000s
Set: 90000 samples 1048576 features

Running 10-Fold Cross Validation for DecisionTree
Done in 188.892000s
CV Score: mean 0.992744 std 0.005164
Saving Trained Extractor and Model HashingBagOfWordsSparseVectorizer-DecisionTree


#### Features Nuestros