# Aprendizaje Automatico - TP 1

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

In [4]:
import features as cf
from util import *



## Features

Las siguientes features componen el conjunto de features simples(?) a utilizar:

In [5]:
def simple_extractors():
    # Extraigo dos atributos simples: 
    # 1) Longitud del mail.
    # 2) Cantidad de espacios en el mail.
    # 3) Tiene el mail contenido HTML?
    # 4) Tiene el mail imágenes?
    # 5) Cantidad de oraciones
    
    return [ ('body_length', cf.body_length), 
      ('count_spaces', cf.count_spaces), 
      ('has_html', cf.has_html), 
      ('has_image', cf.has_image), 
      ('number_of_sentences', cf.number_of_sentences) ]

In [6]:
def vectorizer_extractor(vectorizer_type, **kwargs):
    if vectorizer_type == "bow":
        vectorizer = CountVectorizer(stop_words='english', **kwargs)
    elif vectorizer_type == "tfidf":
        vectorizer = TfidfVectorizer(stop_words='english', **kwargs)
    elif vectorizer_type == "hashing_bow":
        vectorizer = HashingVectorizer(stop_words='english', **kwargs)
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'bow\', \'tfidf\' or \'hashing_bow\'')
    
    return vectorizer

## Experimentación

### Carga de datos
Cargamos y spliteamos el dataset

In [7]:
train_set, test_set = load_data()

Loading data from dataset/ham_dev.json
Done in 2.500000s
Loaded 45000(465.272MB) mails
Parsing mails
Done in 12.578000s
Parsed 45000 mails
Loading data from dataset/spam_dev.json
Done in 1.525000s
Loaded 45000(200.517MB) mails
Parsing mails
Done in 12.219000s
Parsed 45000 mails
Generating Pandas DataFrame
Done in 22.020000s
Splitting into Training and Test Set
Done in 22.051000s
Train Set: 72000 samples - Ham: 35978(0.50%) Spam: 36022(0.50%)
Test Set:  18000 samples - Ham: 9022(0.50%) Spam: 8978(0.50%)


### Extracción de atributos

A continuación, definimos nuestro pipeline para la extracción de features.
1. Se realiza la extracción de las simple features descriptas anteriormente.
2. Se computa la matriz de term frequency–inverse document frequency para:
    - El sujeto de los mails.
    - El cuerpo de los mails.
3. Se utiliza el sentiment analyzer de NLTK para extraer la intención del mensaje.

In [8]:
def features_extractors(simple_features=True, subject_vectorizer='tfidf', body_vectorizer='tfidf'):
    extractors = []
    if simple_features:
        # Simple features extactor
        extractors = [('simple_features', cf.SimpleFeaturesExtractor(simple_extractors()))]
    
    if subject_vectorizer is not None:
        # Pipeline for pulling vectorizer features from the post's subject
        extractors = extractors + \
            [('subject', Pipeline([
                ('selector', ColumnSelectorExtractor('subject')),
                (subject_vectorizer, vectorizer_extractor(subject_vectorizer)),
            ]))]
            
    if body_vectorizer is not None:
        # Pipeline for pulling vectorizer features from the post's body
        extractors = extractors + \
            [('body', Pipeline([
                ('selector', ColumnSelectorExtractor('body')),
                (body_vectorizer, vectorizer_extractor(body_vectorizer)),
            ]))]
    
    # Use FeatureUnion to combine the features
    return FeatureUnion(extractors)

### Evaluación de clasificadores 

#### Árbol de decisiones

In [None]:
parameters_grid = {
    'features_extractor__subject__tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'features_extractor__subject__tfidf__max_df': [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0],
    'features_extractor__subject__tfidf__min_df': [0.0, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 0.9], 
    'features_extractor__subject__tfidf__binary': [False, True],
    'features_extractor__subject__tfidf__sublinear_tf': [False, True],
    'features_extractor__body__tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'features_extractor__body__tfidf__max_df': [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0],
    'features_extractor__body__tfidf__min_df': [0.0, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 0.9], 
    'features_extractor__body__tfidf__binary': [False, True],
    'features_extractor__body__tfidf__sublinear_tf': [False, True],
    'tree_classifier__criterion': ['gini', 'entropy'],
    'tree_classifier__max_features': ['sqrt', 'log2', 0.5, None],
    'tree_classifier__max_depth': [3, 5, 10, None],
    'tree_classifier__min_samples_split': [1, 3, 5, 10],
    'tree_classifier__min_samples_leaf': [1, 3, 5, 10]    
}

dt = GridSearchCV(Pipeline([
  ('features_extractor', features_extractors()),
  ('tree_classifier', DecisionTreeClassifier())
]), parameters_grid, cv=10, n_jobs=-1, verbose=1)

In [None]:
dt.fit(train_set, train_set['label'])

Fitting 10 folds for each of 231211008 candidates, totalling 2312110080 fits
