# Aprendizaje Automático - TP 1

In [7]:
%matplotlib inline

In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer

In [6]:
import simple_features as sf
import custom_features as cf
from util import *

## Features

Las siguientes features componen el conjunto de features simples a utilizar:

In [8]:
def simple_features():
    """Build the named extractor for the simple per-email features.

    The extractor is configured with five (name, function) pairs taken from
    the ``simple_features`` module (imported as ``sf``):

    1. ``length``: length of the email.
    2. ``count_spaces``: number of spaces in the email.
    3. ``has_html``: whether the email has HTML content.
    4. ``has_image``: whether the email has images.
    5. ``number_of_sentences``: number of sentences.

    Returns:
        A ``(name, extractor)`` tuple, in that order: the string
        ``"SimpleFeaturesExtractor"`` followed by the
        ``sf.SimpleFeaturesExtractor`` instance built from the pairs above.
    """
    named_extractors = [
        ('length', sf.body_length),
        ('count_spaces', sf.count_spaces),
        ('has_html', sf.has_html),
        ('has_image', sf.has_image),
        ('number_of_sentences', sf.number_of_sentences),
    ]
    return "SimpleFeaturesExtractor", sf.SimpleFeaturesExtractor(named_extractors)

A continuación, definimos nuestro pipeline para la extracción de features.
1. Se parsean los emails.
2. Se realiza la extracción de las simple features descriptas anteriormente.
3. Se computa la matriz de term frequency–inverse document frequency para:
    - El asunto de los mails.
    - El cuerpo de los mails.
4. Se utiliza el sentiment analyzer de NLTK para extraer la intención del mensaje.

In [9]:
# Feature-extraction pipeline: parse the raw emails, then combine the feature
# blocks produced by every branch of the FeatureUnion below.
features_extraction = Pipeline([
  # Parse the emails
  ('parser', cf.EmailParser()),

  # Use FeatureUnion to combine the features
  ('extraction', FeatureUnion(
      transformer_list=[
          # Simple features extractor: simple_features() already returns
          # a (name, extractor) tuple, so it is used as the entry directly
          simple_features(),

          # Pipeline for pulling TF-IDF features from the email's subject
          ('subject', Pipeline([
              ('selector', cf.ItemSelector(key = 'subject')),
              ('tfidf', TfidfVectorizer(stop_words='english')),
          ])),

          # Pipeline for pulling TF-IDF features from the email's body.
          # This entry must have its own name: it was previously also called
          # 'subject', and duplicated names in a FeatureUnion make the
          # entries indistinguishable when addressing their parameters.
          ('body', Pipeline([
              ('selector', cf.ItemSelector(key = 'body')),
              ('tfidf', TfidfVectorizer(stop_words='english')),
          ])),

          # Pipeline for the sentiment analysis feature
          ('sentiment_analysis', Pipeline([
              ('selector', cf.ItemSelector(key = 'body')),
              ('stats', cf.SentimentsStats()),
              ('vect', DictVectorizer()),  # list of dicts -> feature matrix
          ]))
      ]
  ))
])

## Experimentación

### Carga de datos

Cargamos y spliteamos el dataset

In [None]:
# Load the dataset and its labels. load_data is not defined in the visible
# cells; presumably it comes from `from util import *` -- TODO confirm.
data, labels = load_data()

### Evaluación de clasificadores 

#### Árbol de decisiones

In [None]:
# Decision tree classifier stacked on top of the feature-extraction pipeline.
dt_pipeline = Pipeline(steps=[
    ('feature_extractor', features_extraction),
    ('tree_classifier', DecisionTreeClassifier()),
])

# Evaluate it with the project's cross_validate helper, labelling the run
# 'DecisionTree'; cv_folds and n_jobs are forwarded to that helper.
cross_validate(
    dt_pipeline, 'DecisionTree', data, labels,
    cv_folds=10, n_jobs=-1,
)

### Pruebas de Features

Acá probamos distintos tipos de features.

#### Features del Baseline Example

In [None]:
# Run the decision tree using only the (name, extractor) pair returned by
# simple_features(). run_ml_pipeline is not defined in the visible cells
# (presumably provided by `from util import *` -- TODO confirm).
run_ml_pipeline(
    simple_features(), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

#### Features Vectorizer de SkLearn

In [None]:
# Run the decision tree on the features built by vectorizer_features('count').
# vectorizer_features is not defined in the visible cells; presumably 'count'
# selects a CountVectorizer -- TODO confirm against util.
run_ml_pipeline(
    vectorizer_features('count'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

In [None]:
# Run the decision tree on the features built by vectorizer_features('tfidf').
# vectorizer_features is not defined in the visible cells; presumably 'tfidf'
# selects a TfidfVectorizer -- TODO confirm against util.
run_ml_pipeline(
    vectorizer_features('tfidf'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

In [None]:
# Run the decision tree on the features built by vectorizer_features('hashing').
# vectorizer_features is not defined in the visible cells; presumably 'hashing'
# selects a HashingVectorizer -- TODO confirm against util.
run_ml_pipeline(
    vectorizer_features('hashing'), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

#### Pipeline (Sklearn)

In [None]:
# simple_features() returns (name, extractor) in that order, so the name has
# to be unpacked first. With the previous order, `simple_feats` held the
# string "SimpleFeaturesExtractor" and the FeatureUnion below received a
# string instead of a transformer.
simple_feats_descr, simple_feats = simple_features()

# NOTE(review): vectorizer_features is not defined in the visible cells, so
# the (vectorizer, description) unpacking order is kept as written. If it
# returns (name, vectorizer) like simple_features() does, these three lines
# need the same swap -- confirm against util.
tfidf_vect, tfidf_vect_descr = vectorizer_features(vectorizer_type='tfidf')
count_vect, count_vect_descr = vectorizer_features(vectorizer_type='count')
hashing_vect, hashing_vect_descr = vectorizer_features(vectorizer_type='hashing')

# Combine the simple features with the three vectorizers, then classify with
# a decision tree.
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ("simple_feats", simple_feats),
        ("tfidfvect", tfidf_vect),
        ('count', count_vect),
        ('hashing', hashing_vect),
    ])),
    ('clf', DecisionTreeClassifier())  # classifier
])

# NOTE(review): cross_val_score is not imported in the visible cells;
# presumably it is re-exported by `from util import *` -- confirm.
scores = cross_val_score(pipeline,
                         data,
                         labels,
                         cv=5,
                         scoring='accuracy',
                         n_jobs=8,
                         )
# Parenthesised form prints the same under Python 2 and is valid Python 3.
print(scores)