# Aprendizaje Automático - TP 1

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score

In [4]:
import features as cf
from util import *



## Features

Las siguientes features componen el conjunto de features simples a utilizar:

In [23]:
def simple_extractors():
    """Return the (name, extractor) pairs for the simple mail features.

    Each extractor is looked up in the project's ``features`` module
    (imported as ``cf``) under the same name as the feature. Per the
    original notes, the five features are meant to capture:

      1) body_length: length of the mail.
      2) count_spaces: number of spaces in the mail.
      3) has_html: whether the mail has HTML content.
      4) has_image: whether the mail has images.
      5) number_of_sentences: number of sentences.

    Returns:
        list of (str, callable) tuples, in the order listed above.
    """
    feature_names = (
        'body_length',
        'count_spaces',
        'has_html',
        'has_image',
        'number_of_sentences',
    )
    # The feature name doubles as the attribute name of its extractor in `cf`.
    return [(feature_name, getattr(cf, feature_name)) for feature_name in feature_names]

## Experimentación

### Carga de datos
Cargamos el dataset y lo dividimos en un conjunto de entrenamiento y uno de test.

In [7]:
# Load the dataset and unpack the two splits returned by the helper
# (load_data is presumably provided by `from util import *` -- confirm).
train_set, test_set = load_data()

Loading data from dataset/ham_dev.json
Done in 5.484000s
Loaded 45000(465.272MB) mails
Parsing mails
Done in 0.125000s
Parsed 500 mails
Loading data from dataset/spam_dev.json
Done in 3.206000s
Loaded 45000(200.517MB) mails
Parsing mails
Done in 0.200000s
Parsed 500 mails
Generating Pandas DataFrame
Done in 0.531000s
Splitting into Training and Test Set
Done in 0.535000s
Train Set: 800 samples - Ham: 401(0.50%) Spam: 399(0.50%)
Test Set:  200 samples - Ham: 99(0.49%) Spam: 101(0.51%)


### Extracción de atributos

A continuación, definimos nuestro pipeline para la extracción de features.
1. Se realiza la extracción de las simple features descriptas anteriormente.
2. Se computa la matriz de term frequency–inverse document frequency para:
    - El asunto de los mails.
    - El cuerpo de los mails.
3. Se utiliza el sentiment analyzer de NLTK para extraer la intención del mensaje.

In [24]:
# Combine every feature source into a single feature matrix. Each entry is a
# (name, transformer) pair; names must be unique within the union because they
# are used as parameter prefixes (e.g. 'body__tfidf__...').
features_extractor = FeatureUnion(
    [
        # Simple features extractor
        ('simple_features', cf.SimpleFeaturesExtractor(simple_extractors())),

        # TF-IDF features pulled from the mail's subject
        ('subject', Pipeline([
            ('selector', ColumnSelectorExtractor('subject')),
            ('tfidf', TfidfVectorizer(stop_words='english')),
        ])),

        # TF-IDF features pulled from the mail's body. Registered as 'body'
        # (it used to be a second 'subject') so it no longer collides with
        # the subject pipeline above.
        ('body', Pipeline([
            ('selector', ColumnSelectorExtractor('body')),
            ('tfidf', TfidfVectorizer(stop_words='english')),
        ])),

        # Sentiment-analysis features computed from the mail's body
        ('sentiment_analysis', Pipeline([
            ('selector', ColumnSelectorExtractor('body')),
            ('stats', cf.SentimentsStats()),
            ('vect', DictVectorizer()),  # list of dicts -> feature matrix
        ]))
    ]
)

### Evaluación de clasificadores 

#### Árbol de decisiones

In [None]:
# Decision tree classifier on top of the shared feature extractor.
dt_pipeline = Pipeline([
    ('features_extractor', features_extractor),
    # Fixed seed: the tree picks among equally good splits at random, so
    # without it the scores change between runs of the notebook.
    ('tree_classifier', DecisionTreeClassifier(random_state=0))
])

# 10-fold cross-validation on the training set, parallelised with n_jobs=-1.
# The resulting score array is the cell's displayed output.
# NOTE(review): train_set is passed as X while still containing the 'label'
# column; the selectors shown here only read 'subject'/'body', but confirm
# that cf.SimpleFeaturesExtractor does not read 'label' either.
cross_val_score(dt_pipeline, train_set, train_set['label'], cv=10, n_jobs=-1)