# Aprendizaje Automático - TP 1

In [1]:
%matplotlib inline

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [4]:
import simple_features as sf
import email_text_retrieval as etr
from util import *

## Features

Acá definimos los distintos tipos de features que usamos en las pruebas.

In [5]:
def simple_features():
    """Build the baseline extractor made of two simple attributes.

    The attributes are:
      1) 'length': length of the mail, computed with the builtin len.
      2) 'count_spaces': number of spaces in the mail, computed with
         sf.count_spaces.

    Returns a (feature_extractor, name) pair whose name is
    "SimpleFeaturesExtractor".
    """
    named_extractors = [
        ('length', len),
        ('count_spaces', sf.count_spaces),
    ]
    return sf.SimpleFeaturesExtractor(named_extractors), "SimpleFeaturesExtractor"

In [6]:
def vectorizer_features(vectorizer_type, use_custom_tokenizer=False, **kwargs):
    """Create a text vectorizer together with a descriptive name.

    Parameters:
        vectorizer_type: 'count' (CountVectorizer), 'tfidf' (TfidfVectorizer)
            or 'hashing' (HashingVectorizer).
        use_custom_tokenizer: when true, an etr.LemmaTokenizer instance is
            passed as the tokenizer; otherwise tokenizer=None is passed.
        **kwargs: forwarded unchanged to the vectorizer constructor.

    Returns:
        A (vectorizer, name) pair, where name is "<Descr>SparseVectorizer".

    Raises:
        ValueError: if vectorizer_type is not one of the three known values.
    """
    # The tokenizer is created before vectorizer_type is validated.
    if use_custom_tokenizer:
        tokenizer = etr.LemmaTokenizer()
    else:
        tokenizer = None

    # Select the class and its label first; the instance is built once below.
    if vectorizer_type == "count":
        vectorizer_cls, label = CountVectorizer, "BagOfWords"
    elif vectorizer_type == "tfidf":
        vectorizer_cls, label = TfidfVectorizer, "TfIdf"
    elif vectorizer_type == "hashing":
        vectorizer_cls, label = HashingVectorizer, "HashingBagOfWords"
    else:
        raise ValueError('Invalid vectorizer_type. Expected \'count\', \'tfidf\' or \'hashing\'')

    # Every vectorizer type uses the built-in English stop-word list.
    vectorizer = vectorizer_cls(tokenizer=tokenizer, stop_words='english', **kwargs)
    return vectorizer, "%sSparseVectorizer" % label

## Experimentacion

### Carga de datos

Cargamos el dataset y lo separamos en datos (`data`) y etiquetas (`labels`).

In [7]:
# Load the e-mails and their labels.
# load_data is not imported by name in this notebook; presumably it comes from
# `from util import *` - TODO confirm.
data, labels = load_data()

Dataset: 90000 samples(665.789MB) - Ham: 45000(50.00%) Spam: 45000(50.00%)


### Pruebas de Features

Acá probamos distintos tipos de features. Mediante la función `run_ml_pipeline` entrenamos, evaluamos y grabamos en disco los resultados y modelos de cada combinación que decidimos probar.

#### Features del Baseline Example

In [8]:
# Baseline run: the two simple features (length, space count) with a decision
# tree, using cv_folds=10 and n_jobs=8.
# NOTE(review): the recorded output of this cell ends with
# "NameError: global name 'os' is not defined" after cross-validation. This
# notebook imports os, so the missing import is presumably inside the module
# that defines run_ml_pipeline - verify.
run_ml_pipeline(
    simple_features(), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

Running ML Pipeline for SimpleFeaturesExtractor-DecisionTree(20160903-200133)
Extracting features from the dataset using a SimpleFeaturesExtractor
Done in 0.588000s
Set: 90000 samples 2 features

Fitting a DecisionTree Classifier
Done in 0.197000s

Running 10-Fold Cross Validation for DecisionTree
Done in 1.410000s
CV Score: mean 0.687811 std 0.018451


NameError: global name 'os' is not defined

#### Features Vectorizer de SkLearn

In [None]:
# Same decision-tree pipeline, now with bag-of-words counts (CountVectorizer)
# as features and no custom tokenizer.
run_ml_pipeline(
    vectorizer_features('count', use_custom_tokenizer=False), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

In [None]:
# TF-IDF features: sublinear_tf, min_df and max_df are forwarded through
# **kwargs to the TfidfVectorizer constructor.
run_ml_pipeline(
    vectorizer_features('tfidf', use_custom_tokenizer=False, sublinear_tf=True, min_df=0.01, max_df=0.7), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

In [None]:
# Hashed bag-of-words features with n_features = 2 ** 18 (262144).
# NOTE(review): non_negative is forwarded to HashingVectorizer; newer
# scikit-learn releases dropped that argument in favour of
# alternate_sign=False - confirm against the installed version before running.
run_ml_pipeline(
    vectorizer_features('hashing', use_custom_tokenizer=False, non_negative=True, n_features=2 ** 18), 
    (DecisionTreeClassifier(), 'DecisionTree'), 
    data, labels, cv_folds=10, n_jobs=8)

#### Features Nuestros

In [None]:
# Scratch check: fit a decision tree on a hard-coded 2x2 array, using the first
# two labels of the dataset as targets.
DecisionTreeClassifier().fit(np.array([[1, 2], [0, 5]]), labels[0:2])

In [None]:
# Small subset of the data for quick experiments.
# NOTE(review): the slices start at index 1, so element 0 is skipped and at
# most 999 items are kept - confirm [1:1000] (rather than [:1000]) is intended.
data_sub = data[1:1000]
labels_sub = labels[1:1000]

In [None]:
# Build a bag-of-words extractor (its descriptive name is discarded) and a
# fresh decision tree for a manual, step-by-step run.
extractor, _ = vectorizer_features('count', use_custom_tokenizer=False)
clf = DecisionTreeClassifier()

In [None]:
# Fit the vectorizer on the subset and transform that same subset into X.
X = extractor.fit_transform(data_sub)

In [None]:
# Display the type of the object returned by fit_transform.
type(X)

In [None]:
# Train the decision tree on the subset's features and labels.
clf.fit(X, labels_sub)