In [None]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Prepare data

# Load the 'messages_and_categories' table from the SQLite file DisasterResponse.db.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('messages_and_categories', engine)

# Feature: the message text. Targets: every column left after dropping the
# three non-label columns listed below.
X = df['message']
y = df.drop(columns=['message', 'original', 'genre'])

# Names of the target columns, in DataFrame order.
target_names = list(y.columns)

def tokenize(text):
    """Convert a raw message into a list of unique, lemmatized, lower-case tokens.

    Processing steps, in order:
      1. every URL is replaced by the literal word ``urlplaceholder``;
      2. the text is lower-cased and every character outside ``[a-zA-Z0-9]``
         becomes a space;
      3. the text is split with NLTK's ``word_tokenize``;
      4. each token is stripped and lemmatized with ``WordNetLemmatizer``;
      5. duplicates are removed, keeping the first occurrence of each token.

    Stop words are NOT removed.

    Args:
        text (str): the raw message.

    Returns:
        list of str: unique lemmatized tokens in order of first appearance.
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in a normal string literal (a warning on recent Python versions).
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # replace each url in text string with placeholder
    text = re.sub(url_regex, "urlplaceholder", text)

    # normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    tokens = word_tokenize(text)

    # strip whitespaces and lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w.strip()) for w in tokens]

    # Order-preserving de-duplication (dict keys keep insertion order). Same
    # result as the former pandas Series.drop_duplicates() round-trip, without
    # building a Series for every message. Because each token survives at most
    # once, a vectorizer using this tokenizer sees a count of at most 1 per
    # token and message.
    return list(dict.fromkeys(tokens))

# Hold out 25% of the rows for testing (train_test_split's default share,
# spelled out here); the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### **RandomForestClassifier**

**First Round**

In [None]:
# RandomForest, round 1: search the vectorizer's max_df and the forest's class_weight.
# default n_estimators=100, min_samples_split=2, max_features='auto'
model = RandomForestClassifier(random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(model, n_jobs=-1))
])

# max_df must be the float 1.0 to mean "no upper limit". CountVectorizer reads
# an int as an absolute document count, so the former value `1` dropped every
# term that occurs in more than one message.
parameters = {'vect__max_df': [0.75, 1.0],
              'clf__estimator__class_weight': [None, 'balanced', 'balanced_subsample']
              }

# 2-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(pipeline, cv=2, param_grid=parameters, scoring='f1_macro',
                  verbose=3, n_jobs=-1)


cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: class_weight=None, max_df=0.75
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.21374973580019146

**Second Round**

In [None]:
# RandomForest, round 2: class_weight=None and max_df=0.75 are kept fixed;
# only min_samples_split is searched.
# Other forest settings stay at the defaults noted earlier
# (n_estimators=100, max_features='auto').
model = RandomForestClassifier(class_weight=None, random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model, n_jobs=-1)),
])

# Even values from 2 to 12 inclusive.
parameters = {'clf__estimator__min_samples_split': list(range(2, 13, 2))}

# 2-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=2,
    n_jobs=-1,
    verbose=2,
)

cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: min_samples_split=4
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.21915281658478253

### **AdaBoostClassifier**

In [None]:
# parameters tried
# Overview of all values explored across the AdaBoost rounds in this section.
# NOTE: max_df uses the float 1.0 ("no upper limit"). The int 1 originally
# listed here is read by CountVectorizer as an absolute document count and
# would drop every term that occurs in more than one message.

parameters = {'vect__max_df': [0.75, 1.0],
              'clf__estimator__n_estimators': [10, 15, 20, 30, 40, 50, 60, 100, 150],
              'clf__estimator__learning_rate': [0.8, 1.0, 1.1, 1.2, 1.3, 1.4],
              }

**First Round**

In [None]:
# AdaBoost, round 1: search only the vectorizer's max_df.
# default n_estimators=50, learning_rate=1.0

model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(model))
])

# max_df must be the float 1.0 to mean "no upper limit". CountVectorizer reads
# an int as an absolute document count, so the former value `1` dropped every
# term that occurs in more than one message.
parameters = {'vect__max_df': [0.75, 1.0]}
              # left for later rounds:
              # 'clf__estimator__n_estimators': [50, 100, 150],
              # 'clf__estimator__learning_rate': [0.5, 0.8, 1.0, 1.2]


# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(pipeline, cv=3, param_grid=parameters, scoring='f1_macro',
                  verbose=2, n_jobs=2)


cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: max_df=0.75
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.39845310393333366

**Second Round**

In [None]:
# AdaBoost, round 2: max_df is fixed at 0.75; n_estimators and learning_rate
# are searched on a coarse 3 x 3 grid.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model)),
])

parameters = {
    'clf__estimator__n_estimators': [50, 100, 150],
    'clf__estimator__learning_rate': [0.8, 1.0, 1.2],
}

# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    verbose=2,
)

cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: n_estimators=50, learning_rate=1.2
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.40387315390927725

**Third round**

In [None]:
# AdaBoost, round 3: max_df is fixed at 0.75; the grid is narrowed to
# n_estimators 40-60 and learning_rate 1.0-1.4.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model)),
])

parameters = {
    'clf__estimator__n_estimators': [40, 50, 60],
    'clf__estimator__learning_rate': [1.0, 1.2, 1.4],
}

# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    verbose=2,
)

cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: n_estimators=40, learning_rate=1.2
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.4045105421381187

**Fourth round**

In [None]:
# AdaBoost, round 4: max_df is fixed at 0.75; the grid moves to fewer
# estimators (20-40) with learning_rate 1.2-1.4.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model)),
])

parameters = {
    'clf__estimator__n_estimators': [20, 30, 40],
    'clf__estimator__learning_rate': [1.2, 1.3, 1.4],
}

# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    verbose=2,
)

cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: n_estimators=20, learning_rate=1.2
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.41101411092824636

**Fifth round**

In [None]:
# AdaBoost, round 5: max_df is fixed at 0.75; probe even smaller ensembles
# (10-30 estimators) with learning_rate 1.1-1.3.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model)),
])

parameters = {
    'clf__estimator__n_estimators': [10, 15, 20, 30],
    'clf__estimator__learning_rate': [1.1, 1.2, 1.3],
}

# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    verbose=2,
)

cv.fit(X_train, y_train)

In [None]:
# Result noted for this round: n_estimators=20, learning_rate=1.2
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.41101411092824636

**Final check**

In [None]:
# AdaBoost, final check: max_df is fixed at 0.75; try larger ensembles
# (100-200 estimators) and learning rates up to 2.0.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(max_df=0.75, tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=model)),
])

parameters = {
    # original inline note next to this list: 10, 50, 500
    'clf__estimator__n_estimators': [100, 150, 200],
    'clf__estimator__learning_rate': [1.0, 1.5, 2.0],
}

# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    verbose=2,
)

cv.fit(X_train, y_train)

        

In [None]:
# Result noted for this check: n_estimators=150, learning_rate=1.0, but with a
# LOWER score than the earlier rounds.
# best_params_ returns only the searched hyper-parameters of the winning
# candidate, instead of the full nested dump from best_estimator_.get_params().
cv.best_params_

In [None]:
# Mean cross-validated macro-F1 (scoring='f1_macro') of the best candidate held in `cv`.
cv.best_score_

0.40358428911806615

**Best parameters for AdaBoost**
- n_estimators = 20
- learning_rate = 1.2

### **Resumen para Udacity**

Parameters tried

In [None]:
from contextlib import nullcontext

# Summary search for AdaBoost: max_df, n_estimators and learning_rate together.
model = AdaBoostClassifier(random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(model, n_jobs=-1))
])

# max_df must be the float 1.0 to mean "no upper limit". CountVectorizer reads
# an int as an absolute document count, so the former value `1` dropped every
# term that occurs in more than one message.
parameters = {'vect__max_df': [0.75, 1.0],
              'clf__estimator__n_estimators': [50, 100, 150, 200],
              'clf__estimator__learning_rate': [0.5, 1.0, 1.5]}


# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(pipeline, cv=3, param_grid=parameters, scoring='f1_macro',
                  verbose=2, n_jobs=-1)

# `active_session` is not imported anywhere in this notebook, so a fresh
# kernel would raise NameError here. Use it when it is defined, otherwise fall
# back to a no-op context manager.
# NOTE(review): presumably a workspace keep-alive helper; confirm and import it
# in the imports cell.
keep_alive = globals().get('active_session', nullcontext)
with keep_alive():
    cv.fit(X_train, y_train)

Este es el mejor modelo y el score:

No entiendo por qué sigue sin clasificar bien estos mensajes, que en otros proyectos sí vi clasificados correctamente.

También probé optimizar Random Forest:

In [None]:
from contextlib import nullcontext

# Summary search for RandomForest: max_df, min_samples_split, max_features
# and class_weight together.
model = RandomForestClassifier(random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(model, n_jobs=-1))
])

# Fixes in this dict literal:
#  * a comma was missing after the max_features entry and class_weight used
#    `=` instead of `:`, so the cell did not parse;
#  * max_df must be the float 1.0 to mean "no upper limit". CountVectorizer
#    reads an int as an absolute document count, so `1` dropped every term
#    that occurs in more than one message.
# NOTE(review): 'auto' is documented as equivalent to 'sqrt' for
# RandomForestClassifier in the scikit-learn versions that accept it, so these
# two candidates likely duplicate each other. Confirm against the installed
# version and consider 'log2' as the second option.
parameters = {'vect__max_df': [0.75, 1.0],
              'clf__estimator__min_samples_split': [2, 4, 6, 8, 10, 12],
              'clf__estimator__max_features': ['auto', 'sqrt'],
              'clf__estimator__class_weight': ['balanced', 'balanced_subsample']}


# 3-fold cross-validation, scored with macro-averaged F1.
cv = GridSearchCV(pipeline, cv=3, param_grid=parameters, scoring='f1_macro',
                  verbose=2, n_jobs=-1)

# `active_session` is not imported anywhere in this notebook, so a fresh
# kernel would raise NameError here. Use it when it is defined, otherwise fall
# back to a no-op context manager.
# NOTE(review): presumably a workspace keep-alive helper; confirm and import it
# in the imports cell.
keep_alive = globals().get('active_session', nullcontext)
with keep_alive():
    cv.fit(X_train, y_train)

Este es el mejor modelo y el score:

Entiendo que esto no es necesario para el proyecto, pero me gustaría entender cómo podría mejorar este modelo como si se tratara de un caso real.

Encontré que podría tratar de amplificar las categorías que tienen pocos datos, pero parece que no puedo incluir MLSMOTE dentro del pipeline.
¿Tiene sentido aplicarlo a todo el dataset y luego dividir en training y test set?
