# ML Pipeline Preparation

###  Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine 
import re

# sklearn 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.externals import joblib

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download(['punkt', 'wordnet', 'stopwords' ,
               'averaged_perceptron_tagger'])

# Enable pep8 code style test
%load_ext pycodestyle_magic

[nltk_data] Downloading package punkt to /home/eddadsi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eddadsi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eddadsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eddadsi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [11]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql('Messages', con=engine)
X = df.message
Y = df.drop(labels=['id', 'message', 'original', 'genre'], axis=1)

###  Write a tokenization function to process your text data

In [9]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    tokens = word_tokenize(text)

    # lemmatize and remove stop words
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens
              if word not in stop_words]

    return tokens

## Write a starting verbe extractor

In [13]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        if len(sentence_list) != 0:
            for sentence in sentence_list:
                pos_tags = nltk.pos_tag(tokenize(sentence))
                if len(pos_tags) != 0:
                    first_word, first_tag = pos_tags[0]
                    if first_tag in ['VB', 'VBP', 'VBG'] or first_word == 'RT':
                        return True
            return False
        else:
            return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

## Write a length message extractor

In [14]:
class LenMessageExtractor(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return len(tokenize(X))

## Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [16]:
pipeline = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

        ('starting_verb', StartingVerbExtractor()),

        ('length_text', StartingVerbExtractor())

    ])),

    ('clf', MultiOutputClassifier(
        RandomForestClassifier(
            n_estimators=10, random_state=42)))
])

###  Train pipeline
- Split data into train and test sets
- Train pipeline

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X.values, Y.values)
pipeline.fit(X_train, y_train)

###  Test  model

In [24]:
# predict
y_pred = pipeline.predict(X_test[:20])

class_names = list(Y.columns)


def calcul_metrics(y_test, y_pred):
    dic_metrics = {'precision': [], 'recall': [], 'f1-score': []}
    for column in range(y_pred.shape[1]):
        metrics = classification_report(y_test[:, 0], y_pred[:, 0],
                                        output_dict=True)['micro avg']
        dic_metrics['precision'].append(metrics['precision'])
        dic_metrics['recall'].append(metrics['recall'])
        dic_metrics['f1-score'].append(metrics['f1-score'])

    # Stock metrics in a Dataframe
    df_metrics = pd.DataFrame(dic_metrics, index=class_names)
    return df_metrics.mean()

In [61]:
# metrics
calcul_metrics(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


precision    0.9
recall       0.9
f1-score     0.9
dtype: float64

### Improve your model
Use grid search to find better parameters. 

In [29]:
# parametres
parameters = {
    'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
    'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
    'features__text_pipeline__vect__max_features': (None, 5000, 10000),
    'features__text_pipeline__tfidf__use_idf': (True, False),
    'clf__estimator__n_estimators': [10, 20, 30, 50, 100],
    'clf__estimator__min_samples_split': [2, 3, 4, 5]
}

# GridSearchCV
cv = GridSearchCV(pipeline, param_grid=parameters, cv=3, n_jobs=-1)

### Test your model
Show the accuracy, precision, and recall of the tuned model.  

In [30]:
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
calcul_metrics(y_test, y_pred)

In [31]:
# print results metrics
calcul_metrics(y_test, y_pred)

In [76]:
# print model best Parameters
print("\nBest Parameters:", cv.best_params_)


Best Parameters: {'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 50, 'features__text_pipeline__tfidf__use_idf': True, 'features__text_pipeline__vect__max_df': 0.5, 'features__text_pipeline__vect__max_features': None, 'features__text_pipeline__vect__ngram_range': (1, 2)}


### Export your model as a pickle file

In [77]:
# save the classifier
filename = 'digits_classifier.joblib.pkl'
_ = joblib.dump(cv, filename, compress=9)

In [78]:
# load it again
#clf2 = joblib.load(filename)

### Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.