In [7]:
#import libraries 

import numpy as np
import pandas as pd
import os
import sys
import pickle
import re
import nltk
from sqlalchemy import create_engine
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, fbeta_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.stats import hmean
from scipy.stats.mstats import gmean
import nltk
import ssl
# Download the NLTK resources at import time. 'punkt' and 'wordnet' back the
# word tokenizer and WordNet lemmatizer used in tokenize(); the tagger model
# is fetched as well (presumably for a POS-based feature -- not used below).
nltk.download(['averaged_perceptron_tagger', 'wordnet', 'punkt'])

#database_filepath = "../Data/disasterResponse.db"

def load_data(database_filepath):
    """
    Load data function
    Reads the 'disasterResponse' table from a SQLite database and splits it
    into model inputs and targets.
    Input:
        database_filepath -> path to the SQLite database file
    Output:
        X -> the 'message' column
        y -> every remaining column once the non-target columns are dropped
        category_names -> column labels of y
    """
    # Columns that are not prediction targets
    non_target_columns = ['id', 'message', 'original', 'genre', 'categories', 'child_alone']

    db_engine = create_engine('sqlite:///' + database_filepath)
    frame = pd.read_sql('disasterResponse', db_engine)

    targets = frame.drop(columns=non_target_columns)
    messages = frame['message']

    return messages, targets, targets.columns

def tokenize(text):
    """
    Tokenization function
    Normalizes a raw message into a list of clean tokens.
    Input:
        text -> text message (str)
    Output:
        clean_tkns -> list of lemmatized, lower-cased, stripped tokens, with
                      every URL replaced by the token "urlplaceholder"
    """
    # Raw string: the pattern contains "\(" and "\)", which are invalid escape
    # sequences in a normal string literal and trigger warnings on newer Pythons.
    url_rgx = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute all URLs in one pass. Replacing each found URL with
    # str.replace() would mangle a longer URL whenever a shorter URL in the
    # same text happens to be its prefix.
    text = re.sub(url_rgx, "urlplaceholder", text)

    # Extract the word tokens from the provided text
    tokens = nltk.word_tokenize(text)

    # Lemmatizer to reduce inflected forms of a word to a common base form
    lemmatizer = nltk.WordNetLemmatizer()

    # NOTE(review): lemmatization happens before lower-casing, so capitalized
    # words are probably left un-lemmatized; order kept so token output stays
    # compatible with already-trained models -- confirm before changing.
    return [lemmatizer.lemmatize(tkn).lower().strip() for tkn in tokens]

def build_pipeline():
    """
    Build machine learning pipeline function
    Assembles the (unfitted) text-classification pipeline:
    token counts via the custom tokenizer -> TF-IDF weighting ->
    multi-output random forest classifier.
    Output:
        pipeline -> sklearn Pipeline; no grid search is applied here
    """
    # Text branch: bag-of-words counts followed by TF-IDF re-weighting
    text_branch = Pipeline([
        ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('tfidf_transformer', TfidfTransformer()),
    ])

    # Kept as a FeatureUnion so further feature branches can be added later
    feature_union = FeatureUnion([
        ('text_pipeline', text_branch),
    ])

    multi_output_forest = MultiOutputClassifier(RandomForestClassifier())

    # NOTE: a GridSearchCV over the vectorizer / tf-idf / classifier settings
    # was drafted here earlier but is disabled; the plain pipeline is returned.
    return Pipeline([
        ('features', feature_union),
        ('classifier', multi_output_forest),
    ])

def evaluate_model(model_pipeline, X_test, y_test, category_names):
    """
    Evaluate model function
    Predicts on the test messages and prints one classification report per
    target column of y_test.
    Input:
        model_pipeline -> fitted model exposing predict()
        X_test -> test inputs
        y_test -> test targets (DataFrame, one column per category)
        category_names -> category list names (not used; the labels are
                          taken from y_test.columns)
    Output:
        None -> the reports are printed to stdout
    """
    # Predict all categories at once; column i lines up with y_test column i
    predicted = model_pipeline.predict(X_test)

    for position, category in enumerate(y_test.columns):
        actual_column = y_test.iloc[:, position]
        predicted_column = predicted[:, position]
        print('-------------------------------------------------------')
        print("-->", category)
        print(classification_report(actual_column, predicted_column))

def save_model(model_pipeline, model_filepath):
    """
    Save Pipeline function
    This function saves the trained model as a pickle file, to be loaded later.
    Input:
        model_pipeline -> fitted model object to serialize
        model_filepath -> destination for the pickle. A path ending in '.pkl'
                          is used verbatim; anything else is treated as a
                          prefix (e.g. 'models/') and 'classifier.pkl' is
                          appended to it.
    """
    # Honour an explicit pickle file name (as described in main()'s usage
    # text) while keeping the original prefix + 'classifier.pkl' behaviour.
    if model_filepath.endswith('.pkl'):
        filenamepath = model_filepath
    else:
        filenamepath = model_filepath + 'classifier.pkl'

    # Context manager guarantees the file is flushed and closed, even if
    # pickling raises part-way through.
    with open(filenamepath, 'wb') as pkl_file:
        pickle.dump(model_pipeline, pkl_file)
    
    
def main():
    """
    Train Classifier Main function
    Runs the full training workflow from the command line:
    1) Load the data from the SQLite database
    2) Split it and train the pipeline
    3) Report model performance on the held-out test data
    4) Save the trained model
    Expects exactly two command-line arguments (database path, model path);
    otherwise prints a usage message and returns.
    """
    # Guard clause: script name + two arguments are required
    if len(sys.argv) != 3:
        usage_message = (
            "Please provide arguments correctly:\nFilepath of the disaster messages database as the first \n"
            "    and the filepath of the pickle file to save the model to as the second.\n\n"
            "    Arguments: \n"
            "    1) Path to Sqlite destination database \n"
            "    2) Path to pickle file name (where ML model needs to be saved)"
        )
        print(usage_message)
        return

    database_filepath, model_filepath = sys.argv[1:]

    print('Loading data...\n from {}'.format(database_filepath))
    X, y, category_names = load_data(database_filepath)

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    print('Building the machine learning pipeline ...')
    model_pipeline = build_pipeline()

    print('Training the machine learning pipeline ...')
    model_pipeline.fit(X_train, y_train)

    print('Evaluating model...')
    evaluate_model(model_pipeline, X_test, y_test, category_names)

    print('Saving machine learning pipeline to {} ...'.format(model_filepath))
    save_model(model_pipeline, model_filepath)
    print('Saved!')
        
# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikhaildiazandrade/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mikhaildiazandrade/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikhaildiazandrade/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading data...
 from -f
Building the machine learning pipeline ...
Training the machine learning pipeline ...
Evaluating model...
-------------------------------------------------------
--> related
              precision    recall  f1-score   support

           0       0.75      0.27      0.39      1212
           1       0.81      0.97      0.88      4002
           2       0.80      0.13      0.23        30

    accuracy                           0.80      5244
   macro avg       0.79      0.46      0.50      5244
weighted avg       0.80      0.80      0.77      5244

-------------------------------------------------------
--> request
              precision    recall  f1-score   support

           0       0.89      0.98      0.94      4324
           1       0.86      0.44      0.58       920

    accuracy                           0.89      5244
   macro avg       0.88      0.71      0.76      5244
weighted avg       0.89      0.89      0.87      5244

-------------------------

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      4811
           1       0.95      0.37      0.54       433

    accuracy                           0.95      5244
   macro avg       0.95      0.69      0.75      5244
weighted avg       0.95      0.95      0.94      5244

-------------------------------------------------------
--> storm
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      4755
           1       0.79      0.41      0.54       489

    accuracy                           0.93      5244
   macro avg       0.86      0.70      0.75      5244
weighted avg       0.93      0.93      0.92      5244

-------------------------------------------------------
--> fire
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5180
           1       0.00      0.00      0.00        64

    accuracy                           0.99      52