In [None]:
# ## 2. ML Pipeline
# In a Python script, train_classifier.py, write a machine learning pipeline that:

# Loads data from the SQLite database
# Splits the dataset into training and test sets
# Builds a text processing and machine learning pipeline
# Trains and tunes a model using GridSearchCV
# Outputs results on the test set
# Exports the final model as a pickle file

# ## Requirements
# The machine learning script, train_classifier.py, runs in the terminal without errors. 
# The script takes the database file path and model file path, creates and trains a classifier, and stores the classifier into a pickle file to the specified model file path.

# The script uses a custom tokenize function using nltk to case normalize, lemmatize, and tokenize text. 
# This function is used in the machine learning pipeline to vectorize and then apply TF-IDF to the text.

# The script builds a pipeline that processes text and then performs multi-output classification on the 36 categories in the dataset. 
# GridSearchCV is used to find the best parameters for the model.

# The TF-IDF pipeline is trained only on the training data. The F1 score, precision and recall on the test set are output for each category.

In [14]:
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV

from train_model_functions import load_data, build_model, show_results


# Load the messages (X) and category labels (y) with the helper imported from
# train_model_functions above.
X, y = load_data()
# Hold out a test set. No test_size or random_state is passed, so
# train_test_split's defaults apply and the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X.values, y)

# Unfitted pipeline from the train_model_functions helper.
pipeline = build_model()

# Hyper-parameter grid searched by GridSearchCV: 3 x 2 = 6 candidate settings.
# NOTE(review): the recorded output of this cell shows a different param_grid
# ([2] and [50, 100]), so that output looks stale -- re-run to confirm.
parameters = {'clf__estimator__n_estimators': [25,50, 100],
              'clf__estimator__min_samples_split': [2, 3],
             }

# `cv`, `X_test` and `y_test` are reused by the next cell.
cv = GridSearchCV(pipeline, param_grid=parameters)
# Last expression of the cell: its return value is what the notebook displays.
cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x000002AB947CF040>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__min_samples_split': [2],
                         'clf__estimator__n_estimators': [50, 100]})

In [1]:
# NOTE(review): this cell depends on `cv`, `X_test` and `y_test` created by the
# previous cell. Its execution count (In [1]) is lower than that cell's
# (In [14]) and the recorded output ends in KeyboardInterrupt, so verify that
# it still works after Restart Kernel -> Run All.
# Predict the labels of the held-out messages with the grid-search object.
y_pred = cv.predict(X_test)

# Report the scores through the helper imported from train_model_functions.
show_results(y_test=y_test, y_pred=y_pred)

# Persist the whole GridSearchCV object (not only its best estimator) to a
# file in the current working directory.
with open('filename.pickle', 'wb') as handle:
    pickle.dump(cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('filename.pickle', 'rb') as handle:
#     cv = pickle.load(handle)
    


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


KeyboardInterrupt: 

In [None]:
import sys
import pickle
import nltk
from pathlib import Path
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'tagsets', 'stopwords'])

import os
import re
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sqlalchemy import create_engine
from sklearn.multioutput import MultiOutputClassifier
    

def load_data(database_filepath):
    """
    Read the DisasterResponse table from a SQLite database.

    Args:
    database_filepath: path of the SQLite database file

    Returns:
    X: the `message` column of the table
    y: DataFrame holding the category columns listed in category_names
    category_names: list with the names of the category columns
    """
    category_names = [
        'related', 'request', 'offer', 'aid_related',
        'medical_help', 'medical_products', 'search_and_rescue', 'security',
        'military', 'child_alone', 'water', 'food',
        'shelter', 'clothing', 'money', 'missing_people',
        'refugees', 'death', 'other_aid', 'infrastructure_related',
        'transport', 'buildings', 'electricity', 'tools',
        'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
        'weather_related', 'floods', 'storm', 'fire',
        'earthquake', 'cold', 'other_weather', 'direct_report',
    ]

    # Pull the complete table into memory in one query.
    connection = create_engine(f'sqlite:///{database_filepath}')
    table = pd.read_sql("SELECT * FROM DisasterResponse", connection)

    messages = table["message"]
    labels = table[category_names]
    return messages, labels, category_names

def tokenize(text):
    """
    Turn a raw message into a list of cleaned, lemmatized tokens.

    Steps:
    - replace URLs by a placeholder
    - lower-case the text and replace every non-letter character by a space
    - split on whitespace (no empty tokens are produced)
    - remove the nltk English stop words and the URL placeholder
    - keep only adjectives, verbs, nouns and adverbs (Penn Treebank tags)
    - lemmatize every remaining word as a verb

    Args:
    text: str text to be tokenized

    Returns:
    list of cleaned tokens
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in a normal string literal.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, 'urlplaceholder', text)

    # Normalize, then split on any run of whitespace. `split(" ")` would emit
    # an empty string for every pair of adjacent spaces, and those empty
    # tokens would be POS-tagged and could end up in the vocabulary.
    text = re.sub(r"[^A-Za-z]", " ", text.lower())
    tokens = text.split()

    # Build the exclusion set once per call instead of re-reading the stop
    # word corpus for every single token.
    excluded = set(stopwords.words('english'))
    excluded.add('urlplaceholder')
    tokens = [word for word in tokens if word not in excluded]
    if not tokens:
        return []

    kept_tags = {
        "JJ", "JJR", "JJS",                       # Adjectives
        "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",  # Verbs
        "NN", "NNP", "NNPS", "NNS",               # Nouns
        "RB", "RBR", "RBS",                       # Adverbs
    }
    words = [word for word, tag in nltk.pos_tag(tokens) if tag in kept_tags]

    # One lemmatizer for the whole message; pos="v" lemmatizes as verbs.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, pos="v") for w in words]

def build_model():
    """
    Assemble the untrained text-classification model.

    The pipeline applies count vectorization (with the custom tokenizer) and
    a TF-IDF transformation, then a random forest wrapped in a
    MultiOutputClassifier. The pipeline is wrapped in a GridSearchCV that
    searches the number of trees and the minimum samples per split.

    Returns:
    GridSearchCV object that still has to be fitted
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]

    # 3 x 2 = 6 candidate settings for the random forest.
    search_space = {
        'clf__estimator__n_estimators': [25, 50, 100],
        'clf__estimator__min_samples_split': [2, 3],
    }

    return GridSearchCV(Pipeline(steps), param_grid=search_space)

def evaluate_model(model, X_test, Y_test, category_names):
    """
    Compute and show precision, recall and f1 for every category.

    The scores are computed by hand from the true-positive, false-positive
    and false-negative counts of the positive class (label 1) rather than
    with sklearn.metrics. When a category has no true positives, all three
    scores are reported as 0.

    Args:
    model: fitted estimator with a predict method
    X_test: test inputs passed to model.predict
    Y_test: pd.DataFrame of the ground truth, one column per category
    category_names: names of the Y_test columns to report on

    Returns:
    pd.DataFrame with rows 'precision', 'recall', 'f1' and one column per
    category
    """
    # Use the model that was passed in, not a global that happens to exist.
    # The prediction columns are assumed to be in the order of Y_test.columns.
    Y_pred = pd.DataFrame(np.asarray(model.predict(X_test)), columns=Y_test.columns)

    scores = {}
    for col in category_names:
        # Compare by position: Y_test keeps its shuffled index after the
        # train/test split, while the predictions are numbered 0..n-1.
        truth = pd.to_numeric(Y_test[col]).to_numpy()
        guess = pd.to_numeric(Y_pred[col]).to_numpy()

        tp = int(((truth == 1) & (guess == 1)).sum())
        fp = int(((truth == 0) & (guess == 1)).sum())
        fn = int(((truth == 1) & (guess == 0)).sum())

        if tp == 0:
            # Avoids dividing by zero; every score is 0 without true positives.
            precision = recall = f1 = 0.0
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)

        scores[col] = [precision, recall, f1]

    # Build the frame in one go; chained `report.loc[row][col] = value`
    # assignments may write to a temporary copy instead of the report.
    report = pd.DataFrame(scores, index=['precision', 'recall', 'f1'],
                          columns=list(category_names))

    # `display` only exists as a builtin inside IPython/Jupyter; fall back to
    # print so the function also works when run as a plain script.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print
    show(report)
    return report

def save_model(model, model_filepath):
    """
    Pickle the given model to disk.

    Args:
    model: the object to store (for example the fitted GridSearchCV)
    model_filepath: path of the pickle file to write. If it points to an
        existing directory, the model is written to "model.pkl" inside that
        directory.
    """
    target = Path(model_filepath)
    if target.is_dir():
        # Backward compatible with callers that pass a directory.
        target = target / "model.pkl"

    with open(target, 'wb') as handle:
        # Dump the model that was passed in, not a global variable.
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

def main(database_filepath="../data/DisasterResponse.db",
         model_filepath="../models/classifier.pkl",
         test_size=0.2,
         random_state=None):
    """
    Run the complete training workflow: load, split, fit, evaluate, save.

    Args:
    database_filepath: path of the SQLite database holding the messages
    model_filepath: path handed to save_model for storing the trained model
    test_size: fraction of the data held out for evaluation
    random_state: seed for the train/test split; None (the default) gives a
        different split on every run, pass an int for a reproducible split
    """
    # The default paths are relative, so show the directory they resolve from.
    print(os.getcwd())
    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    print('Building model...')
    model = build_model()

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')


if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ylc.mariman.MVGM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


D:\OneDrive - MVGM\WerkbestandenYannick\Werkmap_Python\Projecten\Zelf-studie\Udacity\Nanodegree_data_scientist\DisasterResponsePipeline\models
Loading data...
    DATABASE: ../data/DisasterResponse.db
Building model...
Training model...
