In [None]:
# import libraries
import numpy as np
import nltk
import re
import pandas as pd

from sqlalchemy import create_engine

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

import pickle
import time

# Download the NLTK resources this notebook relies on:
#   'punkt'     - tokenizer models behind the imported word_tokenize
#   'wordnet'   - lexical database used by WordNetLemmatizer in tokenize()
#   'stopwords' - word lists read via stopwords.words("english") in tokenize()
nltk.download(['punkt', 'wordnet', 'stopwords'])

In [None]:
# Create a function to load in the data from the database
def load_database_data(database_url='sqlite:///UdacityProject2.db',
                       table_name='DisasterResponseClean'):
    """Load the messages and their category labels from a SQL table.

    Args:
        database_url: SQLAlchemy connection URL of the database to read.
            Defaults to the SQLite file used by this project.
        table_name: Name of the table to load. Defaults to
            'DisasterResponseClean'.

    Returns:
        A tuple ``(X, Y, target_names)`` where ``X`` is the table's
        ``message`` column, ``Y`` is the table with the 'message',
        'original', 'id' and 'genre' columns dropped, and ``target_names``
        is the column index of ``Y``.
    """
    engine = create_engine(database_url)
    try:
        df = pd.read_sql_table(table_name, engine)
    finally:
        # Release the engine's pooled connections once the table is in
        # memory, so the database file is not kept open for the session.
        engine.dispose()

    X = df['message']
    Y = df.drop(columns=['message', 'original', 'id', 'genre'])
    target_names = Y.columns

    return X, Y, target_names

# Notebook-level load of the messages, labels and label names.
# Note: main() calls load_database_data() again into its own local variables
# and does not read X, Y or category_names from here.
X, Y, category_names = load_database_data()

In [None]:
# Lazily-filled cache so the stop word corpus is read from disk only once,
# instead of once per token.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE['english']


def tokenize(text):
    """Convert a raw message into a list of normalized, lemmatized tokens.

    Processing steps:
        1. Remove URLs.
        2. Lower-case the text and replace every character that is not an
           ASCII letter or digit with a space.
        3. Split on whitespace (so no token contains spaces or is empty).
        4. Lemmatize each token with WordNetLemmatizer.
        5. Drop English stop words.

    Args:
        text: The message string to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    # Raw string: the pattern contains backslash escapes (\( and \)) that are
    # invalid escape sequences in a normal string literal.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, " ", text)

    # Normalize the whole text BEFORE splitting: doing the punctuation
    # replacement per token leaves tokens such as "the " or " " that contain
    # spaces and slip past the stop word filter.
    text = re.sub(r"[^a-z0-9]", " ", text.lower())

    lemmatizer = WordNetLemmatizer()
    stop_words = _english_stop_words()

    final_clean_tokens = []
    for tok in text.split():
        # Lemmatize the already lower-cased token; the WordNet lookup is
        # case-sensitive, so lower-casing afterwards misses capitalized words.
        lemma = lemmatizer.lemmatize(tok)

        # Check both forms: the lemmatizer may alter a stop word so that the
        # lemma alone no longer matches the stop word list.
        if tok in stop_words or lemma in stop_words:
            continue
        final_clean_tokens.append(lemma)

    return final_clean_tokens
    

In [None]:
# Function to build the model using a machine learning pipeline
def build_pipeline(random_state=42):
    """Build the text-classification pipeline (unfitted).

    The pipeline chains three steps:
        'vector'     - CountVectorizer using the notebook's ``tokenize``
        'tfidf'      - TfidfTransformer
        'classifier' - MultiOutputClassifier wrapping a RandomForestClassifier

    Args:
        random_state: Seed passed to the RandomForestClassifier so repeated
            runs give the same model. Pass ``None`` for unseeded behavior.

    Returns:
        The assembled ``sklearn.pipeline.Pipeline``.
    """
    pipeline = Pipeline(steps=[
        # token_pattern=None: the custom tokenizer replaces the default regex
        # pattern; leaving the default in place makes scikit-learn warn that
        # the parameter is ignored.
        ('vector', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultiOutputClassifier(
            RandomForestClassifier(random_state=random_state))),
    ])

    return pipeline

In [None]:
def test_model(model, X_test, Y_test, category_names):
    """Print a classification report for ``model`` on the held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Inputs to predict on.
        Y_test: True labels matching ``X_test``.
        category_names: Label names passed to ``classification_report`` as
            ``target_names``.
    """
    predictions = model.predict(X_test)
    print(
        classification_report(
            Y_test,
            predictions,
            target_names=category_names,
        )
    )

In [None]:
def _timed_call(step, *args, **kwargs):
    """Run ``step(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is monotonic, so the measured interval is not affected by
    # system clock adjustments the way time.time() is.
    start = time.perf_counter()
    result = step(*args, **kwargs)
    return result, time.perf_counter() - start


def main(random_state=42):
    """Run the full workflow: load data, split, build, train and evaluate.

    Progress and the runtime of each stage are printed as the stages finish;
    the final stage prints the classification report via ``test_model``.

    Args:
        random_state: Seed for the train/test split so the evaluation is
            reproducible between runs. Pass ``None`` for an unseeded split.
    """
    print('Loading the data from the database:')
    (X, Y, target_names), elapsed = _timed_call(load_database_data)
    print('Data successfully loaded! Runtime (sec): ', elapsed)

    print('Split test-train:')
    (X_train, X_test, Y_train, Y_test), elapsed = _timed_call(
        train_test_split, X, Y, random_state=random_state)
    print('Test-train split successful! Runtime (sec): ', elapsed)

    print('Build pipeline:')
    pipeline, elapsed = _timed_call(build_pipeline)
    print('Build successful! Runtime: (sec)', elapsed)

    print('Train the model:')
    # fit() returns the fitted pipeline itself, so the result is not needed
    _, elapsed = _timed_call(pipeline.fit, X_train, Y_train)
    print('Training successful! Runtime: (min)', elapsed/60)

    print('Testing the model and generating report:')
    test_model(pipeline, X_test, Y_test, target_names)

In [None]:
# Run the load / split / build / train / evaluate workflow when this code is
# executed as the main module rather than imported.
if __name__ == '__main__':
    main()