# Import the libraries and load the data from the SQLite database

In [1]:
# Import libraries
import joblib
import re
import pandas as pd
from sqlalchemy import create_engine

import nltk
# Fetch every NLTK resource the tokenizer relies on: the stop-word list, the punkt
# tokenizer models and the WordNet corpus. WordNetLemmatizer looks words up in
# 'wordnet'; without it a fresh environment fails with LookupError on the first
# lemmatize call.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to C:\Users\Michael
[nltk_data]     Fuchs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Michael
[nltk_data]     Fuchs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load data from database
# The SQLite file is addressed with a relative path (../data/messages.db).
engine = create_engine('sqlite:///../data/messages.db')
# Read every row and column of the `messages` table into a DataFrame.
df = pd.read_sql('SELECT * FROM messages', engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Features: the message text column.
X = df['message']
# Targets: every column from 'related' through 'direct_report' (label-based slice, both ends included).
Y = df.loc[:, 'related':'direct_report']

In [4]:
# Show how many messages there are, then preview the first few.
print(X.shape)
X.head()

(26028,)


0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [5]:
# Show the (rows, label columns) shape of the targets, then preview the first few rows.
print(Y.shape)
Y.head()

(26028, 36)


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Tokenization function to process text data

In [6]:
def tokenize(text):
    """Normalize a message and split it into lemmatized, stop-word-free tokens.

    Args:
        text (str): raw message text.

    Returns:
        list of str: lower-cased tokens with punctuation and English stop words
        removed, lemmatized first as nouns and then as verbs.
    """
    # Case normalization & punctuation removal: every character that is not an
    # ASCII letter or digit is replaced by a space.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Build the stop-word set and the lemmatizer once per call. The previous
    # version re-read the stop-word list for every token and created two
    # lemmatizer objects per token, which dominates the runtime when this
    # function is applied to a whole corpus.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        # Noun lemmatization (the default pos) followed by verb lemmatization.
        tokens.append(lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v'))

    return tokens

In [7]:
# Spot-check the tokenizer: show each of the first five messages followed by its tokens
for message in X.iloc[:5]:
    tokens = tokenize(message)
    # Same layout as two separate prints: message, newline, tokens, space, blank line
    print(message, tokens, sep='\n', end=' \n\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti'] 

Is the Hurricane over or is it not over
['hurricane'] 

Looking for someone but no name
['look', 'someone', 'name'] 

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '80', '90', 'destroy', 'hospital', 'st', 'croix', 'function', 'need', 'supply', 'desperately'] 

says: west side of Haiti, rest of the country today and tonight
['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 



# Build a machine learning pipeline

In [8]:
# MultiOutputClassifier wraps the decision tree so that one tree is fitted per target column.
# NOTE(review): no random_state is passed to DecisionTreeClassifier, so results may
# differ slightly between runs — confirm whether a fixed seed is wanted.
dt_clf = MultiOutputClassifier(DecisionTreeClassifier())

# Pipeline stages: token counts (using the custom tokenizer) -> TF-IDF weighting -> classifier
pipeline_dt_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', dt_clf)
])

# Train pipeline

In [9]:
# Train-test split
# 30% of the rows are held out for testing; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Display the shapes of the four resulting pieces.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((18219,), (7809,), (18219, 36), (7809, 36))

In [10]:
# Fit the vectorizer, the TF-IDF transformer and the per-label decision trees on the training split.
pipeline_dt_clf.fit(X_train, Y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000002494AE3DE18>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier()))])

# Evaluate your model

In [11]:
# Define a function to evaluate models
def evaluate_model(model, X_test, Y_test, label_names, print_reports=False):
    """Score a fitted multi-label model on a test set, one label at a time.

    Args:
        model: fitted estimator whose ``predict`` returns one column per label.
        X_test: test inputs passed straight to ``model.predict``.
        Y_test (DataFrame): true labels, indexable by each name in ``label_names``.
        label_names: names of the label columns, in prediction-column order.
        print_reports (bool): when True, also print the full text
            classification report for every label.

    Returns:
        DataFrame: one row per label with accuracy and the macro / weighted
        average precision, recall and f1, rounded to two decimals (the same
        precision the text report prints).
    """
    pred = pd.DataFrame(model.predict(X_test), columns=label_names)

    # (report row, score name) pairs, in the order of the output columns that
    # follow 'accuracy'.
    averaged_fields = [(average, score)
                       for average in ('macro avg', 'weighted avg')
                       for score in ('precision', 'recall', 'f1-score')]

    metrics = []
    for col in label_names:

        # Read the scores from the structured report instead of slicing the
        # formatted text by token position, which silently depends on the exact
        # layout of the printed table. zero_division=0 keeps the same values as
        # the default but without a warning for every never-predicted label.
        report = classification_report(Y_test[col], pred[col],
                                       output_dict=True, zero_division=0)
        values = [report['accuracy']]
        values += [report[average][score] for average, score in averaged_fields]
        metrics.append([round(float(value), 2) for value in values])

        # Print classification report
        if print_reports:
            print('-' * 53)
            print(f'Label: {col}')
            print(classification_report(Y_test[col], pred[col], zero_division=0))

    # Convert the metrics list into a dataframe
    metric_names = ['accuracy', 'macro_avg_precision', 'macro_avg_recall', 'macro_avg_f1', 'weighted_avg_precision', 'weighted_avg_recall', 'weighted_avg_f1']
    return pd.DataFrame(metrics, columns=metric_names, index=label_names)

In [12]:
# Evaluate pipeline
# Score the fitted decision-tree pipeline on the held-out split; the label names are the columns of Y_test.
report_dt_clf = evaluate_model(pipeline_dt_clf, X_test, Y_test, Y_test.columns)
report_dt_clf

Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.77,0.68,0.67,0.68,0.77,0.77,0.77
request,0.86,0.76,0.74,0.75,0.85,0.86,0.86
offer,0.99,0.53,0.53,0.53,0.99,0.99,0.99
aid_related,0.71,0.71,0.71,0.71,0.71,0.71,0.71
medical_help,0.9,0.66,0.64,0.65,0.89,0.9,0.89
medical_products,0.94,0.7,0.67,0.68,0.93,0.94,0.94
search_and_rescue,0.96,0.62,0.58,0.6,0.95,0.96,0.96
security,0.97,0.54,0.53,0.54,0.96,0.97,0.97
military,0.96,0.71,0.68,0.69,0.96,0.96,0.96
child_alone,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Calculate the mean value of the metrics
# (column-wise mean, i.e. each metric averaged over all labels)
report_dt_clf.mean()

accuracy                  0.932778
macro_avg_precision       0.687500
macro_avg_recall          0.673333
macro_avg_f1              0.679722
weighted_avg_precision    0.928611
weighted_avg_recall       0.932778
weighted_avg_f1           0.931389
dtype: float64

# Improve your model: tune the decision tree with a grid search

In [15]:
# Candidate values for tree depth and minimum leaf size. The 'clf__estimator__' prefix
# routes each parameter through the pipeline's 'clf' step to the wrapped decision tree.
parameters = {'clf__estimator__max_depth': [10, 50, None],
              'clf__estimator__min_samples_leaf':[2, 5, 10]}

# Exhaustive search over the grid with GridSearchCV's default cross-validation and scoring;
# n_jobs=-1 runs the fits in parallel on all available cores.
cv = GridSearchCV(estimator=pipeline_dt_clf, param_grid=parameters, n_jobs=-1)
cv.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x000002494AE3DE18>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier()))]),
             n_jobs=-1,
             param_grid={'clf__estimator__max_depth': [10, 50, None],
                         'clf__estimator__min_samples_leaf': [2, 5, 10]})

In [16]:
# Evaluate the decision tree pipeline that the grid search refitted with its best parameters
dt_clf_grid = cv.best_estimator_
report_dt_clf_grid = evaluate_model(dt_clf_grid, X_test, Y_test, Y_test.columns)
report_dt_clf_grid

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.78,0.69,0.68,0.69,0.78,0.78,0.78
request,0.88,0.81,0.73,0.76,0.87,0.88,0.87
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.73,0.72,0.72,0.72,0.73,0.73,0.73
medical_help,0.92,0.74,0.63,0.66,0.9,0.92,0.91
medical_products,0.95,0.8,0.66,0.7,0.94,0.95,0.95
search_and_rescue,0.97,0.75,0.58,0.61,0.96,0.97,0.96
security,0.98,0.66,0.51,0.52,0.97,0.98,0.97
military,0.97,0.76,0.67,0.71,0.96,0.97,0.97
child_alone,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Calculate the mean value of the metrics
# (column-wise mean over all labels for the tuned decision tree)
report_dt_clf_grid.mean()

accuracy                  0.944722
macro_avg_precision       0.755833
macro_avg_recall          0.662222
macro_avg_f1              0.687500
weighted_avg_precision    0.937222
weighted_avg_recall       0.944722
weighted_avg_f1           0.939167
dtype: float64

# Try Random Forest Classifier

In [18]:
# Same pipeline layout as the decision-tree version, with a random forest fitted per target column.
# NOTE(review): RandomForestClassifier is created without random_state, so repeated
# runs will not reproduce exactly the same scores — confirm whether a seed is wanted.
rf_clf = MultiOutputClassifier(RandomForestClassifier())

pipeline_rf_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', rf_clf)
])

In [19]:
# Fit the random forest pipeline on the same training split used for the decision tree.
pipeline_rf_clf.fit(X_train, Y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000002494AE3DE18>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

In [20]:
# Evaluate the random forest classifier
# (same held-out split and the same metrics table as for the decision tree)
report_rf_clf = evaluate_model(pipeline_rf_clf, X_test, Y_test, Y_test.columns)
report_rf_clf

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.82,0.78,0.67,0.7,0.81,0.82,0.8
request,0.89,0.87,0.74,0.78,0.89,0.89,0.88
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.78,0.78,0.77,0.77,0.78,0.78,0.78
medical_help,0.92,0.76,0.53,0.53,0.89,0.92,0.88
medical_products,0.95,0.89,0.56,0.59,0.94,0.95,0.93
search_and_rescue,0.97,0.83,0.52,0.53,0.96,0.97,0.96
security,0.98,0.99,0.5,0.5,0.98,0.98,0.97
military,0.97,0.88,0.52,0.52,0.96,0.97,0.95
child_alone,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Calculate the mean value of the metrics
# (column-wise mean over all labels for the untuned random forest)
report_rf_clf.mean()

accuracy                  0.948611
macro_avg_precision       0.795278
macro_avg_recall          0.601389
macro_avg_f1              0.618333
weighted_avg_precision    0.939444
weighted_avg_recall       0.948611
weighted_avg_f1           0.933889
dtype: float64

# Improve your model: tune the random forest with a grid search

In [22]:
# Declaring parameters
# Only the minimum leaf size of the wrapped random forest is searched.
# NOTE: this cell rebinds `parameters` and `cv`, which previously held the decision-tree
# search; `dt_clf_grid` was taken from that search before this point.
parameters = {'clf__estimator__min_samples_leaf': [2, 3, 4]}


cv = GridSearchCV(estimator=pipeline_rf_clf, param_grid=parameters, n_jobs=-1)
cv.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x000002494AE3DE18>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=-1,
             param_grid={'clf__estimator__min_samples_leaf': [2, 3, 4]})

In [23]:
# Parameter combination selected by the random forest grid search
cv.best_params_

{'clf__estimator__min_samples_leaf': 2}

In [24]:
# Evaluate the random forest clf with the best estimator found by the grid search
rf_clf_grid = cv.best_estimator_
report_rf_clf_grid = evaluate_model(rf_clf_grid, X_test, Y_test, Y_test.columns)
report_rf_clf_grid

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.82,0.78,0.67,0.69,0.81,0.82,0.8
request,0.88,0.89,0.68,0.73,0.88,0.88,0.86
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.79,0.78,0.78,0.78,0.79,0.79,0.79
medical_help,0.92,0.76,0.52,0.51,0.89,0.92,0.88
medical_products,0.94,0.76,0.51,0.5,0.93,0.94,0.92
search_and_rescue,0.97,0.99,0.5,0.5,0.97,0.97,0.96
security,0.98,0.49,0.5,0.5,0.96,0.98,0.97
military,0.97,0.98,0.51,0.51,0.97,0.97,0.95
child_alone,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Calculate the mean value of the metrics
# (column-wise mean over all labels for the tuned random forest)
report_rf_clf_grid.mean()

accuracy                  0.945278
macro_avg_precision       0.748333
macro_avg_recall          0.576111
macro_avg_f1              0.582778
weighted_avg_precision    0.935556
weighted_avg_recall       0.945278
weighted_avg_f1           0.929167
dtype: float64

# Save models

In [26]:
#joblib.dump(pipeline_dt_clf, '../models/dt_model.pkl')

['../models/dt_model.pkl']

In [27]:
#joblib.dump(dt_clf_grid, '../models/dt_model_grid.pkl')

['../models/dt_model_grid.pkl']

In [28]:
# Persist the fitted (untuned) random forest pipeline.
# NOTE(review): the pipeline's CountVectorizer was built with tokenizer=tokenize, and pickle
# stores plain functions by reference (module + name), so whatever process later loads this
# file presumably needs a compatible `tokenize` available under the same module — confirm.
joblib.dump(pipeline_rf_clf, '../models/rf_model.pkl')

['../models/rf_model.pkl']

In [29]:
#joblib.dump(rf_clf_grid, '../models/rf_model_grid.pkl')

['../models/rf_model_grid.pkl']

# Create `train.py`

In [48]:
import sys
import joblib
import re
import pandas as pd
from sqlalchemy import create_engine

import nltk
# Fetch every NLTK resource the tokenizer relies on: the stop-word list, the punkt
# tokenizer models and the WordNet corpus. WordNetLemmatizer looks words up in
# 'wordnet'; without it a fresh environment fails with LookupError on the first
# lemmatize call.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# English stop-word list from NLTK, loaded once when this code is executed.
# NOTE(review): no function visible in this cell reads `stop` — confirm whether it is still needed.
stop = stopwords.words('english')


def load_data(database_filepath):

    '''
    Read the cleaned `messages` table from the SQLite database and split it
    into message text and label columns.

    Args:
        database_filepath (str): path to the SQLite database file

    Returns:
        (Series) X: the text of every message (a copy of the 'message' column)
        (DataFrame) Y: the label columns, from 'related' through 'direct_report'
        (list) categories: the names of the label columns, in column order
    '''

    connection = create_engine('sqlite:///' + database_filepath)
    messages = pd.read_sql('SELECT * FROM messages', connection)

    # Label-based slice: both end columns are included.
    labels = messages.loc[:, 'related':'direct_report']
    texts = messages['message'].copy()

    return texts, labels, labels.columns.tolist()



def tokenize(text):

    '''
    Normalize a message and split it into lemmatized, stop-word-free tokens.

    Args:
        text (str): Text to tokenize

    Returns:
        (list of str) words: lower-cased tokens with punctuation and English
        stop words removed, lemmatized first as nouns and then as verbs
    '''

    # Case normalization & punctuation removal: every character that is not an
    # ASCII letter or digit is replaced by a space.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Build the stop-word set and the lemmatizer once per call. The previous
    # version re-read the stop-word list for every token and created two
    # lemmatizer objects per token, which dominates the runtime when this
    # function is applied to a whole corpus.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    words = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        # Noun lemmatization (the default pos) followed by verb lemmatization.
        words.append(lemmatizer.lemmatize(lemmatizer.lemmatize(token), pos='v'))

    return words


def build_model():

    '''
    Assemble the text-classification pipeline: token counts produced with the
    custom tokenizer, TF-IDF weighting, then a random forest wrapped in a
    multi-output classifier.

    Args:
        None

    Returns:
        (Sklearn pipeline) the unfitted pipeline estimator
    '''

    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
    return Pipeline(steps)



def evaluate_model(model, X_test, Y_test, category_names):

    '''
    Evaluate the machine learning model using a test dataset and print the classification metrics for each label,
    followed by the mean of every metric over all labels.

    Args:
        model (Sklearn estimator): machine learning model
        X_test (list-like object): test set text data
        Y_test (Pandas dataframe): test set target labels
        category_names (list): names of target labels

    Returns:
        (Pandas dataframe) one row per label with accuracy and the macro / weighted average precision, recall and f1,
        rounded to two decimals (the precision of the printed classification report)
    '''

    pred = pd.DataFrame(model.predict(X_test), columns=category_names)

    # (report row, score name) pairs, in the order of the output columns that follow 'accuracy'.
    averaged_fields = [(average, score)
                       for average in ('macro avg', 'weighted avg')
                       for score in ('precision', 'recall', 'f1-score')]

    metrics = []

    for col in category_names:
        # Read the scores from the structured report instead of slicing the formatted text by token position.
        # zero_division=0 yields the same values as the default, without a warning per never-predicted label.
        report = classification_report(Y_test[col], pred[col], output_dict=True, zero_division=0)
        values = [report['accuracy']]
        values += [report[average][score] for average, score in averaged_fields]
        metrics.append([round(float(value), 2) for value in values])

    metric_names = ['accuracy', 'macro_avg_precision', 'macro_avg_recall', 'macro_avg_f1', 'weighted_avg_precision',
                    'weighted_avg_recall', 'weighted_avg_f1']
    metrics_df = pd.DataFrame(metrics, columns=metric_names, index=category_names)

    print(metrics_df)
    # The old `print(metrics_df.sum)` printed the bound method object because the call parentheses were missing.
    # Print the per-metric mean over all labels, the summary the notebook analysis reports for every model.
    print(metrics_df.mean())
    return metrics_df
        

def save_model(model, model_filepath):

    '''
    Serialize the fitted model to disk with joblib.

    Args:
        model (Sklearn estimator): machine learning model
        model_filepath (str): destination path of the pickle file

    Returns:
        None
    '''

    # The list of written file names returned by joblib.dump is intentionally discarded.
    joblib.dump(model, model_filepath)


def main():

    '''
    Command-line entry point of the ML pipeline: load the data, train the classifier, print its evaluation
    and save it as a pickle file.

    From this project's root directory, run this file with:
    python models/train.py data/messages.db models/classifier.pkl
    '''

    # Exactly two arguments are expected after the script name; otherwise print the usage text and stop.
    if len(sys.argv) != 3:
        usage = ('Please provide the filepath of the disaster messages database '
                 'as the first argument and the filepath of the pickle file to '
                 'save the model to as the second argument. \n\nExample: python '
                 'train.py ../data/messages.db classifier.pkl')
        print(usage)
        return

    database_filepath, model_filepath = sys.argv[1:]

    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    # 70/30 split with a fixed seed so the evaluation is made on the same rows every run.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    print('Building model...')
    model = build_model()

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')


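# NOTE(review): the entry-point guard is commented out, presumably so that running this
# notebook cell only defines the functions; enable these two lines in the standalone train.py.
# if __name__ == '__main__':
#     main()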



[nltk_data] Downloading package stopwords to C:\Users\Michael
[nltk_data]     Fuchs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Michael
[nltk_data]     Fuchs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
