In [27]:
import sys
import pandas as pd
import warnings
import sqlite3
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import pickle
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

warnings.filterwarnings('ignore')


In [2]:
def load_data(file_path):
    """
    Read the ``Disaster_data`` table from a SQLite database and split it into
    model inputs and targets.

    :param file_path: path of the SQLite database file to read.
    :return: a tuple ``(X, Y, category_names)`` where ``X`` is the ``message``
        column, ``Y`` is the frame that remains after dropping the ``id``,
        ``message``, ``original`` and ``genre`` columns, and
        ``category_names`` holds the column labels of ``Y``.
    """

    conn = sqlite3.connect(file_path)
    try:
        df = pd.read_sql_query("SELECT * from Disaster_data", con=conn)
    finally:
        # Release the database handle even when the query raises (for example
        # when the table is missing); otherwise a failed read leaks the
        # connection.
        conn.close()

    X = df['message']
    Y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    category_names = Y.columns

    return X, Y, category_names


In [3]:
# Load the messages (X), the label columns (Y) and the label names from the
# SQLite database.
X,Y,category_names = load_data('../data/DisasterResponse.db')
# Display the label frame as this cell's output.
Y

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26214,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def tokenize(text):
    """
    Normalize, tokenize and lemmatize a piece of text with NLTK.

    :param text: the raw string to process.
    :return: list of lemmatized, whitespace-stripped tokens taken from the
        lower-cased text, in which every URL has first been replaced by the
        text ``url_placeholder``.
    """

    # Raw string: the pattern contains ``\(`` and ``\)``, which are invalid
    # escape sequences inside an ordinary string literal.
    url_format = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute each match where it occurs. Replacing the matched strings one
    # by one with str.replace corrupts the text when one URL is a prefix of a
    # later one ("http://a.com http://a.com/b" -> "... url_placeholder/b").
    text = re.sub(url_format, 'url_placeholder', text)

    # Tokenize the lower-cased text, then lemmatize and strip each token.
    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(token).strip()
        for token in word_tokenize(text.lower())
    ]


In [5]:
# Show the label names returned by load_data.
category_names

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [6]:
def build_model():
    """
    Assemble the text-classification pipeline and wrap it in a grid search.

    The pipeline chains a ``CountVectorizer`` (using ``tokenize``), a
    ``TfidfTransformer`` and a ``MultiOutputClassifier`` around a
    ``RandomForestClassifier``.

    :return: an unfitted ``GridSearchCV`` over that pipeline.
    """

    forest_per_label = MultiOutputClassifier(RandomForestClassifier())

    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', forest_per_label),
    ]

    # Candidate values tried for the wrapped random forest.
    search_space = {
        'clf__estimator__n_estimators': [5, 10],
        'clf__estimator__min_samples_split': [2, 4],
    }

    return GridSearchCV(
        Pipeline(steps),
        param_grid=search_space,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )


In [7]:
X,Y,category_names = load_data('../data/DisasterResponse.db')

# Fix the seed so the train/test partition, and every score derived from it,
# is the same after a kernel restart.
X_train,X_test,y_train,y_test = train_test_split(X,Y,random_state=42)

In [8]:
# Show the label names again after reloading the data.
category_names

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [9]:
# Build the (still unfitted) grid search over the text-classification pipeline.
model = build_model()

In [10]:
# Run the grid search on the training split. `fit` returns the search object
# itself, so `m` and `model` refer to the same fitted estimator.
m = model.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:  1.5min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.2min finished


In [11]:
# Parameter combination that achieved the best cross-validated score.
m.best_params_

{'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 10}

In [41]:
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Print a per-category score table for a fitted multi-output model.

    NOTE(review): this notebook defines ``evaluate_model`` more than once;
    whichever definition was executed last is the one in effect.

    INPUT:
    model - fitted model exposing ``predict``
    X_test - messages to predict on
    Y_test - true category values for those messages (one column per category)
    category_names - labels of the category columns, in column order

    OUTPUT:
    none - prints accuracy, precision, F1 and recall (in percent, the last
    three weighted-averaged) for each category.
    """
    y_pred = model.predict(X_test)

    # category label -> (accuracy, precision, f1, recall), all in percent
    per_category = {}

    for position, label in enumerate(category_names):
        expected = Y_test.values[:, position]
        predicted = y_pred[:, position]

        per_category[label] = (
            accuracy_score(expected, predicted) * 100,
            precision_score(expected, predicted, average='weighted') * 100,
            f1_score(expected, predicted, average='weighted') * 100,
            recall_score(expected, predicted, average='weighted') * 100,
        )

    # Categories start out as columns; transpose so each one becomes a row.
    report = pd.DataFrame(per_category).T
    report.columns = ['Accuracy', 'Precision', 'F1', 'Recall']

    report = report.reset_index()
    report = report.rename(columns={'index': 'Feature'})

    print(report.to_string(index=False), flush=True)

In [42]:
# NOTE(review): this scores the model on the training split (X_train/y_train),
# i.e. on the same data it was fitted on, so these are not held-out scores.
evaluate_model(m,X_train,y_train,category_names)

               Feature   Accuracy  Precision         F1     Recall
               related  99.104872  99.103932  99.101202  99.104872
               request  98.804801  98.820626  98.787833  98.804801
                 offer  99.898281  99.898385  99.891702  99.898281
           aid_related  98.652223  98.674008  98.649205  98.652223
          medical_help  98.708168  98.723659  98.655896  98.708168
      medical_products  99.155732  99.160882  99.119435  99.155732
     search_and_rescue  99.374428  99.378424  99.334458  99.374428
              security  99.537178  99.539350  99.504137  99.537178
              military  99.521920  99.522514  99.504937  99.521920
           child_alone 100.000000 100.000000 100.000000 100.000000
                 water  99.435459  99.438842  99.423192  99.435459
                  food  99.303224  99.307749  99.293593  99.303224
               shelter  99.196419  99.203433  99.178790  99.196419
              clothing  99.735530  99.735232  99.724222  99.73

In [17]:
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Print a scikit-learn classification report for every output category.

    NOTE(review): this notebook defines ``evaluate_model`` more than once;
    whichever definition was executed last is the one in effect.

    INPUT:
    model - fitted model exposing ``predict``
    X_test - messages to predict on
    Y_test - true category values for those messages (one column per category)
    category_names - labels of the category columns, in column order

    OUTPUT:
    none - prints a banner with the upper-cased category name followed by its
    classification report.
    """
    predictions = model.predict(X_test)

    for position, label in enumerate(category_names):
        banner = f'*********************** {label.upper()} ***********************'
        expected = Y_test.values[:, position]
        predicted = predictions[:, position]

        print(banner)
        print(classification_report(expected, predicted))



In [132]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score


def evaluate_model(m, X_test, y_test, category_names):
    """
    Score a fitted multi-output model category by category.

    NOTE(review): this notebook defines ``evaluate_model`` more than once;
    whichever definition was executed last is the one in effect.

    :param m: fitted model exposing ``predict``.
    :param X_test: messages to predict on.
    :param y_test: true category values (one column per category).
    :param category_names: labels of the category columns, in column order.
    :return: DataFrame with one row per category and the columns ``Feature``,
        ``Accuracy``, ``Precision``, ``F1`` and ``Recall``; scores are in
        percent and precision/F1/recall use weighted averaging.
    """
    y_pred = m.predict(X_test)

    def percent_scores(column):
        # Compare the true and predicted values of a single category column.
        expected = y_test.values[:, column]
        predicted = y_pred[:, column]
        return (
            accuracy_score(expected, predicted) * 100,
            precision_score(expected, predicted, average='weighted') * 100,
            f1_score(expected, predicted, average='weighted') * 100,
            recall_score(expected, predicted, average='weighted') * 100,
        )

    scores_by_category = {
        label: percent_scores(position)
        for position, label in enumerate(category_names)
    }

    # Categories start out as columns; transpose so each one becomes a row.
    score_table = pd.DataFrame(scores_by_category).T
    score_table.columns = ['Accuracy', 'Precision', 'F1', 'Recall']

    score_table = score_table.reset_index()
    score_table = score_table.rename(columns={'index': 'Feature'})

    return score_table

In [133]:
# Score the tuned model on the held-out test split.
evaluate_model(m,X_test,y_test,category_names)

Unnamed: 0,Feature,Accuracy,Precision,F1,Recall
0,related,94.812328,94.78159,94.693868,94.812328
1,request,96.032957,96.068992,95.875691,96.032957
2,offer,99.816906,99.817242,99.795019,99.816906
3,aid_related,92.416845,92.624409,92.345615,92.416845
4,medical_help,97.085749,97.109039,96.80156,97.085749
5,medical_products,98.29112,98.321289,98.118847,98.29112
6,search_and_rescue,98.901434,98.900739,98.745539,98.901434
7,security,99.176076,99.17035,99.060029,99.176076
8,military,98.71834,98.697228,98.602319,98.71834
9,child_alone,100.0,100.0,100.0,100.0


In [134]:
def save_model(model, model_filepath):
    """
    Serialize ``model`` with pickle and write it to ``model_filepath``.

    :param model: the object to persist.
    :param model_filepath: destination path; the file is opened in binary
        write mode, so an existing file is overwritten.
    :return: None
    """
    with open(model_filepath, mode='wb') as sink:
        pickle.dump(model, sink)