In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import sys
from sqlalchemy import create_engine
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss, accuracy_score 
from sklearn.metrics import multilabel_confusion_matrix
import xgboost as xgb
import warnings
warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pickle




In [2]:
database_filepath, model_filepath = ("../data/DisasterResponse.db", "../models/classifier.pkl")

### Define load, model and and evaluation functions

In [3]:
def load_data(database_filepath):
    '''
    load data from database and return X and y

    database_filename: name of database

    returns
    X: Predictors
    y: targets
    categories: names of target categories
    '''
    database_name =  'sqlite:///' + database_filepath
    engine = create_engine(database_name)
    df = pd.read_sql_table('disaster_clean', engine)

    X = df.message.values
    X = X[0:500]
    y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    y = y.iloc[0:500, :]

    category_names = list(y.columns.values)

    return X,y,category_names


def tokenize(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    words = word_tokenize(text)

    words = [w for w in words if not w in stop_words]
    words = [word.lower() for word in words if word.isalpha()]
    words = [stemmer.stem(word) for word in words]

    return words

def build_model(y_train):
    
    model = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf',MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softprob',
                                  eval_metric='mlogloss',
                                  use_label_encoder=False,
                                  num_class= y_train.shape[1])))
    ])
    
    return model


def evaluate_model(model, X_test, Y_test, category_names):
    '''
    label_df: pandas dataframe containing test targets
    preds_array: numpy array containing predicted targets

    Function to print classification report for each target column
    '''
    y_pred = model.predict(X_test)

    for i, col in enumerate(category_names):
        print('{} category metrics: '.format(col))
        print(classification_report(Y_test.iloc[:,i], y_pred[:,i]))

        
def save_model(model, model_filepath):
    pickle.dump(model, open(model_filepath, 'wb'))

### Split data and fit initial model with default parameters


In [4]:
X, Y, category_names = load_data(database_filepath)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

for col in y_test.columns:
    y_train[col] = y_train[col].map(lambda x: 1 if x > 0 else 0)

# one column with a none binary value - cleaned here
for col in y_test.columns:
    y_test[col] = y_test[col].map(lambda x: 1 if x > 0 else 0)
    
model = build_model(y_train)


In [70]:
model.fit(X_train, y_train.values)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7ff400531830>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               eval_metric='mlogloss',
                                                               gamma=None,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_cons

### Predict and check some single value evaluation metrics

In [71]:
y_pred = model.predict(X_test)
print(hamming_loss(y_test.values,y_pred))
print(accuracy_score(y_test.values,y_pred))



0.05388888888888889
0.26


### Define Grid and optimize model

In [26]:
params = { 'clf__estimator__max_depth': [2,3,5],
           'clf__estimator__learning_rate': [0.1, 0.2, 0.3],
           'clf__estimator__subsample': np.arange(0.5, 1.0, 0.1),
           'clf__estimator__colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'clf__estimator__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'clf__estimator__n_estimators': [30,50,100],
           'clf__estimator__reg_lambda': [0,0.25,0.5,0.75,1]         
         }


#cv = GridSearchCV(model, 
#                  param_grid=params,
#                  cv=4 )
                  
cv = RandomizedSearchCV(estimator=model,
                        param_distributions=params,
                        n_iter=15,
                        cv=3,
                        verbose=2)


cv.fit(X_train, y_train.values)


Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] END ........................clf__estimator__max_depth=2; total time=  33.7s
[CV] END ........................clf__estimator__max_depth=2; total time=  29.2s


RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('vect',
                                              CountVectorizer(tokenizer=<function tokenize at 0x7fca45de13b0>)),
                                             ('tfidf', TfidfTransformer()),
                                             ('clf',
                                              MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                                                            booster=None,
                                                                                            colsample_bylevel=None,
                                                                                            colsample_bynode=None,
                                                                                            colsample_bytree=None,
                                                                                            eval_metric='mlogloss',
  

In [27]:
print('Best CV parameters are')
print(cv.best_params_)

Best CV parameters are
{'clf__estimator__max_depth': 2}


In [28]:
final_model = cv.best_estimator_
y_pred = final_model.predict(X_test)

In [29]:
print(hamming_loss(y_test.values,y_pred))
print(accuracy_score(y_test.values,y_pred))

0.04861111111111111
0.24


In [13]:
evaluate_model(final_model, X_test, y_test, category_names)



related category metrics: 
              precision    recall  f1-score   support

           0       1.00      0.13      0.24        15
           1       0.87      1.00      0.93        85

    accuracy                           0.87       100
   macro avg       0.93      0.57      0.58       100
weighted avg       0.89      0.87      0.82       100

request category metrics: 
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        47
           1       0.83      0.81      0.82        53

    accuracy                           0.81       100
   macro avg       0.81      0.81      0.81       100
weighted avg       0.81      0.81      0.81       100

offer category metrics: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        99
           1       0.00      0.00      0.00         1

    accuracy                           0.99       100
   macro avg       0.49      0.50      0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        99
           1       0.00      0.00      0.00         1

    accuracy                           0.99       100
   macro avg       0.49      0.50      0.50       100
weighted avg       0.98      0.99      0.99       100

weather_related category metrics: 
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        89
           1       0.00      0.00      0.00        11

    accuracy                           0.89       100
   macro avg       0.45      0.50      0.47       100
weighted avg       0.79      0.89      0.84       100

floods category metrics: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        99
           1       0.00      0.00      0.00         1

    accuracy                           0.99       100
   macro avg       0.49      0.50      0.50       100
weighted avg 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def build_model(y_train):
    
    model = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf',MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softprob',
                                  eval_metric='mlogloss',
                                  use_label_encoder=False,
                                  num_class= y_train.shape[1])))
    ])
    

    params = { 'clf__estimator__max_depth': [2,3,5],
           'clf__estimator__learning_rate': [0.1, 0.2, 0.3],
           'clf__estimator__subsample': np.arange(0.5, 1.0, 0.1),
           'clf__estimator__colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'clf__estimator__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'clf__estimator__n_estimators': [30,50,100],
           'clf__estimator__reg_lambda': [0,0.25,0.5,0.75,1]         
         }

                  
    cv = RandomizedSearchCV(estimator=model,
                        param_distributions=params,
                        n_iter=15,
                        cv=3,
                        verbose=2)

    
    return model

In [None]:
def evaluate_model(model, X_test, Y_test, category_names):
    '''
    label_df: pandas dataframe containing test targets
    preds_array: numpy array containing predicted targets

    Function to print classification report for each target column
    '''
    
    print('Best CV parameters are')
    print(model.best_params_)
    
    final_model = model.best_estimator_
    y_pred = final_model.predict(X_test)
    
    print(hamming_loss(y_test.values,y_pred))
    print(accuracy_score(y_test.values,y_pred))

    for i, col in enumerate(category_names):
        print('{} category metrics: '.format(col))
        print(classification_report(Y_test.iloc[:,i], y_pred[:,i]))

In [None]:
def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
        
        print('Building model...')
        model = build_model()
        
        print('Training model...')
        model.fit(X_train, Y_train)
        
        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()