# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
import pickle
from sqlalchemy import create_engine


import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_recall_fscore_support


[nltk_data] Downloading package punkt to /Users/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/matt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql('select * from Message_Category'
                ,engine)
# check top df rows                 
df.head()
# create feature matrix
X = df['message']
# response matrix
Y = df.iloc[:, 4:]

In [3]:
# there is one feature column to be used and 35 label columns due to dropping child_alone column in ETL process.
print(X.shape)
print(Y.shape)

(26028,)
(26028, 35)


### 2. Tokenization function to process text data

In [4]:
def tokenize(text):
    '''
    Title: Text pre-processing function.
    Input: Raw text data
    Output: Normalized, stop words removed, tokenized, stemmed and lemmatized text data.
    '''
    # Normalize text.
    text = re.sub(r'[^a-zA-Z0-9]', ' '
                 ,text.lower())
    # Instantiate stop words             
    stop_words = stopwords.words('english')
    # Tokenize words
    words = word_tokenize(text)
    # Stemming
    stemmed = [PorterStemmer().stem(w) for w in words]
    # Lemmatize
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in stemmed if w not in stop_words]

    return (lemmed)

### 3. Build a machine learning pipeline

In [5]:
# build a pipeline
pipeline = Pipeline([('vect', CountVectorizer(tokenizer = tokenize))
                    ,('tfidf', TfidfTransformer())
                    ,('clf', MultiOutputClassifier(RandomForestClassifier()))])

### 4. Train pipeline

In [6]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 22)

In [7]:
# train classifier
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                  

### 5. Test your model

In [11]:
# predict
Y_pred = pipeline.predict(X_test)

In [12]:
def get_results(Y_test, Y_pred):
    '''
    Title: Function to get f1 score, precision and recall for each label category column.
    Input: True Values (y_test), Predicted values (y_pred).
    Output: Results dataframe.
    '''
    # create empty dataframe with columns of interest
    results = pd.DataFrame(columns = ['Category', 'f_score', 'precision', 'recall'])
    num = 0

    # looping through y_test columns
    for cat in Y_test.columns:
        precision, recall, f_score, support = precision_recall_fscore_support(Y_test[cat], Y_pred[:, num], average = 'weighted')
        results.at[num + 1, 'Category'] = cat
        results.at[num+1, 'f_score'] = f_score
        results.at[num+1, 'precision'] = precision
        results.at[num+1, 'recall'] = recall
        num += 1
    # get Total average results    
    print('Average f_score:', results['f_score'].mean())
    print('Average precision:', results['precision'].mean())
    print('Average recall:', results['recall'].mean())
    return results

In [13]:
# get the results for our model accuracy
results = get_results(Y_test, Y_pred)
results

Average f_score: 0.9316024227007785
Average precision: 0.9324557542358791
Average recall: 0.9437748358910186


Unnamed: 0,Category,f_score,precision,recall
1,related,0.799154,0.796532,0.809897
2,request,0.874519,0.879846,0.887659
3,offer,0.99332,0.991106,0.995543
4,aid_related,0.751252,0.755542,0.756109
5,medical_help,0.902998,0.909394,0.927002
6,medical_products,0.943691,0.94498,0.956969
7,search_and_rescue,0.959316,0.958928,0.971415
8,security,0.974561,0.966471,0.982788
9,military,0.952916,0.954315,0.966498
10,water,0.93749,0.944256,0.950054


### 6. Improve model with grid and cross validation.

In [14]:
# check for pipeline parameters for grid search cv
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x10d40f950>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=None,
                                                          max_features='auto',
            

### Due to long execution time I have limited number of parameters to cross validate to minimum.

In [15]:
# create parameters for grid
parameters = {'clf__estimator__min_samples_leaf': [1, 3, 5]} # minimum samples at each leaf node
    
# create grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

In [16]:
# check the grid
cv

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                       

### 7. Test model

In [17]:
# fit the grid (takes forever!)
cv.fit(X_train, Y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                       

In [18]:
# predict on grid
Y_pred = cv.predict(X_test)
print('Done!')

Done!


In [19]:
# get the results
cv_results = get_results(Y_test, Y_pred)
cv_results

Average f_score: 0.9314155019066263
Average precision: 0.9321533391701008
Average recall: 0.9433664844453227


Unnamed: 0,Category,f_score,precision,recall
1,related,0.797461,0.794491,0.807746
2,request,0.870816,0.874203,0.883664
3,offer,0.99332,0.991106,0.995543
4,aid_related,0.747741,0.751736,0.752574
5,medical_help,0.904758,0.908588,0.927002
6,medical_products,0.941381,0.941069,0.955586
7,search_and_rescue,0.958086,0.952986,0.970647
8,security,0.974407,0.966466,0.98248
9,military,0.954263,0.957221,0.967112
10,water,0.945742,0.950335,0.954818


In [20]:
# view the best parameters
cv.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                  

### 8. Try to improve model further.

In [21]:
# testing a Support Vector Classifier
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize))
                    ,('tfidf', TfidfTransformer())
                    ,('clf', MultiOutputClassifier(SVC()))
                    ])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=SVC(C=1.0, cache_size=200,
                                                     class_weight=None,
                               

In [22]:
Y_pred = pipeline.predict(X_test)

In [23]:
results_SVC = get_results(Y_test, Y_pred)
results_SVC

Average f_score: 0.889393331101731
Average precision: 0.8608332711427559
Average recall: 0.9236382796548773


Unnamed: 0,Category,f_score,precision,recall
1,related,0.660672,0.582441,0.763178
2,request,0.744971,0.679541,0.824343
3,offer,0.992629,0.990189,0.995082
4,aid_related,0.435239,0.345542,0.587828
5,medical_help,0.879329,0.84345,0.918396
6,medical_products,0.928893,0.906695,0.952205
7,search_and_rescue,0.955048,0.940664,0.969879
8,security,0.970362,0.960743,0.980175
9,military,0.955733,0.941559,0.97034
10,water,0.902005,0.872202,0.933917


### 9. Export your model as a pickle file

In [66]:
pickle.dump(cv, open('..models/model.pkl', 'wb'))