# ML Pipeline Preparation

## Importing libraries

In [2]:
import pandas as pd

from sqlalchemy import create_engine

import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, make_scorer, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Loading data from database

In [3]:
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', con=engine)

## Defining feature and target variables X and Y

In [4]:
X = df.loc[:, 'message']
y = df.iloc[:, 4:]

## Tokenization function to process text data

In [5]:
def tokenize(text):
    """
    INPUT:
    OUTPUT:
    """
    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Split text into words using NLTK
    words = word_tokenize(text)
    # Remove stop words
    words = [w for w in words if w not in stopwords.words("english")]
    # Reduce words to their root form
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
    # Lemmatize verbs by specifying pos
    lemmed = [WordNetLemmatizer().lemmatize(w, pos='v').strip() for w in lemmed]
    # Reduce words to their stems
    stemmed = [PorterStemmer().stem(w) for w in lemmed]
    tokenized = stemmed
    return tokenized

## Building a machine learning pipeline
This machine pipeline takes in the `message` column as input and output classification results on the other 36 categories in the dataset.

In [6]:
random_forest_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

## Split data into train and test sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Training the pipeline

In [8]:
random_forest_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

## Testing the model
Reporting the f1 score, precision and recall for each output category of the dataset.

In [9]:
y_pred = random_forest_pipeline.predict(X_test)

In [10]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      5026
               request       0.83      0.49      0.62      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.77      0.68      0.73      2819
          medical_help       0.67      0.08      0.13       533
      medical_products       0.77      0.07      0.14       324
     search_and_rescue       0.67      0.03      0.06       180
              security       0.33      0.01      0.02       122
              military       0.77      0.08      0.14       221
           child_alone       0.00      0.00      0.00         0
                 water       0.91      0.35      0.50       465
                  food       0.87      0.63      0.73       770
               shelter       0.88      0.36      0.51       566
              clothing       0.75      0.09      0.16        99
                 money       0.80      

### Trying other machine learning algorithms

#### Decision Tree

In [11]:
decision_tree_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
])

In [12]:
decision_tree_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier()))])

In [13]:
y_pred = decision_tree_pipeline.predict(X_test)

Reporting the f1 score, precision and recall for each output category of the dataset.

In [14]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.85      0.85      0.85      5026
               request       0.59      0.55      0.57      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.67      0.63      0.65      2819
          medical_help       0.37      0.32      0.34       533
      medical_products       0.40      0.41      0.41       324
     search_and_rescue       0.31      0.24      0.27       180
              security       0.06      0.05      0.06       122
              military       0.40      0.40      0.40       221
           child_alone       0.00      0.00      0.00         0
                 water       0.68      0.62      0.65       465
                  food       0.73      0.72      0.73       770
               shelter       0.63      0.62      0.62       566
              clothing       0.55      0.53      0.54        99
                 money       0.43      

#### KNeighbors

In [15]:
kn_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(KNeighborsClassifier()))
])

In [16]:
kn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier()))])

In [17]:
y_pred = kn_pipeline.predict(X_test)

Reporting the f1 score, precision and recall for each output category of the dataset.

In [18]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.83      0.93      0.88      5026
               request       0.75      0.45      0.56      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.71      0.42      0.53      2819
          medical_help       0.63      0.09      0.15       533
      medical_products       0.62      0.09      0.15       324
     search_and_rescue       0.90      0.05      0.09       180
              security       0.00      0.00      0.00       122
              military       0.59      0.09      0.15       221
           child_alone       0.00      0.00      0.00         0
                 water       0.77      0.20      0.31       465
                  food       0.74      0.33      0.45       770
               shelter       0.73      0.19      0.30       566
              clothing       0.70      0.16      0.26        99
                 money       0.60      

Since this dataset is imbalanced (ie some labels like water have few examples). I'm going to choose the machine learning algorithm that performs best at the f1_macro score, because some labels that have few frequency can actually be very important, like water. And with the f1_macro score we also take into account precision and recall

## Improving the model further

### Trying adding other features

In [19]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        # tokenize by sentences
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            # tokenize each sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if len(pos_tags) > 1:
                # index pos_tags to get the first word and part of speech tag
                first_word, first_tag = pos_tags[0]
                # return true if the first word is an appropriate verb or RT for retweet
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return 1
        return 0

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [20]:
class TextLengthExtractor(BaseEstimator, TransformerMixin):

    def textlength(self, text):
        text_len = len(text.strip())
        return text_len

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.textlength)
        return pd.DataFrame(X_tagged)

In [21]:
pipeline = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        
        ('txt_length', TextLengthExtractor()),
        ('starting_verb', StartingVerbExtractor())
    ])),

    ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
])

In [22]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('txt_length',
                                                 TextLengthExtractor()),
                                                ('starting_verb',
                                                 StartingVerbExtractor())])),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier()))])

In [23]:
y_pred = pipeline.predict(X_test)

Reporting the f1 score, precision and recall for each output category of the dataset.

In [24]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.84      0.84      0.84      5026
               request       0.59      0.55      0.57      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.66      0.64      0.65      2819
          medical_help       0.35      0.32      0.33       533
      medical_products       0.40      0.41      0.41       324
     search_and_rescue       0.29      0.25      0.27       180
              security       0.08      0.07      0.07       122
              military       0.42      0.43      0.43       221
           child_alone       0.00      0.00      0.00         0
                 water       0.66      0.60      0.63       465
                  food       0.74      0.72      0.73       770
               shelter       0.65      0.64      0.65       566
              clothing       0.56      0.52      0.54        99
                 money       0.39      

Adding these 2 features actually decreased the f1_macro score, I'm going to try removing the Starting with Verb Feature, since this feature is not very robust

In [25]:
pipeline = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        
        ('txt_length', TextLengthExtractor())
    ])),

    ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
])

In [26]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('txt_length',
                                                 TextLengthExtractor())])),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier()))])

In [27]:
y_pred = pipeline.predict(X_test)

Reporting the f1 score, precision and recall for each output category of the dataset.

In [28]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.84      0.83      0.84      5026
               request       0.58      0.55      0.56      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.66      0.64      0.65      2819
          medical_help       0.37      0.34      0.35       533
      medical_products       0.40      0.41      0.40       324
     search_and_rescue       0.26      0.23      0.25       180
              security       0.06      0.04      0.05       122
              military       0.38      0.37      0.38       221
           child_alone       0.00      0.00      0.00         0
                 water       0.68      0.60      0.64       465
                  food       0.75      0.71      0.73       770
               shelter       0.63      0.61      0.62       566
              clothing       0.56      0.55      0.55        99
                 money       0.37      

Still the Original pipeline performed better than with these 2 features added

## Improving the model

### Fine tuning the model for accuracy, precision and recall

In [29]:
scoring = {"Accuracy": "accuracy", "Precision": make_scorer(precision_score, average='macro', zero_division=0), 
           "Recall": make_scorer(recall_score, average='macro', zero_division=0), 
           "F1": make_scorer(f1_score, average='macro', zero_division=0)}

### Using grid search to find better parameters

In [30]:
# USE THIS PARAMETERS AND GRIDSEARCHCV WHEN COMPUTATIONAL POWER AVAILABLE
# parameters = {
#     'vect__ngram_range': ((1, 1), (1, 2)),
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 5000, 10000),
#     'tfidf__use_idf': (True, False),
#     'clf__estimator__criterion': ['gini','entropy'],
#     'clf__estimator__min_samples_split': [2, 3, 4],
# }
#
# decision_tree_cv = GridSearchCV(random_forest_pipeline, param_grid=parameters, return_train_score=True, 
#                         refit='F1', scoring = scoring)

parameters = {
    'clf__estimator__criterion': ['gini','entropy']
}

decision_tree_cv = GridSearchCV(decision_tree_pipeline, param_grid=parameters, verbose=3, cv=2, return_train_score=True, 
                        refit='F1', scoring = scoring)

In [31]:
decision_tree_cv.fit(X_train, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END clf__estimator__criterion=gini; Accuracy: (train=0.997, test=0.174) F1: (train=0.971, test=0.366) Precision: (train=0.972, test=0.386) Recall: (train=0.969, test=0.352) total time=11.3min
[CV 2/2] END clf__estimator__criterion=gini; Accuracy: (train=0.997, test=0.166) F1: (train=0.971, test=0.370) Precision: (train=0.972, test=0.382) Recall: (train=0.971, test=0.359) total time=12.0min
[CV 1/2] END clf__estimator__criterion=entropy; Accuracy: (train=0.997, test=0.175) F1: (train=0.971, test=0.372) Precision: (train=0.972, test=0.394) Recall: (train=0.969, test=0.355) total time=10.2min
[CV 2/2] END clf__estimator__criterion=entropy; Accuracy: (train=0.997, test=0.170) F1: (train=0.971, test=0.371) Precision: (train=0.972, test=0.391) Recall: (train=0.971, test=0.355) total time= 9.4min


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier()))]),
             param_grid={'clf__estimator__criterion': ['gini', 'entropy']},
             refit='F1', return_train_score=True,
             scoring={'Accuracy': 'accuracy',
                      'F1': make_scorer(f1_score, average=macro, zero_division=0),
                      'Precision': make_scorer(precision_score, average=macro, zero_division=0),
                      'Recall': make_scorer(recall_score, average=macro, zero_division=0)},
             verbose=3)

## Showing the accuracy, precision, and recall of the models.  

In [32]:
rows = []
cvres = decision_tree_cv.cv_results_
for mean_test_Accuracy, mean_test_Precision, mean_test_Recall, mean_test_F1, params in zip(cvres['params'], 
                                                                                           cvres['mean_test_Accuracy'], 
                                                                                           cvres['mean_test_Precision'], 
                                                                                           cvres['mean_test_Recall'], 
                                                                                           cvres['mean_test_F1']):
    rows.append([mean_test_Accuracy, mean_test_Precision, mean_test_Recall, mean_test_F1, params])
scores = pd.DataFrame(rows, columns=["Params", "Accuracy", "Precision", "Recall", "F1"])
scores

Unnamed: 0,Params,Accuracy,Precision,Recall,F1
0,{'clf__estimator__criterion': 'gini'},0.170022,0.383888,0.355569,0.367836
1,{'clf__estimator__criterion': 'entropy'},0.172378,0.39246,0.355098,0.371523


## Testing the best model

In [33]:
decision_tree_cv

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier()))]),
             param_grid={'clf__estimator__criterion': ['gini', 'entropy']},
             refit='F1', return_train_score=True,
             scoring={'Accuracy': 'accuracy',
                      'F1': make_scorer(f1_score, average=macro, zero_division=0),
                      'Precision': make_scorer(precision_score, average=macro, zero_division=0),
                      'Recall': make_scorer(recall_score, average=macro, zero_division=0)},
             verbose=3)

In [34]:
best_estimator = decision_tree_cv.best_estimator_
best_estimator

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000197A83578B0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(criterion='entropy')))])

In [35]:
y_pred = best_estimator.predict(X_test)

Reporting the f1 score, precision and recall for each output category of the dataset.

In [36]:
print(classification_report(y_test, y_pred, zero_division=0, target_names=y.columns))

                        precision    recall  f1-score   support

               related       0.85      0.86      0.86      5026
               request       0.58      0.54      0.56      1184
                 offer       0.00      0.00      0.00        32
           aid_related       0.67      0.64      0.66      2819
          medical_help       0.38      0.31      0.34       533
      medical_products       0.46      0.40      0.43       324
     search_and_rescue       0.26      0.19      0.22       180
              security       0.12      0.08      0.10       122
              military       0.43      0.38      0.40       221
           child_alone       0.00      0.00      0.00         0
                 water       0.69      0.60      0.64       465
                  food       0.76      0.73      0.74       770
               shelter       0.63      0.61      0.62       566
              clothing       0.54      0.51      0.52        99
                 money       0.37      

### Exporting the model as a pickle file

In [38]:
import joblib
joblib.dump(best_estimator, "models/classifier.pkl")

['models/classifier.pkl']