# ML Pipeline Preparation

**1. Import libraries and load data from database.**

In [10]:
# Import libraries

import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\izzit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\izzit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\izzit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [21]:
# Import more libraries

import re
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.base import BaseEstimator, TransformerMixin


In [25]:
# Load data from database

engine = create_engine('sqlite:///disasterMessage.db')
df = pd.read_sql("SELECT * FROM myMessage", engine)
X = df.message.values
y = df.drop(columns =['id', 'message', 'genre', 'categories'], axis=1).values


In [22]:
df.head()

Unnamed: 0,id,message,genre,categories,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,direct,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
print(X)

['Weather update - a cold front from Cuba that could pass over Haiti'
 'Is the Hurricane over or is it not over'
 'Looking for someone but no name' ...
 "Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families."
 'Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.'
 'A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.']


In [24]:
print(y)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [26]:
y_columns = list(df.columns[4:,])
y_columns

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

**2. Write a tokenization function to process your text data**

In [27]:
def tokenizes(phrase):

    token = word_tokenize(phrase)
    lemmatize = WordNetLemmatizer()

    tidytoken = []
    for t in token:
        newtoken = lemmatize.lemmatize(t).lower().strip()
        tidytoken.append(newtoken)

    return tidytoken


**3. Build a machine learning pipeline**

In [28]:
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenizes)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultiOutputClassifier(RandomForestClassifier
                                                   (n_estimators=10, random_state=1, n_jobs=2)))]) 


**4. Train pipeline**

In [29]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Train pipeline
pipeline.fit(X_train, y_train)


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenizes at 0x0000011C8A62D160>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=10,
                                                                        n_jobs=2,
                                                                        random_state=1)))])

**5. Test your model**

In [30]:
pred = pipeline.predict(X_test)

for i in range(len(y_columns)):
    print('Category: {} '.format(y_columns[i]))
    print(classification_report(y_test[:, i], pred[:, i]))
    print('Accuracy {}\n'.format(accuracy_score(y_test[:, i], pred[:, i])))


Category: related 
              precision    recall  f1-score   support

           0       0.62      0.35      0.44      1527
           1       0.82      0.93      0.87      4984
           2       0.30      0.07      0.11        43

    accuracy                           0.79      6554
   macro avg       0.58      0.45      0.48      6554
weighted avg       0.77      0.79      0.77      6554

Accuracy 0.7914250839182179

Category: request 
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      5462
           1       0.80      0.36      0.49      1092

    accuracy                           0.88      6554
   macro avg       0.84      0.67      0.71      6554
weighted avg       0.87      0.88      0.86      6554

Accuracy 0.878089716203845

Category: offer 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6516
           1       0.00      0.00      0.00        38

    accuracy    

  _warn_prf(average, modifier, msg_start, len(result))



Accuracy 0.9552944766554776

Category: weather_related 
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      4742
           1       0.83      0.50      0.63      1812

    accuracy                           0.83      6554
   macro avg       0.83      0.73      0.76      6554
weighted avg       0.83      0.83      0.82      6554

Accuracy 0.8347574000610314

Category: floods 
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      6030
           1       0.87      0.19      0.31       524

    accuracy                           0.93      6554
   macro avg       0.90      0.59      0.64      6554
weighted avg       0.93      0.93      0.91      6554

Accuracy 0.9327128471162649

Category: storm 
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      5965
           1       0.71      0.39      0.50       589

    accuracy                    

**6. Improve your model**

In [20]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)


def load_data():
    engine = create_engine('sqlite:///disasterMessage.db')
    df = pd.read_sql("SELECT * FROM myMessage", engine)
    X = df.message.values
    y = df.drop(columns =['id', 'message', 'genre', 'categories'], axis=1).values
    return X, y


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def build_model():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=10, random_state=1, n_jobs=2)))
    ])
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__min_samples_split': [2, 3, 4],
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
        )
    }
    print(pipeline.get_params().keys())
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=3)
    return cv


def display_results(cv, y_test, y_pred):
    for i in range(len(y_columns)):    
        labels = np.unique(y_pred[:, i])
        confusion_mat = confusion_matrix(y_test[:, i], y_pred[:, i], labels=labels)
        accuracy = (y_pred[:, i] == y_test[:, i]).mean()

        print("Labels:", labels)
        print("Confusion Matrix:\n", confusion_mat)
        print("Accuracy:", accuracy)
        print("\nBest Parameters:", cv.best_params_)


def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    display_results(model, y_test, pred)

main()



dict_keys(['memory', 'steps', 'verbose', 'features', 'clf', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text_pipeline', 'features__starting_verb', 'features__text_pipeline__memory', 'features__text_pipeline__steps', 'features__text_pipeline__verbose', 'features__text_pipeline__vect', 'features__text_pipeline__tfidf', 'features__text_pipeline__vect__analyzer', 'features__text_pipeline__vect__binary', 'features__text_pipeline__vect__decode_error', 'features__text_pipeline__vect__dtype', 'features__text_pipeline__vect__encoding', 'features__text_pipeline__vect__input', 'features__text_pipeline__vect__lowercase', 'features__text_pipeline__vect__max_df', 'features__text_pipeline__vect__max_features', 'features__text_pipeline__vect__min_df', 'features__text_pipeline__vect__ngram_range', 'features__text_pipeline__vect__preprocessor', 'features__text_pipeline__vect__stop_words', 'features__text_pipeline__vect__strip_accents

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.232, total= 1.0min
[CV] clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.230, total= 1.1min
[CV] clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.218, total= 1.0min
[CV] clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 
[CV]  clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.217, total=  58.1s
[CV] clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 
[CV]  clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_range=(1, 1), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.237, total=  56.9s
[CV] clf__estimator__min_samples_split=2, features__text_pipeline__vect__ngram_ra

[CV]  clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_range=(1, 2), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.219, total= 1.3min
[CV] clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_range=(1, 2), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 
[CV]  clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_range=(1, 2), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.222, total= 1.3min
[CV] clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_range=(1, 2), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5} 
[CV]  clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_range=(1, 2), features__transformer_weights={'text_pipeline': 1, 'starting_verb': 0.5}, score=0.215, total= 1.3min
[CV] clf__estimator__min_samples_split=4, features__text_pipeline__vect__ngram_ra

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 34.2min finished


Labels: [0 1 2]
Confusion Matrix:
 [[ 545 1016    2]
 [ 316 4616    5]
 [   3   43    8]]
Accuracy: 0.7886786695148001

Best Parameters: {'clf__estimator__min_samples_split': 2, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
Labels: [0 1]
Confusion Matrix:
 [[5376   82]
 [ 696  400]]
Accuracy: 0.8812938663411657

Best Parameters: {'clf__estimator__min_samples_split': 2, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
Labels: [0]
Confusion Matrix:
 [[6515]]
Accuracy: 0.9940494354592615

Best Parameters: {'clf__estimator__min_samples_split': 2, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
Labels: [0 1]
Confusion Matrix:
 [[3378  467]
 [1386 1323]]
Accuracy: 0.7172718950259384

Best Parameters: {'clf__estimator__min_samples_split': 2, 'feat

Labels: [0 1]
Confusion Matrix:
 [[6218    2]
 [ 328    6]]
Accuracy: 0.9496490692706744

Best Parameters: {'clf__estimator__min_samples_split': 2, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
Labels: [0 1]
Confusion Matrix:
 [[5184  114]
 [ 880  376]]
Accuracy: 0.8483368935001526

Best Parameters: {'clf__estimator__min_samples_split': 2, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
successful


**7. Test your model**