# ML Pipeline Preparation
I will build an effective machine learning pipeline that help me classifies the disaster messages
### 1. Import libraries and load data from database.
- Importing Python libraries
- Loading dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Defining feature and target variables X and Y

In [81]:
# import libraries
import nltk
nltk.download(['punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'])
import json
import joblib
import numpy as np
import pandas as pd
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sqlalchemy import create_engine
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
# load data from database
def load_data(database = '../data/disaster-response-messages.db', sqltable= 'messages'):
    engine = create_engine('sqlite:///'+database)
    df = pd.read_sql_table(sqltable, con = engine)
    X = df['message']
    Y = df.iloc[:, 4:]
    return X,Y


In [None]:
X, y = load_data()

In [83]:
y

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26214,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Writing a tokenization function to process my text data

In [84]:
#define the regular expression for the URL
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
punctuations_list = string.punctuation

In [85]:
def tokenize(text):
            """
            Tokenize function
            
            Arguments:
                text -> list of text messages (Arabic)
            Output:
                clean_tokens -> tokenized text, clean for ML modeling
            """
            
            #replacing any url with "urlplaceholder" string
            detected_urls = re.findall(url_regex, text)
            for url in detected_urls:
                text = text.replace(url, "urlplaceholder")
                
            #normalize text
            text = text.lower()
            
            #tokenizing the text message
            tokens = word_tokenize(text)
            
            #defining the lemmatization object 
            lemmatizer = WordNetLemmatizer()
            
            #defining a translator object to remove all punctations
            table = str.maketrans('', '', punctuations_list)

            clean_tokens = []
            #cleaning every token by stemming and removing punctations and appending to the clean list
            for tok in tokens:
                clean_tok = lemmatizer.lemmatize(tok)     
                clean_tok = clean_tok.translate(table)        
                clean_tokens.append(clean_tok)
            
            #removing the stopwords from the clean_tokens list
            clean_tokens = [w for w in clean_tokens if w != '' and w not in stopwords.words('english')]


            return clean_tokens

In [86]:
#test tokenize function on the first five messages
for x in X[:5]:
    print(x)
    print(tokenize(x))

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']
Is the Hurricane over or is it not over
['hurricane']
Looking for someone but no name
['looking', 'someone', 'name']
UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '8090', 'destroyed', 'hospital', 'st', 'croix', 'functioning', 'need', 'supply', 'desperately']
says: west side of Haiti, rest of the country today and tonight
['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight']


### 3. Building a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. 

In [96]:
pipeline = Pipeline([
    ('tfidfvect',TfidfVectorizer(tokenizer=tokenize)),
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))
])

### 4. Training pipeline
- Spliting data into train and test sets
- Training pipeline

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [98]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidfvect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x00000265D0C4C950>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(algorithm='SAMME.R',
                 

### 5. Testing my model
Reporting the f1 score, precision and recall for each output category of the dataset. 

In [103]:
Y_pred = pipeline.predict(X_test)


In [104]:

for i,col in enumerate(y_test.columns):
    print(col+'\n' ,classification_report(y_test.loc[:,col], Y_pred[:,i]))
        

related
               precision    recall  f1-score   support

           0       0.67      0.12      0.20      1563
           1       0.77      0.98      0.87      4944
           2       0.46      0.13      0.20        47

    accuracy                           0.77      6554
   macro avg       0.64      0.41      0.42      6554
weighted avg       0.75      0.77      0.70      6554

request
               precision    recall  f1-score   support

           0       0.90      0.97      0.94      5443
           1       0.77      0.50      0.61      1111

    accuracy                           0.89      6554
   macro avg       0.84      0.73      0.77      6554
weighted avg       0.88      0.89      0.88      6554

offer
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6521
           1       0.00      0.00      0.00        33

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      

### 6. Improving your model
Use grid search to find better parameters. 

In [107]:
parameters = {
        'tfidfvect__ngram_range': ((1, 1), (1, 2)),
        'tfidfvect__max_df': (0.5, 1.0),
        'tfidfvect__max_features': (None, 5000),
        'clf__estimator__n_estimators': [50, 100] 
}

cv = GridSearchCV(pipeline, param_grid=parameters)


In [108]:
cv.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                

### 7. Testing my best model
Showing the accuracy, precision, and recall of the tuned model.  



In [109]:
Y_pred = cv.predict(X_test)

#Test the model using accuracy, precision, recall
for i,col in enumerate(y_test.columns):
    accuracy = accuracy_score(y_test.iloc[:,i], Y_pred[:,i])
    precision = precision_score(y_test.iloc[:,i], Y_pred[:,i], average='micro')
    recall = recall_score(y_test.iloc[:,i], Y_pred[:,i], average='micro')
    print(col+'\n' , "accuracy: {}\tprecision: {}\trecall: {} \n".format(accuracy, precision, recall))

related
 accuracy: 0.7718950259383582	precision: 0.7718950259383582	recall: 0.7718950259383582 

request
 accuracy: 0.8947207812023192	precision: 0.8947207812023192	recall: 0.8947207812023192 

offer
 accuracy: 0.9945071711931645	precision: 0.9945071711931645	recall: 0.9945071711931645 

aid_related
 accuracy: 0.7691486115349405	precision: 0.7691486115349405	recall: 0.7691486115349405 

medical_help
 accuracy: 0.9252364967958498	precision: 0.9252364967958498	recall: 0.9252364967958498 

medical_products
 accuracy: 0.954226426609704	precision: 0.954226426609704	recall: 0.954226426609704 

search_and_rescue
 accuracy: 0.9752822703692402	precision: 0.9752822703692402	recall: 0.9752822703692402 

security
 accuracy: 0.9800122062862374	precision: 0.9800122062862374	recall: 0.9800122062862374 

military
 accuracy: 0.9711626487641135	precision: 0.9711626487641135	recall: 0.9711626487641135 

child_alone
 accuracy: 1.0	precision: 1.0	recall: 1.0 

water
 accuracy: 0.9642966127555691	precision:

### 8. Try improving my model further:
* trying other machine learning algorithms (RandomForest Classifier)
* adding other feature besides the TF-IDF (message length)

In [111]:
class MessageLength(BaseEstimator, TransformerMixin):
    """
     Message Length Extractor class
    
    This class extract the message length ,
    creating a new feature for the ML classifier
    """

    def computing_message_length(self, text):
        return len(text)

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.computing_message_length)
        return pd.DataFrame(X_tagged)

In [112]:
pipeline = Pipeline([
        ('features', FeatureUnion([
            ('tfidfvect',TfidfVectorizer(tokenizer=tokenize)),
            ('computing_message_length',MessageLength())       
            
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
parameters = {
        'features__tfidfvect__ngram_range': ((1, 1), (1, 2)),
        'features__tfidfvect__max_df': (0.75, 1.0),
        'features__tfidfvect__max_features': (None, 5000),
        'clf__estimator__n_estimators': [50, 100] ,
        'features__transformer_weights': (
            {'tfidfvect': 1,  'computing_message_length' : 0.5}, 
            {'tfidfvect': 0.5,   'computing_message_length' : 1})
             
        
    
    }

cv1 = GridSearchCV(pipeline, param_grid=parameters)


In [None]:
cv1.fit(X_train, y_train)



In [None]:
Y_pred = cv.predict(X_test)


In [None]:
for i,col in enumerate(y_test.columns):
    
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(Y_test.iloc[:,i], y_pred[:,i], labels=labels)
    accuracy = (y_pred[:,i] == Y_test.iloc[:,i]).mean()
    class_report = classification_report(Y_test.iloc[:,i], y_pred[:,i])
    f1_sc = f1_score(Y_test.iloc[:,i], y_pred[:,i])
    prec = precision_score(Y_test.iloc[:,i], y_pred[:,i])

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy: ", accuracy)
    print("precision: ", prec)
    print("f1_score: ", f1_sc)
    print("\nClassification report:\n ", class_report ) 

print("\nBest Parameters: ", cv.best_params_)


### 9. Export your model as a pickle file

In [110]:
joblib.dump(cv, './models/clf.pkl')

['./models/clf.pkl']