# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [2]:
# import libraries
from sqlalchemy import create_engine
import pickle

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk import ne_chunk
from nltk.stem.wordnet import WordNetLemmatizer

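# Run once if the NLTK resources are not installed yet ('stopwords' is needed by the tokenizer):
# nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])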

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [3]:
# Open the SQLite database file and read every row of the DisasterResponse
# table into a DataFrame.
engine = create_engine('sqlite:///DisasterResponsePipeline.db')
df = pd.read_sql(sql='SELECT * FROM DisasterResponse', con=engine)

### 2. Write a tokenization function to process your text data

In [8]:
# Compiled once at definition time; tokenize() is invoked for every document.
# Raw strings keep the escaped parentheses from being treated as (invalid)
# Python string escapes.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z0-9]")


def tokenize(text):
    """Turn a raw message into a list of normalized word tokens.

    Steps, in order:
      1. every URL is replaced by the literal token ``urlplaceholder``;
      2. the text is lower-cased, stripped, and every character that is not
         an ASCII letter or digit becomes a space;
      3. the text is split with NLTK's ``word_tokenize``;
      4. English stop words are dropped and the remaining tokens lemmatized.

    Args:
        text: The message to tokenize (a ``str``).

    Returns:
        A list of lemmatized, lower-case tokens.
    """
    # A single regex pass replaces each URL exactly once. Replacing the
    # matched strings one by one with str.replace() could leave fragments
    # behind when one URL is a prefix of another.
    text = _URL_PATTERN.sub("urlplaceholder", text)

    # A set gives constant-time membership tests in the filter below.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = _NON_ALNUM_PATTERN.sub(" ", text.lower().strip())

    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results for the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [11]:
# Model input is the raw message text; the targets are every column from
# position 4 onward (presumably the category labels - confirm against the
# table schema).
X, y = df['message'], df.iloc[:, 4:]

In [None]:
def pipeline(clf):
    """Build the text-classification pipeline around a base estimator.

    The pipeline vectorizes text with ``CountVectorizer`` (using the
    notebook's ``tokenize`` function), applies TF-IDF weighting, and feeds the
    result to ``clf`` wrapped in a ``MultiOutputClassifier``.

    Args:
        clf: The estimator instance to wrap in ``MultiOutputClassifier``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'features'`` and ``'clf'``.
    """
    tfidf_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    feature_union = FeatureUnion([
        ('text_transform', tfidf_branch),
    ])
    multi_output_clf = MultiOutputClassifier(clf)

    return Pipeline([
        ('features', feature_union),
        ('clf', multi_output_clf),
    ])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [12]:
# Hold out part of the data for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
)

In [14]:
# Baseline model: a random forest with default hyper-parameters inside the
# text pipeline. fit() returns the fitted pipeline itself, so it can be chained.
clf_random_forest = pipeline(clf=RandomForestClassifier()).fit(X_train, y_train)
y_pred_clf_random_forest = clf_random_forest.predict(X_test)



### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [17]:
# Overall scores for the baseline model, printed as percentages.
# For multi-label targets accuracy_score is the exact-match (subset) accuracy,
# so a row only counts when every category is predicted correctly.
rf_accuracy = accuracy_score(y_test, y_pred_clf_random_forest)
print("accuracy is ", rf_accuracy * 100)
rf_recall = recall_score(y_test, y_pred_clf_random_forest, average='weighted')
print("The recall is ", rf_recall * 100)
rf_precision = precision_score(y_test, y_pred_clf_random_forest, average='weighted')
print("The precision is ", rf_precision * 100)

# Per-category precision / recall / F1 table, labelled with the target column names.
rf_report = classification_report(
    y_test,
    y_pred_clf_random_forest,
    digits=2,
    target_names=y.columns.values,
)
print(rf_report)

accuracy is  16.21375436465766
The recall is  29.146896484942385
The precision is  41.690876728384715
                        precision    recall  f1-score   support

               related       0.77      0.97      0.86      5018
               request       0.45      0.06      0.10      1103
                 offer       0.00      0.00      0.00        31
           aid_related       0.46      0.22      0.29      2680
          medical_help       0.10      0.00      0.01       523
      medical_products       0.08      0.01      0.01       288
     search_and_rescue       0.00      0.00      0.00       186
              security       0.00      0.00      0.00       129
              military       0.00      0.00      0.00       210
                 water       0.08      0.00      0.00       423
                  food       0.25      0.01      0.03       740
               shelter       0.13      0.01      0.02       548
              clothing       0.20      0.01      0.02       111
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 6. Improve your model
Use grid search to find better parameters. 

In [18]:
# Display every parameter of the fitted pipeline. The nested names in the
# output (e.g. 'features__n_jobs') are the keys a parameter search has to use.
clf_random_forest.get_params()

{'memory': None,
 'steps': [('features',
   FeatureUnion(transformer_list=[('text_transform',
                                   Pipeline(steps=[('vect',
                                                    CountVectorizer(tokenizer=<function tokenize at 0x000001ACB852FC40>)),
                                                   ('tfidf',
                                                    TfidfTransformer())]))])),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'features': FeatureUnion(transformer_list=[('text_transform',
                                 Pipeline(steps=[('vect',
                                                  CountVectorizer(tokenizer=<function tokenize at 0x000001ACB852FC40>)),
                                                 ('tfidf',
                                                  TfidfTransformer())]))]),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'features__n_jobs': None,
 'features__transforme

#### Randomized search is used instead of grid search to reduce the computational cost

In [19]:
clf_random_forest = pipeline(RandomForestClassifier())

# Candidate values for the random forest wrapped by MultiOutputClassifier.
# The 'clf__estimator__' prefix routes each key through the pipeline's 'clf'
# step to the wrapped estimator.
param_distributions = {
    'clf__estimator__max_depth': [None, 10, 20],       # Maximum depth of each tree
    'clf__estimator__min_samples_split': [2, 3, 4],    # Minimum samples required to split a node
}

# The space above holds only 3 x 3 = 9 combinations. Requesting more draws
# than that makes scikit-learn emit a warning and run every combination
# anyway, so cap the request at the size of the space. The candidates tried
# are unchanged; only the warning disappears.
n_candidates = int(np.prod([len(values) for values in param_distributions.values()]))
search_iterations = min(10, n_candidates)

# Set up RandomizedSearchCV with the pipeline and randomized parameter search
clf_random_forest_cv = RandomizedSearchCV(
    clf_random_forest,
    param_distributions=param_distributions,
    cv=2,
    n_iter=search_iterations,
    random_state=42,
)

# Fit the randomized search to the training data
clf_random_forest_cv.fit(X_train, y_train)

# Get the best parameters
print("Best Parameters:", clf_random_forest_cv.best_params_)

# Predict using the best model
y_pred_clf_random_forest_cv = clf_random_forest_cv.predict(X_test)




Best Parameters: {'clf__estimator__min_samples_split': 3, 'clf__estimator__max_depth': 20}


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and pipelines, there is no minimum performance metric needed to pass. However, make sure to fine-tune your models for accuracy, precision, and recall to make your project stand out - especially for your portfolio!

In [20]:
# Overall scores for the tuned model, printed as percentages.
rf_cv_accuracy = accuracy_score(y_test, y_pred_clf_random_forest_cv)
print("accuracy is ", rf_cv_accuracy * 100)
rf_cv_recall = recall_score(y_test, y_pred_clf_random_forest_cv, average='weighted')
print("The recall is ", rf_cv_recall * 100)
rf_cv_precision = precision_score(y_test, y_pred_clf_random_forest_cv, average='weighted')
print("The precision is ", rf_cv_precision * 100)

# Per-category precision / recall / F1 table, labelled with the target column names.
rf_cv_report = classification_report(
    y_test,
    y_pred_clf_random_forest_cv,
    digits=4,
    target_names=y.columns.values,
)
print(rf_cv_report)

accuracy is  20.494914224988612
The recall is  24.5424615086666
The precision is  29.15888293187074
                        precision    recall  f1-score   support

               related     0.7618    1.0000    0.8648      5018
               request     0.0000    0.0000    0.0000      1103
                 offer     0.0000    0.0000    0.0000        31
           aid_related     0.0000    0.0000    0.0000      2680
          medical_help     0.0000    0.0000    0.0000       523
      medical_products     0.0000    0.0000    0.0000       288
     search_and_rescue     0.0000    0.0000    0.0000       186
              security     0.0000    0.0000    0.0000       129
              military     0.0000    0.0000    0.0000       210
                 water     0.0000    0.0000    0.0000       423
                  food     0.0000    0.0000    0.0000       740
               shelter     0.0000    0.0000    0.0000       548
              clothing     0.0000    0.0000    0.0000       111
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [21]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per document.

    The feature is True when any sentence of the document starts with a
    token tagged ``VB`` or ``VBP`` by NLTK's part-of-speech tagger, or with
    the literal word ``RT``.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or 'RT'.

        Args:
            text: The document to inspect (a ``str``).

        Returns:
            True as soon as one sentence's first token is tagged ``VB`` or
            ``VBP`` or equals ``'RT'``; False otherwise.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(word_tokenize(sentence))
            if not pos_tags:
                # A sentence that yields no tokens has no first word; skip
                # it instead of raising IndexError on pos_tags[0].
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ('VB', 'VBP') or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        """No-op fit: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every document in ``X``.

        Args:
            X: Iterable of documents; it is wrapped in a ``pandas.Series``.

        Returns:
            A single-column ``pandas.DataFrame`` of booleans, one row per
            document.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [22]:
def pipeline_2(clf):
    """Build the extended pipeline: TF-IDF features plus the starting-verb flag.

    The ``'features'`` step is a ``FeatureUnion`` of two branches: the
    CountVectorizer/TF-IDF text branch and a ``StartingVerbExtractor``. The
    combined features are passed to ``clf`` wrapped in a
    ``MultiOutputClassifier``.

    Args:
        clf: The estimator instance to wrap in ``MultiOutputClassifier``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'features'`` and ``'clf'``.
    """
    tfidf_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    feature_union = FeatureUnion([
        ('text_transform', tfidf_branch),
        ('starting_verb', StartingVerbExtractor()),
    ])
    multi_output_clf = MultiOutputClassifier(clf)

    return Pipeline([
        ('features', feature_union),
        ('clf', multi_output_clf),
    ])

In [23]:
# 2. Ada Boost Classifier

# Train the extended pipeline with a seeded 50-estimator AdaBoost model.
ada_estimator = AdaBoostClassifier(n_estimators=50, random_state=42)
clf_ada = pipeline_2(clf=ada_estimator).fit(X_train, y_train)
y_pred_clf_ada = clf_ada.predict(X_test)

# Overall scores, printed as percentages.
ada_accuracy = accuracy_score(y_test, y_pred_clf_ada)
print("accuracy is ", ada_accuracy * 100)
ada_recall = recall_score(y_test, y_pred_clf_ada, average='weighted')
print("The recall is ", ada_recall * 100)
ada_precision = precision_score(y_test, y_pred_clf_ada, average='weighted')
print("The precision is ", ada_precision * 100)

# Per-category precision / recall / F1 table, labelled with the target column names.
ada_report = classification_report(
    y_test,
    y_pred_clf_ada,
    digits=4,
    target_names=y.columns.values,
)
print(ada_report)



accuracy is  17.23090936693487
The recall is  28.696620509344438
The precision is  41.418767538586785
                        precision    recall  f1-score   support

               related     0.7663    0.9843    0.8617      5018
               request     0.4069    0.1287    0.1956      1103
                 offer     0.0000    0.0000    0.0000        31
           aid_related     0.4852    0.0978    0.1627      2680
          medical_help     0.0000    0.0000    0.0000       523
      medical_products     0.0000    0.0000    0.0000       288
     search_and_rescue     0.0000    0.0000    0.0000       186
              security     0.0000    0.0000    0.0000       129
              military     0.0000    0.0000    0.0000       210
                 water     0.2500    0.0024    0.0047       423
                  food     0.3750    0.0162    0.0311       740
               shelter     0.3333    0.0018    0.0036       548
              clothing     0.1250    0.0090    0.0168       111
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# 3. Extreme Gradient Boosting Classifier

# Train the extended pipeline with an XGBoost model using default settings.
clf_XGB = pipeline_2(XGBClassifier()).fit(X_train, y_train)
y_pred_clf_XGB = clf_XGB.predict(X_test)

# Overall scores, printed as percentages.
xgb_accuracy = accuracy_score(y_test, y_pred_clf_XGB)
print("accuracy is ", xgb_accuracy * 100)
xgb_recall = recall_score(y_test, y_pred_clf_XGB, average='weighted')
print("The recall is ", xgb_recall * 100)
xgb_precision = precision_score(y_test, y_pred_clf_XGB, average='weighted')
print("The precision is ", xgb_precision * 100)

# Per-category precision / recall / F1 table, labelled with the target column names.
xgb_report = classification_report(
    y_test,
    y_pred_clf_XGB,
    digits=4,
    target_names=y.columns.values,
)
print(xgb_report)



accuracy is  16.259298618490966
The recall is  30.10070688486492
The precision is  42.47060533692547
                        precision    recall  f1-score   support

               related     0.7654    0.9845    0.8612      5018
               request     0.4260    0.1278    0.1967      1103
                 offer     0.0000    0.0000    0.0000        31
           aid_related     0.4513    0.1832    0.2606      2680
          medical_help     0.0000    0.0000    0.0000       523
      medical_products     0.3333    0.0035    0.0069       288
     search_and_rescue     0.0000    0.0000    0.0000       186
              security     0.0000    0.0000    0.0000       129
              military     0.0000    0.0000    0.0000       210
                 water     0.0000    0.0000    0.0000       423
                  food     0.3548    0.0149    0.0285       740
               shelter     0.0000    0.0000    0.0000       548
              clothing     0.5000    0.0180    0.0348       111
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 9. Export your model as a pickle file

In [26]:
# Serialize the fitted XGBoost pipeline to classifier.pkl in the working directory.
# NOTE(review): the pipeline references the notebook's tokenize function and
# StartingVerbExtractor class, so they presumably must be importable wherever
# the pickle is loaded - confirm in the consuming code.
with open('classifier.pkl', mode='wb') as model_file:
    pickle.dump(clf_XGB, model_file)