# ML Pipeline Preparation

### 1. Import libraries and load data from database

In [1]:
# import libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 
from nltk.stem import WordNetLemmatizer

import re 
from nltk import word_tokenize 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kangle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kangle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kangle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///../data/disaster.db')
df = pd.read_sql_table('disaster', con = engine)
X = df.iloc[:,1].values
y = df.iloc[:,5:-1].values

### 2. define tokenization function

In [7]:
def tokenize(text):
    # sentence tokenize 
    sentences = sent_tokenize(text)
    
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for sent_ in sentences:
        # normalization 
        text = re.sub(r"[^a-zA-Z0-9]", " ", sent_.lower())
        # tokenize 
        words = word_tokenize(text)
        # remove stop words 
        words = [word for word in words if not word in stop_words]
        # lemmatization
        for word in words:
            clean_tok = lemmatizer.lemmatize(word).lower().strip()
            clean_tokens.append(clean_tok)
    return clean_tokens           

### 3. Build a machine learning pipeline

In [8]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),      
    ])),

    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

In [9]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'clf', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text_pipeline', 'features__text_pipeline__memory', 'features__text_pipeline__steps', 'features__text_pipeline__verbose', 'features__text_pipeline__vect', 'features__text_pipeline__tfidf', 'features__text_pipeline__vect__analyzer', 'features__text_pipeline__vect__binary', 'features__text_pipeline__vect__decode_error', 'features__text_pipeline__vect__dtype', 'features__text_pipeline__vect__encoding', 'features__text_pipeline__vect__input', 'features__text_pipeline__vect__lowercase', 'features__text_pipeline__vect__max_df', 'features__text_pipeline__vect__max_features', 'features__text_pipeline__vect__min_df', 'features__text_pipeline__vect__ngram_range', 'features__text_pipeline__vect__preprocessor', 'features__text_pipeline__vect__stop_words', 'features__text_pipeline__vect__strip_accents', 'features__text_pipeline

### 4. Use grid search to find better parameters

In [11]:
parameters = {
        #'features__text_pipeline__vect__max_df': (0.5, 1.0),
        #'features__text_pipeline__vect__max_features': (None, 5000, 10000),
        # 'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__class_weight': ['balanced_subsample'],
        'clf__estimator__min_samples_split': [2, 4]
}

cv_model = GridSearchCV(pipeline, param_grid = parameters,n_jobs=-1)

### 5. Train pipeline
- Split data into train and test sets
- Train pipeline

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.3, random_state = 0)
cv_model.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer(tokenizer=<function tokenize at 0x0000022878FF3CA0>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())]))])),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=-1,
             param_grid={'clf__estimator__class_weight': ['balanced_subsample'],
                         'clf__estimator__min_samples_split': [2, 4]})

### 6. Test 
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [15]:
category_names = list(df.columns.values)[5:-1]
y_pred = cv_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=category_names))

                        precision    recall  f1-score   support

               request       0.78      0.60      0.68      1384
           aid_related       0.73      0.74      0.74      3293
          medical_help       0.64      0.21      0.32       635
      medical_products       0.77      0.16      0.26       397
     search_and_rescue       0.75      0.07      0.12       224
              security       0.00      0.00      0.00       129
              military       0.73      0.28      0.40       240
                 water       0.82      0.51      0.63       504
                  food       0.83      0.65      0.73       866
               shelter       0.82      0.43      0.56       708
              clothing       0.79      0.15      0.25       127
                 money       1.00      0.05      0.09       165
        missing_people       0.00      0.00      0.00        96
              refugees       0.53      0.04      0.07       262
                 death       0.82      

### 7. Export model as a pickle file

In [16]:
# from sklearn.externals import joblib
import joblib 
filename = 'classifier.pkl'
joblib.dump(cv_model, filename)

['classifier.pkl']