In [1]:
# Import Libraries

import sys
import sqlite3
import pandas as pd
from sqlalchemy import create_engine
from string import punctuation
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.ensemble import AdaBoostClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

import nltk
# Fetch the NLTK resources needed further down: 'punkt' (presumably the
# tokenizer models behind word_tokenize) and 'wordnet' (presumably the corpus
# behind WordNetLemmatizer). Per the log below, an already-installed package
# is reported as up-to-date rather than downloaded again.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anubhav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anubhav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def load_data(database_filepath):
    """
    Read the 'messages' table of a SQLite database and split it into
    model inputs and targets.

    Input:
        database_filepath: path to the SQLite database file
    Output:
        X: the 'message' column of the table
        y: DataFrame holding every column from position 4 onward (targets)
        category_names: list with the names of the target columns
    """
    # build the SQLAlchemy URL and pull the whole table into a DataFrame
    connection_url = 'sqlite:///' + database_filepath
    messages_df = pd.read_sql_table('messages', create_engine(connection_url))

    # columns before position 4 are not used as targets
    texts = messages_df.message
    targets = messages_df.iloc[:, 4:]

    return texts, targets, list(messages_df.columns[4:])

In [3]:
def tokenize(text):
    """
    Tokenization function to process the text data: tokenize, normalize case,
    and lemmatize.

    Each token is lower-cased and stripped BEFORE it is lemmatized. The
    WordNet lemmatizer looks words up case-sensitively, so lemmatizing first
    would leave capitalized or upper-case tokens (e.g. "Houses") unchanged
    and produce separate vocabulary entries for the same word.

    Input:
        text: raw text of a single message
    Output:
        List of clean tokens (lower-case, lemmatized)
    """
    # split the message into word/punctuation tokens
    tokens = word_tokenize(text)

    # initiate lemmatizer once and reuse it for every token
    lemmatizer = WordNetLemmatizer()

    # normalize case and white space first, then lemmatize the normalized form
    return [lemmatizer.lemmatize(token.lower().strip()) for token in tokens]

In [4]:
def build_model():
    """
    Assemble the text-classification pipeline and wrap it in a grid search.

    The pipeline counts tokens (using the notebook's tokenize function),
    applies TF-IDF weighting, and fits one AdaBoost classifier per output
    column through MultiOutputClassifier.

    Input:
        None
    Output:
        Unfitted GridSearchCV object wrapping the pipeline
    """
    # pipeline stages, in execution order
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(AdaBoostClassifier())),
    ]

    # hyper-parameter values explored by the grid search
    search_space = {
        'tfidf__norm': ['l2', 'l1'],
        'vect__stop_words': ['english', None],
        'clf__estimator__learning_rate': [0.1, 0.5, 1, 2],
        'clf__estimator__n_estimators': [50, 60, 70],
    }

    return GridSearchCV(estimator=Pipeline(steps), param_grid=search_space)

In [5]:
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Print one classification report per target category.

    Input:
        model: trained model exposing predict()
        X_test: test inputs to predict on
        Y_test: true labels for X_test; indexed by column position via .iloc
        category_names: category labels, in the same order as the columns
            of Y_test and of the prediction array
    Output:
        None (reports are written to stdout)
    """
    predictions = model.predict(X_test)

    for position, name in enumerate(category_names):
        # header first, then the report for the column at the same position
        print('{} category metrics: '.format(name))
        truth = Y_test.iloc[:, position]
        predicted = predictions[:, position]
        print(classification_report(truth, predicted))

In [6]:
def save_model(model, model_filepath):
    """
    Persist a model to disk with joblib.

    Input:
        model: trained model to serialize
        model_filepath: destination path of the serialized model
    Output: None
    """
    # the value returned by joblib.dump is intentionally discarded
    joblib.dump(value=model, filename=model_filepath)

In [7]:
def main(database_filepath, model_filepath, test_size=0.2, random_state=42):
    """
    Run the whole training workflow: load data, split, train, evaluate, save.

    Input:
        database_filepath: path of the SQLite database read by load_data
        model_filepath: path the trained model is written to by save_model
        test_size: share of the data held out for evaluation (default 0.2)
        random_state: seed for the train/test split; a fixed seed keeps the
            split, and therefore the printed metrics, comparable across runs.
            Pass None for a different random split on every call.
    Output:
        None
    """
    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    # seeded split so that repeated runs evaluate on the same held-out rows
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    print('Building model...')
    model = build_model()

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')

In [None]:
# Input database and output model locations; both are relative paths, so they
# resolve against the notebook's current working directory.
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'data/classifier.pkl'
# Run the complete workflow defined in main(): load, train, evaluate, save.
main(database_filepath,model_filepath)

Loading data...
    DATABASE: data/DisasterResponse.db
Building model...
Training model...


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
