In [2]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import sys
import pandas as pd
from sqlalchemy import create_engine


def load_data(messages_filepath, categories_filepath):
    """
    Function to :
        - Loads data from csv files into dataframes and merge them afterwards
    
    Args:
        messages_filepath (str): File path of messages
        categories_filepath (str): File pathe of categories
    
    Returns:
        pandas dataframe: Merged dataframe containing messages and categories
    """
    messages = pd.read_csv(messages_filepath)
    categories = pd.read_csv(categories_filepath)
        
    return pd.merge(messages, categories, how = 'inner', on = ['id'])


def clean_data(df):
    """
    Function to :
        - Clean the data loaded as merged pandas dataframe
    
    Args:
        df (pandas dataframe): Dataframe containing messages and categories
    
    Returns:
        pandas dataframe: Cleaned dataframe
    """
    # create a dataframe of the 36 individual category columns
    categories = df['categories'].str.split( ";", expand = True )
    
    # extract a list of new column names for categories.
    category_colnames = categories.iloc[0].apply( lambda x: x[:-2] ).tolist()
    
    # rename the columns of `categories`
    categories.columns = category_colnames
    
    for column in categories:
    
        # set each value to be the last character of the string
        categories[column] = categories[column].str[-1]
        
        # convert column from string to numeric
        categories[column] = pd.to_numeric(categories[column])
    
    # drop the original categories column from `df`
    df = df.drop( columns = ['categories'], axis = 1 )
    
    # concatenate the original dataframe with the new `categories` dataframe
    df = pd.concat( [df, categories], axis = 1 )
    
    # drop duplicates
    df = df.drop_duplicates()
    
    return df


def save_data(df, database_filename):
    """
    Function to :
        - Save cleaned data as sqlite database
    
    Args:
        df (pandas dataframe): Dataframe containing messages and categories
        database_filename (str): Database name
    
    Returns:
        None
    """
    engine = create_engine('sqlite:///' + database_filename)
    
    database_name = database_filename.replace(".db","")
    df.to_sql(database_name, engine, index = False)


def main():
    
    if len(sys.argv) == 4:

        messages_filepath, categories_filepath, database_filepath = sys.argv[1:]

        print('Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
              .format(messages_filepath, categories_filepath))
        df = load_data(messages_filepath, categories_filepath)

        print('Cleaning data...')
        df = clean_data(df)
        
        print('Saving data...\n    DATABASE: {}'.format(database_filepath))
        save_data(df, database_filepath)
        
        print('Cleaned data saved to database!')
    
    else:
        
        print('Please provide the filepaths of the messages and categories '\
              'datasets as the first and second argument respectively, as '\
              'well as the filepath of the database to save the cleaned data '\
              'to as the third argument. \n\nExample: python process_data.py '\
              'disaster_messages.csv disaster_categories.csv '\
              'DisasterResponse.db')


if __name__ == '__main__':
    main()

Please provide the filepaths of the messages and categories datasets as the first and second argument respectively, as well as the filepath of the database to save the cleaned data to as the third argument. 

Example: python process_data.py disaster_messages.csv disaster_categories.csv DisasterResponse.db


In [5]:
import pandas as pd


In [7]:
data = pd.read_csv('data\\disaster_categories.csv')

In [8]:
data.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


In [10]:
data = pd.read_csv('data\\disaster_messages.csv')
data.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [43]:
import numpy as np
import sqlite3 as sql

In [47]:
database = "DisasterResponse.db"
connection = sql.connect(database)

In [48]:
query = '''SELECT * FROM DisasterResponse'''

In [49]:
df = pd.read_sql_query(query,connection)
df.head()

DatabaseError: Execution failed on sql 'SELECT * FROM DisasterResponse': no such table: DisasterResponse

In [50]:
import re
import sys
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download(['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'])

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer, f1_score,\
                            precision_recall_fscore_support,\
                            classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

import pickle


def load_data(database_filepath):
    """
    Function to :
        - Load data from database
    
    Args:
        database_filepath (str): File path of database
    
    Returns:
        pandas dataframe: Merged dataframe containing messages and categories
    """
    engine = create_engine('sqlite:///' + database_filepath)
    
    indexes = [x.start() for x in re.finditer('/', database_filepath)]
    database_name = database_filepath[indexes[-1] + 1: -3]
    
    df = pd.read_sql_table(database_name, engine)
    X = df["message"]
    Y = df.iloc[:,4:]
    category_names = Y.columns.tolist()
    
    return X, Y, category_names


def tokenize(text):
    """
    Function to :
        - process text data
    
    Args:
        text (str): string of messages
    
    Returns:
        clean_tokens (list): list of tokenized text data
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation characters; prefixed with r to indicate that 
    # it is a regular expression
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # tokenize text / Split text into words using NLTK
    tokens = word_tokenize(text)
    
    tokens = [w for w in tokens if w not in stopwords.words("english")]
    
    # Reduce words to their root form and remove white space
    clean_tokens = [WordNetLemmatizer().lemmatize(w).strip() for w in tokens]
            
    return clean_tokens


class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    
    """ 
    Class for:
        -extracting whether each sentence started with a verb,
         creating a new feature
            
    """
    
    def starting_verb(self, text):

        sentence_list = nltk.sent_tokenize(text) # tokenize by sentences

        for sentence in sentence_list:
            
            # tokenize each sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(tokenize(sentence))
            
            # Check if pos_tags is empty; true if pos_tags is not empty
            if pos_tags:
                
                # index pos_tags to get the first word and part of speech tag
                first_word, first_tag = pos_tags[0]
                
                # return true if the first word is an appropriate verb 
                # or RT for retweet
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return True

        return False

    def fit(self, x, y = None):
        return self

    def transform(self, X):
        
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)
        
        return pd.DataFrame(X_tagged)


def multi_output_fscore(Y_true, Y_pred):
    """
    Function to :
        - be the input of function make_scorer(), thus being the scoring method 
          of the grid search object created by GridSearchCV()
    
    Args:
        Y_true (pandas dataframe): labels
        Y_pred (pandas dataframe): predictions
        average (str): this determines the type of averaging performed on the 
                       data
    
    Returns:
        fscore_list.mean() (float): mean of f1-score
    """
    fscore_list = []
    
    for i in range(0, Y_true.shape[1]):

        f_score = f1_score(y_true = Y_true.iloc[:, i],\
                           y_pred = Y_pred[:, i],\
                           average = 'weighted',\
                           zero_division = 0)
                            
        fscore_list.append(f_score)

    fscore_list = np.array(fscore_list)

    return fscore_list.mean()


def build_model():
    """
    Function to :
        - Build the model with the parameters selected by grid search
    
    Args:
        None
    
    Returns:
        cv (estimator): machine learning model trained with the parameters 
                        selected by grid search
    """    
    # build pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
    
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer = tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
    
            ('starting_verb', StartingVerbExtractor())
        ])),
    
        ('clf', MultiOutputClassifier(AdaBoostClassifier(random_state = 42)))
    ])

    # specify parameters for grid search
    parameters = {
                    'features__text_pipeline__vect__ngram_range': [(1, 2)],\
                    'features__text_pipeline__vect__max_features': [5000],\
                    'clf__estimator__algorithm': ['SAMME.R'],\
                    'clf__estimator__base_estimator': [None],\
                    'clf__estimator__learning_rate': [1],\
                    'clf__estimator__n_estimators': [50, 100]
    }
    
    scorer = make_scorer(multi_output_fscore, greater_is_better = True)

    # create grid search object
    cv = GridSearchCV(estimator = pipeline,\
                      param_grid = parameters,\
                      scoring = scorer,\
                      n_jobs = 1,\
                      refit = True,\
                      cv = 2,\
                      verbose = 4,\
                      error_score = 'raise')
    
    return cv


def evaluate_model(model, X_test, Y_test, category_names, average = 'weighted'):
    """
    Function to :
        - Evaluate model
    
    Args:
        model (estimator): machine learning model
        X_test (pandas series): test data set of X
        Y_test (pandas dataframe): test data set of Y
        category_names (list): name of categories
        average (str): this determines the type of averaging performed 
                       on the data
        
    Returns:
        None
    """
    # Predict test data
    Y_pred = model.predict(X_test)
    
    results = pd.DataFrame(columns = ['Category', 'Precision', 'Recall',\
                                      'F-score'])

    for i in range(len(category_names)):

        category = category_names[i]
        
        precision, recall, f_score, support =\
        precision_recall_fscore_support(Y_test[category],\
                                        Y_pred[:, i],\
                                        average = average,\
                                        zero_division = 0 
        )
        
        results = results.append({'Category': category,\
                                  'Precision': precision,\
                                  'Recall': recall,\
                                  'F-score': f_score},\
                                  ignore_index = True)

    print('Mean Precision:', results['Precision'].mean())
    print('Mean Recall:', results['Recall'].mean())
    print('Mean F_score:', results['F-score'].mean())
    print('\n--------------------Classification Report--------------------\n')
    
    for i in range(len(category_names)):

        category = category_names[i]
        print(category)
        print(classification_report(Y_test[category],\
                                    Y_pred[:, i],\
                                    zero_division = 0))


def save_model(model, model_filepath):
    """
    Function to :
        - Saves model as pickle file
    
    Args:
        model (estimator): machine learning model
        model_filepath (str): path where model will be saved
    
    Returns:
        None
    """
    with open(model_filepath, 'wb') as file:  
        pickle.dump(model, file)


def main():

    if len(sys.argv) == 3:
        
        database_filepath, model_filepath = sys.argv[1:]
        
        print('\n<----- Loading data... ----->\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y,\
                                                            test_size = 0.2)
        
        print('\n<----- Building model... ----->')
        model = build_model()
        
        print('\n<----- Training model... ----->')
        model.fit(X_train, Y_train)
        print('\nBest parameters found by grid search:\n{}'.format(model.best_params_))
        
        print('\n<----- Evaluating model... ----->')
        evaluate_model(model, X_test, Y_test, category_names)

        print('\n<----- Saving model... ----->\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('\n\nTrained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()


<----- Loading data... ----->
    DATABASE: -f


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Win\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Win\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Win\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Win\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


IndexError: list index out of range

In [51]:
import json
import plotly
import pandas as pd
import joblib

from flask import Flask
from flask import render_template, request, jsonify
from plotly.graph_objs import Bar
from sqlalchemy import create_engine

from utils.plotting import return_figure
from utils.custom_scorer import multi_output_fscore
from utils.custom_transformer import tokenize, StartingVerbExtractor


app = Flask(__name__)

# load data
database_name = 'DisasterResponse'
engine = create_engine('sqlite:///../data/{}.db'.format(database_name))
df = pd.read_sql_table('{}'.format(database_name), engine)

# load model
model_name = 'classifier'
model = joblib.load("../models/{}.pkl".format(model_name))


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    
    # create visuals
    graphs = return_figure(df = df)
    
    # encode plotly graphs in JSON
    ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    graphJSON = json.dumps(graphs, cls = plotly.utils.PlotlyJSONEncoder)
    
    # render web page with plotly graphs
    return render_template('master.html', ids = ids, graphJSON = graphJSON)


# web page that handles user query and displays model results
@app.route('/go')
def go():
    # save user input in query
    query = request.args.get('query', '') 

    # use model to predict classification for query
    classification_labels = model.predict([query])[0]
    classification_results = dict(zip(df.columns[4:], classification_labels))

    # This will render the go.html Please see that file. 
    return render_template(
        'go.html',
        query=query,
        classification_result=classification_results
    )


def main():
    app.run(host = '0.0.0.0', port = 3001, debug = True)


if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'utils'

In [37]:
!pip install ipython-sql

