# Data Cleaning (__process_data.py__)

In [78]:
import pandas as pd
from sqlalchemy import create_engine

In [73]:
def load_data(messages_filepath, categories_filepath):
    message_df = pd.read_csv(messages_filepath)
    categories_df = pd.read_csv(categories_filepath)
    merged_df = pd.merge(message_df, categories_df, how='inner', on='id')
    return merged_df


In [74]:
def clean_data(df):
    df['categories'] = df['categories'].str.split(';')
    df['categories'] = df['categories'].apply(lambda x: dict(s.split('-') for s in x))
    df = pd.concat([df, pd.json_normalize(df['categories'])], axis='columns')
    df = df.drop(labels=['categories'], axis='columns')
    return df



In [75]:
def save_data(df, database_filepath):
    engine = create_engine(f'sqlite:///{database_filepath}', echo=True)
    conn = engine.connect()
    table_name = 'DisasterMessage'
    df.to_sql(table_name, conn, if_exists='fail')
    conn.close()
    return True

In [79]:
messages_filepath = 'data/disaster_messages.csv'
categories_filepath = 'data/disaster_categories.csv'
database_filepath = 'data/DisasterResponse.db'
print('Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
      .format(messages_filepath, categories_filepath))
df = load_data(messages_filepath, categories_filepath)

print('Cleaning data...')
df = clean_data(df)

print('Saving data...\n    DATABASE: {}'.format(database_filepath))
save_data(df, database_filepath)



Loading data...
    MESSAGES: data/disaster_messages.csv
    CATEGORIES: data/disaster_categories.csv
Cleaning data...
Saving data...
    DATABASE: data/DisasterResponse.db
2023-08-03 10:16:22,634 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-03 10:16:22,641 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("DisasterMessage")
2023-08-03 10:16:22,642 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-03 10:16:22,642 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("DisasterMessage")
2023-08-03 10:16:22,642 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-03 10:16:22,644 INFO sqlalchemy.engine.Engine 
CREATE TABLE "DisasterMessage" (
	"index" BIGINT, 
	id BIGINT, 
	message TEXT, 
	original TEXT, 
	genre TEXT, 
	related TEXT, 
	request TEXT, 
	offer TEXT, 
	aid_related TEXT, 
	medical_help TEXT, 
	medical_products TEXT, 
	search_and_rescue TEXT, 
	security TEXT, 
	military TEXT, 
	child_alone TEXT, 
	water TEXT, 
	food TEXT, 
	shelter TEXT, 
	clothing TEXT, 
	money TEXT,

True

# Machine Learning Pipeline (train_classifier.py)

In [344]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [273]:
def load_data(database_filepath):
    """Loads the data from the specified database filepath.

    Args:
        database_filepath (str): The filepath of the database file.

    Returns:
        (list, list, list): The messages, categories, and category names.
    """

    conn = create_engine(f'sqlite:///{database_filepath}').connect()
    df = pd.read_sql_table('DisasterMessage', conn)
    df = df.drop(labels=['index', 'id'], axis='columns')
    messages = df.message.values
    categories = df.iloc[:, 3:].values
    category_names = df.iloc[:, 3:].columns
    conn.close()
    return messages, categories, category_names


In [274]:
def tokenize(text):
    """Tokenizes a text string and returns a list of clean tokens.

    Args:
        text (str): The text string to tokenize.

    Returns:
        list: A list of clean tokens.
    """

    # get word tokens
    tokens = word_tokenize(text)

    # Lemmatize every word (token) and remove whitespace and convert to lowercase
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

In [335]:
def build_model():
    """Builds a machine learning model for multi-label classification.

    Returns:
        GridSearchCV: A grid search object that can be used to fit the model.
    """

    pipeline = Pipeline(
        [('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ("vect", CountVectorizer(tokenizer=tokenize)),
                ("tfidf", TfidfTransformer())
            ]))
        ])),
         ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=2)))
         ]
    )

    # Commented out some params to allow the code to run faster
    parameters = {
        # 'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        # 'clf__n_estimators': [50, 100, 200],
        'clf__estimator__n_estimators': [50],
        # 'clf__min_samples_split': [2, 3, 4],
        # 'clf__max_depth': [5, 10, 20]
        # 'clf__max_depth': [5, 10]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv

In [349]:
def evaluate_model(model, X_test, Y_test, category_names):
    """Evaluates a machine learning model on a test set.

    Args:
        model (sklearn.model): The machine learning model to evaluate.
        X_test (numpy.ndarray): The test data.
        Y_test (numpy.ndarray): The ground truth labels for the test data.
        category_names (list): The names of the 36 categories.

    Returns:
        None.
    """

    # predict messages category
    y_pred = model.predict(X_test)
    Y_pred_df = pd.DataFrame(y_pred, columns=category_names)
    # transform Y_test to df to loop over it
    Y_test = pd.DataFrame().from_records(Y_test)

    # loop over all categories and print classification_report for each category
    for i in range(len(category_names)):
        print('Category: {}'.format(category_names[i].upper()), "\n\n",
              classification_report(Y_test.iloc[:, i], Y_pred_df.iloc[:, i]))

    print("Best parameters: ", model.best_params_)
    return True

In [337]:
def save_model(model, model_filepath):
    """Saves a machine learning model to a file.

    Args:
        model (sklearn.model): The machine learning model to save.
        model_filepath (str): The filepath to the file where the model will be saved.

    Returns:
        None.
    """

    pickle.dump(model, open(model_filepath, 'wb'))

In [338]:
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/classifier.pkl'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

Loading data...
    DATABASE: data/DisasterResponse.db
Building model...
Training model...




Trained model saved!


In [350]:
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)


Evaluating model...
Category: RELATED 

               precision    recall  f1-score   support

           0       0.72      0.29      0.41      1188
           1       0.82      0.97      0.89      4043
           2       0.71      0.21      0.33        47

    accuracy                           0.81      5278
   macro avg       0.75      0.49      0.54      5278
weighted avg       0.79      0.81      0.77      5278

Category: REQUEST 

               precision    recall  f1-score   support

           0       0.90      0.99      0.95      4431
           1       0.90      0.45      0.60       847

    accuracy                           0.90      5278
   macro avg       0.90      0.72      0.77      5278
weighted avg       0.90      0.90      0.89      5278

Category: OFFER 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       0.00      0.00      0.00        24

    accuracy                           1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: AID_RELATED 

               precision    recall  f1-score   support

           0       0.76      0.89      0.82      3067
           1       0.80      0.62      0.70      2211

    accuracy                           0.77      5278
   macro avg       0.78      0.75      0.76      5278
weighted avg       0.78      0.77      0.77      5278

Category: MEDICAL_HELP 

               precision    recall  f1-score   support

           0       0.93      1.00      0.96      4875
           1       0.67      0.07      0.13       403

    accuracy                           0.93      5278
   macro avg       0.80      0.54      0.55      5278
weighted avg       0.91      0.93      0.90      5278

Category: MEDICAL_PRODUCTS 

               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4997
           1       0.90      0.06      0.12       281

    accuracy                           0.95      5278
   macro avg       0.92      0.53      0.55    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: WATER 

               precision    recall  f1-score   support

           0       0.95      1.00      0.98      4924
           1       0.92      0.32      0.48       354

    accuracy                           0.95      5278
   macro avg       0.94      0.66      0.73      5278
weighted avg       0.95      0.95      0.94      5278

Category: FOOD 

               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4663
           1       0.89      0.37      0.52       615

    accuracy                           0.92      5278
   macro avg       0.90      0.68      0.74      5278
weighted avg       0.92      0.92      0.91      5278

Category: SHELTER 

               precision    recall  f1-score   support

           0       0.93      1.00      0.96      4805
           1       0.85      0.26      0.40       473

    accuracy                           0.93      5278
   macro avg       0.89      0.63      0.68      5278
weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: ELECTRICITY 

               precision    recall  f1-score   support

           0       0.98      1.00      0.99      5171
           1       0.90      0.08      0.15       107

    accuracy                           0.98      5278
   macro avg       0.94      0.54      0.57      5278
weighted avg       0.98      0.98      0.97      5278

Category: TOOLS 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       0.00      0.00      0.00        24

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      5278
weighted avg       0.99      1.00      0.99      5278

Category: HOSPITALS 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5216
           1       0.00      0.00      0.00        62

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighte

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: SHOPS 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5260
           1       0.00      0.00      0.00        18

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      5278
weighted avg       0.99      1.00      0.99      5278

Category: AID_CENTERS 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5218
           1       0.00      0.00      0.00        60

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighted avg       0.98      0.99      0.98      5278

Category: OTHER_INFRASTRUCTURE 

               precision    recall  f1-score   support

           0       0.96      1.00      0.98      5046
           1       0.00      0.00      0.00       232

    accuracy                           0.96      5278
   macro avg       0.48      0.50      0.49      5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: WEATHER_RELATED 

               precision    recall  f1-score   support

           0       0.86      0.97      0.91      3782
           1       0.88      0.60      0.71      1496

    accuracy                           0.86      5278
   macro avg       0.87      0.78      0.81      5278
weighted avg       0.86      0.86      0.85      5278

Category: FLOODS 

               precision    recall  f1-score   support

           0       0.94      1.00      0.97      4834
           1       0.95      0.32      0.48       444

    accuracy                           0.94      5278
   macro avg       0.95      0.66      0.73      5278
weighted avg       0.94      0.94      0.93      5278

Category: STORM 

               precision    recall  f1-score   support

           0       0.94      0.99      0.97      4782
           1       0.82      0.41      0.55       496

    accuracy                           0.94      5278
   macro avg       0.88      0.70      0.76      5278
weight

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: FIRE 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5219
           1       0.00      0.00      0.00        59

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighted avg       0.98      0.99      0.98      5278

Category: EARTHQUAKE 

               precision    recall  f1-score   support

           0       0.97      0.99      0.98      4750
           1       0.89      0.71      0.79       528

    accuracy                           0.96      5278
   macro avg       0.93      0.85      0.88      5278
weighted avg       0.96      0.96      0.96      5278

Category: COLD 

               precision    recall  f1-score   support

           0       0.98      1.00      0.99      5172
           1       0.67      0.04      0.07       106

    accuracy                           0.98      5278
   macro avg       0.82      0.52      0.53      5278
weighted avg  

True

In [348]:
# df.drop(labels=['id']).iloc[:,1:3]
Y_test
df

array([['1', '0', '0', ..., '0', '0', '0'],
       ['1', '1', '0', ..., '0', '0', '1'],
       ['1', '1', '0', ..., '0', '0', '0'],
       ...,
       ['1', '0', '0', ..., '0', '0', '0'],
       ['1', '1', '0', ..., '0', '0', '0'],
       ['1', '0', '0', ..., '0', '0', '0']], dtype=object)