# Data Cleaning (__process_data.py__)

In [366]:
import pandas as pd
from sqlalchemy import create_engine

In [367]:
def load_data(messages_filepath, categories_filepath):
    message_df = pd.read_csv(messages_filepath)
    categories_df = pd.read_csv(categories_filepath)
    merged_df = pd.merge(message_df, categories_df, how='inner', on='id')
    return merged_df


In [368]:
def clean_data(df):
    df['categories'] = df['categories'].str.split(';')
    df['categories'] = df['categories'].apply(lambda x: dict(s.split('-') for s in x))
    df = pd.concat([df, pd.json_normalize(df['categories'])], axis='columns')
    df = df.drop(labels=['categories'], axis='columns')

    # check all columns and for any column with non-binary value, make the value 1
    for col in df.columns.values[4:]:
        print(df[col].value_counts())
    if len(df[col].value_counts()) > 2:
        df.loc[(df[col] != '1') & (df[col] != '0'), [col] ]= '1'

    return df



In [386]:
def save_data(df, database_filepath):
    engine = create_engine(f'sqlite:///{database_filepath}', echo=True)
    conn = engine.connect()
    table_name = 'DisasterMessage'
    df.to_sql(table_name, conn, if_exists='replace')
    conn.close()
    return True

In [387]:
messages_filepath = 'data/disaster_messages.csv'
categories_filepath = 'data/disaster_categories.csv'
database_filepath = 'data/DisasterResponse.db'
print('Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
      .format(messages_filepath, categories_filepath))
df = load_data(messages_filepath, categories_filepath)

print('Cleaning data...')
df = clean_data(df)
for col in df.columns.values:
    df[col].value_counts()
print('Saving data...\n    DATABASE: {}'.format(database_filepath))
save_data(df, database_filepath)



Loading data...
    MESSAGES: data/disaster_messages.csv
    CATEGORIES: data/disaster_categories.csv
Cleaning data...
Saving data...
    DATABASE: data/DisasterResponse.db
2023-08-06 10:52:44,463 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-06 10:52:44,470 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("DisasterMessage")
2023-08-06 10:52:44,470 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-06 10:52:44,471 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("DisasterMessage")
2023-08-06 10:52:44,471 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-06 10:52:44,471 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite~_%' ESCAPE '~' ORDER BY name
2023-08-06 10:52:44,472 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-06 10:52:44,472 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_temp_master WHERE type='table' AND name NOT LIKE 'sqlite~_%' ESCAPE '~' ORDER BY name
2023-08-06 10:52:44,472 INFO sqlalchemy.

True

related
1    20042
0     6140
2      204
Name: count, dtype: int64
      related
117         2
221         2
307         2
462         2
578         2
...       ...
20465       2
20636       2
22481       2
23537       2
25385       2

[204 rows x 1 columns]
Empty DataFrame
Columns: [related]
Index: []
request
0    21873
1     4513
Name: count, dtype: int64
offer
0    26265
1      121
Name: count, dtype: int64
aid_related
0    15432
1    10954
Name: count, dtype: int64
medical_help
0    24287
1     2099
Name: count, dtype: int64
medical_products
0    25067
1     1319
Name: count, dtype: int64
search_and_rescue
0    25661
1      725
Name: count, dtype: int64
security
0    25915
1      471
Name: count, dtype: int64
military
0    25523
1      863
Name: count, dtype: int64
child_alone
0    26386
Name: count, dtype: int64
water
0    24702
1     1684
Name: count, dtype: int64
food
0    23430
1     2956
Name: count, dtype: int64
shelter
0    24044
1     2342
Name: count, dtype: int64
clothing

# Machine Learning Pipeline (train_classifier.py)

In [344]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mohammedghawanni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [273]:
def load_data(database_filepath):
    """Loads the data from the specified database filepath.

    Args:
        database_filepath (str): The filepath of the database file.

    Returns:
        (list, list, list): The messages, categories, and category names.
    """

    conn = create_engine(f'sqlite:///{database_filepath}').connect()
    df = pd.read_sql_table('DisasterMessage', conn)
    df = df.drop(labels=['index', 'id'], axis='columns')
    messages = df.message.values
    categories = df.iloc[:, 3:].values
    category_names = df.iloc[:, 3:].columns
    conn.close()
    return messages, categories, category_names


In [274]:
def tokenize(text):
    """Tokenizes a text string and returns a list of clean tokens.

    Args:
        text (str): The text string to tokenize.

    Returns:
        list: A list of clean tokens.
    """

    # get word tokens
    tokens = word_tokenize(text)

    # Lemmatize every word (token) and remove whitespace and convert to lowercase
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

In [335]:
def build_model():
    """Builds a machine learning model for multi-label classification.

    Returns:
        GridSearchCV: A grid search object that can be used to fit the model.
    """

    pipeline = Pipeline(
        [('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ("vect", CountVectorizer(tokenizer=tokenize)),
                ("tfidf", TfidfTransformer())
            ]))
        ])),
         ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=2)))
         ]
    )

    # Commented out some params to allow the code to run faster
    parameters = {
        # 'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        # 'clf__n_estimators': [50, 100, 200],
        'clf__estimator__n_estimators': [50],
        # 'clf__min_samples_split': [2, 3, 4],
        # 'clf__max_depth': [5, 10, 20]
        # 'clf__max_depth': [5, 10]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv

In [349]:
def evaluate_model(model, X_test, Y_test, category_names):
    """Evaluates a machine learning model on a test set.

    Args:
        model (sklearn.model): The machine learning model to evaluate.
        X_test (numpy.ndarray): The test data.
        Y_test (numpy.ndarray): The ground truth labels for the test data.
        category_names (list): The names of the 36 categories.

    Returns:
        None.
    """

    # predict messages category
    y_pred = model.predict(X_test)
    Y_pred_df = pd.DataFrame(y_pred, columns=category_names)
    # transform Y_test to df to loop over it
    Y_test = pd.DataFrame().from_records(Y_test)

    # loop over all categories and print classification_report for each category
    for i in range(len(category_names)):
        print('Category: {}'.format(category_names[i].upper()), "\n\n",
              classification_report(Y_test.iloc[:, i], Y_pred_df.iloc[:, i]))

    print("Best parameters: ", model.best_params_)
    return True

In [337]:
def save_model(model, model_filepath):
    """Saves a machine learning model to a file.

    Args:
        model (sklearn.model): The machine learning model to save.
        model_filepath (str): The filepath to the file where the model will be saved.

    Returns:
        None.
    """

    pickle.dump(model, open(model_filepath, 'wb'))

In [338]:
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/classifier.pkl'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

Loading data...
    DATABASE: data/DisasterResponse.db
Building model...
Training model...




Trained model saved!


In [350]:
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)


Evaluating model...
Category: RELATED 

               precision    recall  f1-score   support

           0       0.72      0.29      0.41      1188
           1       0.82      0.97      0.89      4043
           2       0.71      0.21      0.33        47

    accuracy                           0.81      5278
   macro avg       0.75      0.49      0.54      5278
weighted avg       0.79      0.81      0.77      5278

Category: REQUEST 

               precision    recall  f1-score   support

           0       0.90      0.99      0.95      4431
           1       0.90      0.45      0.60       847

    accuracy                           0.90      5278
   macro avg       0.90      0.72      0.77      5278
weighted avg       0.90      0.90      0.89      5278

Category: OFFER 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       0.00      0.00      0.00        24

    accuracy                           1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: AID_RELATED 

               precision    recall  f1-score   support

           0       0.76      0.89      0.82      3067
           1       0.80      0.62      0.70      2211

    accuracy                           0.77      5278
   macro avg       0.78      0.75      0.76      5278
weighted avg       0.78      0.77      0.77      5278

Category: MEDICAL_HELP 

               precision    recall  f1-score   support

           0       0.93      1.00      0.96      4875
           1       0.67      0.07      0.13       403

    accuracy                           0.93      5278
   macro avg       0.80      0.54      0.55      5278
weighted avg       0.91      0.93      0.90      5278

Category: MEDICAL_PRODUCTS 

               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4997
           1       0.90      0.06      0.12       281

    accuracy                           0.95      5278
   macro avg       0.92      0.53      0.55    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: WATER 

               precision    recall  f1-score   support

           0       0.95      1.00      0.98      4924
           1       0.92      0.32      0.48       354

    accuracy                           0.95      5278
   macro avg       0.94      0.66      0.73      5278
weighted avg       0.95      0.95      0.94      5278

Category: FOOD 

               precision    recall  f1-score   support

           0       0.92      0.99      0.96      4663
           1       0.89      0.37      0.52       615

    accuracy                           0.92      5278
   macro avg       0.90      0.68      0.74      5278
weighted avg       0.92      0.92      0.91      5278

Category: SHELTER 

               precision    recall  f1-score   support

           0       0.93      1.00      0.96      4805
           1       0.85      0.26      0.40       473

    accuracy                           0.93      5278
   macro avg       0.89      0.63      0.68      5278
weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: ELECTRICITY 

               precision    recall  f1-score   support

           0       0.98      1.00      0.99      5171
           1       0.90      0.08      0.15       107

    accuracy                           0.98      5278
   macro avg       0.94      0.54      0.57      5278
weighted avg       0.98      0.98      0.97      5278

Category: TOOLS 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       0.00      0.00      0.00        24

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      5278
weighted avg       0.99      1.00      0.99      5278

Category: HOSPITALS 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5216
           1       0.00      0.00      0.00        62

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighte

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: SHOPS 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5260
           1       0.00      0.00      0.00        18

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      5278
weighted avg       0.99      1.00      0.99      5278

Category: AID_CENTERS 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5218
           1       0.00      0.00      0.00        60

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighted avg       0.98      0.99      0.98      5278

Category: OTHER_INFRASTRUCTURE 

               precision    recall  f1-score   support

           0       0.96      1.00      0.98      5046
           1       0.00      0.00      0.00       232

    accuracy                           0.96      5278
   macro avg       0.48      0.50      0.49      5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: WEATHER_RELATED 

               precision    recall  f1-score   support

           0       0.86      0.97      0.91      3782
           1       0.88      0.60      0.71      1496

    accuracy                           0.86      5278
   macro avg       0.87      0.78      0.81      5278
weighted avg       0.86      0.86      0.85      5278

Category: FLOODS 

               precision    recall  f1-score   support

           0       0.94      1.00      0.97      4834
           1       0.95      0.32      0.48       444

    accuracy                           0.94      5278
   macro avg       0.95      0.66      0.73      5278
weighted avg       0.94      0.94      0.93      5278

Category: STORM 

               precision    recall  f1-score   support

           0       0.94      0.99      0.97      4782
           1       0.82      0.41      0.55       496

    accuracy                           0.94      5278
   macro avg       0.88      0.70      0.76      5278
weight

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: FIRE 

               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5219
           1       0.00      0.00      0.00        59

    accuracy                           0.99      5278
   macro avg       0.49      0.50      0.50      5278
weighted avg       0.98      0.99      0.98      5278

Category: EARTHQUAKE 

               precision    recall  f1-score   support

           0       0.97      0.99      0.98      4750
           1       0.89      0.71      0.79       528

    accuracy                           0.96      5278
   macro avg       0.93      0.85      0.88      5278
weighted avg       0.96      0.96      0.96      5278

Category: COLD 

               precision    recall  f1-score   support

           0       0.98      1.00      0.99      5172
           1       0.67      0.04      0.07       106

    accuracy                           0.98      5278
   macro avg       0.82      0.52      0.53      5278
weighted avg  

True

In [348]:
# df.drop(labels=['id']).iloc[:,1:3]
Y_test
df

array([['1', '0', '0', ..., '0', '0', '0'],
       ['1', '1', '0', ..., '0', '0', '1'],
       ['1', '1', '0', ..., '0', '0', '0'],
       ...,
       ['1', '0', '0', ..., '0', '0', '0'],
       ['1', '1', '0', ..., '0', '0', '0'],
       ['1', '0', '0', ..., '0', '0', '0']], dtype=object)

In [351]:
df

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26381,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26382,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26383,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26384,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [361]:
for col in df.columns.values[4:]:
    print('CATEGORY - ', col, ': ',  len(df.loc[df[col] == '1'])/len(df)*100)


CATEGORY -  related :  75.95694686576215
CATEGORY -  request :  17.10376714924581
CATEGORY -  offer :  0.45857651785037523
CATEGORY -  aid_related :  41.51443947547942
CATEGORY -  medical_help :  7.954976123701964
CATEGORY -  medical_products :  4.998863033426818
CATEGORY -  search_and_rescue :  2.747669218524975
CATEGORY -  security :  1.7850375198969153
CATEGORY -  military :  3.27067384218904
CATEGORY -  child_alone :  0.0
CATEGORY -  water :  6.382172364132495
CATEGORY -  food :  11.202910634427347
CATEGORY -  shelter :  8.87591904797999
CATEGORY -  clothing :  1.5538543166830894
CATEGORY -  money :  2.296672477829152
CATEGORY -  missing_people :  1.140756461760024
CATEGORY -  refugees :  3.3351019480027286
CATEGORY -  death :  4.563025847040096
CATEGORY -  other_aid :  13.128174031683468
CATEGORY -  infrastructure_related :  6.47691957856439
CATEGORY -  transport :  4.593344955658304
CATEGORY -  buildings :  5.097400136435989
CATEGORY -  electricity :  2.0389600545743956
CATEGORY 

In [365]:
classification_labels = model.predict(['help food is needed'])[0]
classification_results = dict(zip(df.columns[4:], classification_labels))
print(classification_results)


{'related': '1', 'request': '1', 'offer': '0', 'aid_related': '1', 'medical_help': '0', 'medical_products': '0', 'search_and_rescue': '0', 'security': '0', 'military': '0', 'child_alone': '0', 'water': '0', 'food': '1', 'shelter': '0', 'clothing': '0', 'money': '0', 'missing_people': '0', 'refugees': '0', 'death': '0', 'other_aid': '0', 'infrastructure_related': '0', 'transport': '0', 'buildings': '0', 'electricity': '0', 'tools': '0', 'hospitals': '0', 'shops': '0', 'aid_centers': '0', 'other_infrastructure': '0', 'weather_related': '0', 'floods': '0', 'storm': '0', 'fire': '0', 'earthquake': '0', 'cold': '0', 'other_weather': '0', 'direct_report': '0'}
