### ETL Pipeline Preparation

In [1]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine # to save the clean dataset into an sqlite database


In [2]:
# 1. load datasets from csv files.

# load messages dataset
messages = pd.read_csv(r"messages.csv") 
# messages.head()


# load categories dataset
categories = pd.read_csv(r"categories.csv")
# categories.head()



# 2. Merge datasets

# merge datasets messages and categories datasets using the common id
df = pd.merge(messages, categories, left_on='id', right_on='id', how='inner') 
# df.head()



# 3. Split categories into separate category columns

# create a dataframe of the 36 individual category columns 
# split the 'categories' column into separate columns

categories = df["categories"].str.split(';', expand=True)
# categories.head()

 
## select the first row of the categories dataframe
#row = categories[0:1]
#
## use this row to extract a list of new column names for categories.
## one way is to apply a lambda function that takes everything 
## up to the second to last character of each string with slicing
#category_col = row.apply(lambda x: x.str[:-2]).values.tolist()
#print(category_col)
#

# Better Alternative: 
# Extract new column names directly from the first row
categories.columns = categories.iloc[0].str[:-2].values

# Drop the first row since it was used for column names
categories = categories[1:]

# Display the first few rows of the categories DataFrame
# categories.head()



## rename the columns of `categories`
#categories.columns = category_col
#categories.head()
#



# 4. Convert category values to just numbers 0 or 1.

for column in categories:
    # set each value to be the last character of the string
    categories[column] = categories[column].str[-1]
    
    # convert column from string to numeric
    categories[column] = pd.to_numeric(categories[column])
# categories.head()

## Better Alternative
## Optimized extraction of the last character and conversion to numeric
#categories = categories.apply(lambda x: pd.to_numeric(x.str[-1]))
#
## Display the first few rows of the updated DataFrame
#categories.head()



# 5. Replace categories column in df with new category columns.

# drop the original categories column from `df`

df.drop(['categories'], axis=1, inplace = True)
#df.head()



# concatenate the original dataframe with the new `categories` dataframe
df = pd.concat([df, categories], axis=1)
# df.head()



# 6. Remove duplicates.

# check number of duplicates
# df.duplicated().sum()

# drop duplicates
df = df.drop_duplicates()

# check number of duplicates
#df.duplicated().sum()



# 7. Save the clean dataset into an sqlite database.

engine = create_engine('sqlite:///DisasterResponseProject.db')
df.to_sql('DisasterResponses', engine, index=False, if_exists='replace')


### Machine Learning Pipeline Preparation


In [3]:
# import libraries

# 1) read SQL
import pandas as pd
from sqlalchemy import create_engine

# 2) tokenization function 
import nltk
import re
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#3) machine learning pipeline
from sklearn.pipeline import Pipeline  # For creating the pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # For text processing
from sklearn.multioutput import MultiOutputClassifier  # For multi-output classification
from sklearn.ensemble import RandomForestClassifier  # For the Random Forest classifier

#4) training of pipeline
from sklearn.model_selection import train_test_split

#5 test training model
from sklearn.metrics import classification_report
import numpy as np

#8 Improve model with Grid Search
from sklearn.model_selection import GridSearchCV        #for using GridSearchCV

#9 Export your model as a pickle file
import pickle     # for ML-model export s a pickle file


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# 1. load data from database.

# IMPORTANT: Notebook from 5.5 ETL has to be run bevore!
# load data from database

engine = create_engine('sqlite:///DisasterResponseProject.db')
df = pd.read_sql('SELECT * FROM DisasterResponses', engine)
X = df['message']
y = df.iloc[:,4:]


In [5]:
df.head() #check df structure.

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,,,,,,,...,,,,,,,,,,
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Check text in column 'message'
list(df['message'][:20])


['Weather update - a cold front from Cuba that could pass over Haiti',
 'Is the Hurricane over or is it not over',
 'Looking for someone but no name',
 'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
 'says: west side of Haiti, rest of the country today and tonight',
 'Information about the National Palace-',
 'Storm at sacred heart of jesus',
 'Please, we need tents and water. We are in Silo, Thank you!',
 'I would like to receive the messages, thank you',
 'I am in Croix-des-Bouquets. We have health issues. They ( workers ) are in Santo 15. ( an area in Croix-des-Bouquets )',
 "There's nothing to eat and water, we starving and thirsty.",
 'I am in Petionville. I need more information regarding 4636',
 'I am in Thomassin number 32, in the area named Pyron. I would like to have some water. Thank God we are fine, but we desperately need water. Thanks',
 "Let's do it together, need food in Delma 75, in didine area",
 'More informati

In [7]:
# 2. Write a tokenization function to process your text data


# Check text in column 'message'
list(df['message'][:20])

def tokenize(text):
    # Define a regex pattern to detect URLs
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # Replace URLs with a placeholder
    text = re.sub(url_regex, "urlplaceholder", text)
    
    # Normalize and tokenize text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    tokens = [w for w in tokens if w not in stopwords.words("english") and w.isalpha()]
    
    # Initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Iterate through each token
    clean_tokens = []
    for tok in tokens:
        # Lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


In [8]:
# Check text tokens in column 'message'
for message in X[:20]:
    tokens = tokenize(message)
    print(tokens,'\n')
 

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti'] 

['hurricane'] 

['looking', 'someone', 'name'] 

['un', 'report', 'leogane', 'destroyed', 'hospital', 'croix', 'functioning', 'need', 'supply', 'desperately'] 

['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 

['information', 'national'] 

['storm', 'sacred', 'heart', 'jesus'] 

['please', 'need', 'tent', 'water', 'silo', 'thank'] 

['would', 'like', 'receive', 'message', 'thank'] 

['health', 'issue', 'worker', 'santo', 'area'] 

['nothing', 'eat', 'water', 'starving', 'thirsty'] 

['petionville', 'need', 'information', 'regarding'] 

['thomassin', 'number', 'area', 'named', 'pyron', 'would', 'like', 'water', 'thank', 'god', 'fine', 'desperately', 'need', 'water', 'thanks'] 

['let', 'together', 'need', 'food', 'delma', 'didine', 'area'] 

['information', 'number', 'order', 'participate', 'see', 'use'] 

['comitee', 'delmas', 'rue', 'street', 'janvier', 'impasse', 'charite', 'people', 't

In [9]:
# Replace all NaN values with 0

df.fillna(0, inplace=True)

# split the dataset

X = df.message
y = df.iloc[:,4:]
category_names = y.columns


In [10]:
#3. Build a machine learning pipeline

machine_learning_pipeline = Pipeline([
    ('cvect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])


In [11]:
# 4. Train pipeline

## train test split
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=55) # Split data into train and test sets
#machine_learning_pipeline.fit(X_train, y_train) # train test split

X_train, X_test, y_train, y_test = train_test_split(X, y)
machine_learning_pipeline.fit(X_train, y_train)

#y_pred = machine_learning_pipeline.predict(X_test)
#y_pred[55].shape


Pipeline(memory=None,
     steps=[('cvect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

In [12]:
# 5. Test your model

y_pred = machine_learning_pipeline.predict(X_test)

# Test shape of y_test and y_pred
print("Shape of y_test:", y_test.values.shape)
print("Shape of y_pred:", y_pred.shape)

# Test unique values in y_test and y_pred
print("Unique values in y_test:", np.unique(y_test))
print("Unique values in y_pred:", np.unique(y_pred))


Shape of y_test: (6554, 36)
Shape of y_pred: (6554, 36)
Unique values in y_test: [ 0.  1.  2.]
Unique values in y_pred: [ 0.  1.  2.]


In [13]:
# Generate predictions y_pred and print out classification_report
y_pred = machine_learning_pipeline.predict(X_test)

# Iterate through each column and print the classification report
for i in range(y_test.shape[1]):  # Use the number of columns in y_test
    print(i,")","#########################", y_test.columns[i], "#########################")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))
    

0 ) ######################### related #########################
             precision    recall  f1-score   support

        0.0       0.64      0.47      0.54      1552
        1.0       0.85      0.91      0.88      4944
        2.0       0.27      0.64      0.38        58

avg / total       0.80      0.80      0.79      6554

1 ) ######################### request #########################
             precision    recall  f1-score   support

        0.0       0.90      0.97      0.94      5502
        1.0       0.76      0.46      0.57      1052

avg / total       0.88      0.89      0.88      6554

2 ) ######################### offer #########################
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      6528
        1.0       0.00      0.00      0.00        26

avg / total       0.99      1.00      0.99      6554

3 ) ######################### aid_related #########################
             precision    recall  f1-score  

  'precision', 'predicted', average, warn_for)


In [14]:
#Alternative:
#
## Assuming y_test is a DataFrame and y_pred is a NumPy array
#y_test_array = y_test.values  # Convert y_test to a NumPy array if it's a DataFrame
#
## Iterate through each class and print the classification report
#for i in range(y_test_array.shape[1]):  # Use the number of columns in y_test
#    #print("=======================", y_test.columns[i], "======================")
#    print(classification_report(y_test_array[:, i], y_pred[:, i], target_names=[y_test.columns[i]]))


In [15]:
# 6. Improve your model with Grid Search to find better parameters

machine_learning_pipeline.get_params() # Displays the parameters of machine_learning_pipeline


{'memory': None,
 'steps': [('cvect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x7d6974b20e18>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=None

In [16]:
## specify parameters for grid search
#   
#parameters = {
#    'clf__n_estimators': [100, 200],
#    'clf__min_samples_split': [2, 3],
#}

# create grid search object
#cv = GridSearchCV(machine_learning_pipeline, param_grid=parameters)


In [17]:
# Specify parameters for grid search
parameters = {
    'clf__estimator__n_estimators': [100],           # used for shorter run time  
#   'clf__estimator__n_estimators': [100, 200],      # better but runs longer     
    'clf__estimator__min_samples_split': [2, 3],
}

# Create grid search object
#cv = GridSearchCV(machine_learning_pipeline, param_grid=parameters)  #with warnings

cv = GridSearchCV(machine_learning_pipeline, param_grid=parameters, return_train_score=True, verbose=2)

cv.fit(X_train, y_train)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, total= 4.7min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, total= 4.6min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, total= 4.7min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, total= 4.3min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, total= 4.2min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, total= 4.2min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 33.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__estimator__n_estimators': [100], 'clf__estimator__min_samples_split': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [18]:
cv.cv_results_


{'mean_fit_time': array([ 242.84114917,  216.28636003]),
 'std_fit_time': array([ 4.04964579,  2.66832469]),
 'mean_score_time': array([ 36.30760852,  35.75699576]),
 'std_score_time': array([ 0.46307743,  0.27322664]),
 'param_clf__estimator__min_samples_split': masked_array(data = [2 3],
              mask = [False False],
        fill_value = ?),
 'param_clf__estimator__n_estimators': masked_array(data = [100 100],
              mask = [False False],
        fill_value = ?),
 'params': [{'clf__estimator__min_samples_split': 2,
   'clf__estimator__n_estimators': 100},
  {'clf__estimator__min_samples_split': 3,
   'clf__estimator__n_estimators': 100}],
 'split0_test_score': array([ 0.26258773,  0.26274031]),
 'split1_test_score': array([ 0.26289289,  0.26197742]),
 'split2_test_score': array([ 0.26075679,  0.25740006]),
 'mean_test_score': array([ 0.26207914,  0.26070593]),
 'std_test_score': array([ 0.0009433 ,  0.00235826]),
 'rank_test_score': array([1, 2], dtype=int32),
 'split0_t

In [19]:
# Identification of the best parameters from GritSearch analysis

print(cv.best_params_)


{'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 100}


In [20]:
#building new model

machine_learning_pipeline_optimized = cv.best_estimator_
print (cv.best_estimator_)

Pipeline(memory=None,
     steps=[('cvect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])


In [21]:
# 7. test of the optimized machine learning model

# Generate predictions y_pred and print out classification_report
y_pred = machine_learning_pipeline_optimized.predict(X_test)

# Iterate through each column and print the classification report
for i in range(y_test.shape[1]):  # Use the number of columns in y_test
    print(i,")","#########################", y_test.columns[i], "#########################")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))

0 ) ######################### related #########################
             precision    recall  f1-score   support

        0.0       0.71      0.42      0.53      1552
        1.0       0.84      0.94      0.89      4944
        2.0       0.26      0.57      0.36        58

avg / total       0.80      0.81      0.80      6554

1 ) ######################### request #########################
             precision    recall  f1-score   support

        0.0       0.91      0.98      0.94      5502
        1.0       0.81      0.48      0.61      1052

avg / total       0.89      0.90      0.89      6554

2 ) ######################### offer #########################
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      6528
        1.0       0.00      0.00      0.00        26

avg / total       0.99      1.00      0.99      6554

3 ) ######################### aid_related #########################
             precision    recall  f1-score  

  'precision', 'predicted', average, warn_for)


In [23]:
# 9. Export your model as a pickle file
pickle.dump(machine_learning_pipeline_optimized, open('model.pkl', 'wb'))