# ML Pipeline Preparation

### 1. Importing libraries and loading data from data base

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime


from sqlalchemy import create_engine

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

### Optional display settings

In [3]:
plt.style.use('seaborn-dark')
pd.set_option("display.max_rows", 1000)

In [4]:
engine = create_engine('sqlite:///Pipeline_Project.db')
df = pd.read_sql_table('Messages', engine)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26028 entries, 0 to 26027
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      26028 non-null  int64  
 1   message                 26028 non-null  object 
 2   related                 26028 non-null  float64
 3   request                 26028 non-null  float64
 4   offer                   26028 non-null  float64
 5   aid_related             26028 non-null  float64
 6   medical_help            26028 non-null  float64
 7   medical_products        26028 non-null  float64
 8   search_and_rescue       26028 non-null  float64
 9   security                26028 non-null  float64
 10  military                26028 non-null  float64
 11  child_alone             26028 non-null  float64
 12  water                   26028 non-null  float64
 13  food                    26028 non-null  float64
 14  shelter                 26028 non-null

In [5]:
X = df.iloc[:, 1]
Y = df.iloc[:, 2:]

### 2. Processing the text data

In [6]:
def tokenize(text):
    #Normalization - lowercase  - no punctuation removal - the nltk.tokenize should interpret them by itself
    text = text.lower()
    ######text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #Toeknization
    ######words = text.split()
    words = word_tokenize(text)
    #Stop words removal
    words = [w for w in words if w not in stopwords.words("english")]
    
    #Named Entities??
    # Perhaps
    
    #Lemmatization and Stemming
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
    stemmed  = [PorterStemmer().stem(w) for w in lemmed]
    
    return stemmed

### 3. Building a machine learning pipeline

In [7]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator = RandomForestClassifier()))
    ])    

### 4. Training the pipeline
##### Test size 0.3 as initial idea

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3)

time_before_training = datetime.datetime.now()
pipeline.fit(X_train, y_train)
time_after_training = datetime.datetime.now()

t_delta = (time_after_training - time_before_training).seconds

### 5. Testing the model

In [9]:
y_pred = pipeline.predict(X_test)

In [10]:
y_pred = pd.DataFrame(y_pred, columns = list(Y.columns))

In [11]:
y_test = y_test.reset_index(drop = True)

In [12]:
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred.iloc[:,i]))

related
              precision    recall  f1-score   support

         0.0       0.74      0.35      0.48      1882
         1.0       0.82      0.96      0.89      5927

    accuracy                           0.81      7809
   macro avg       0.78      0.66      0.68      7809
weighted avg       0.80      0.81      0.79      7809

request
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      6486
         1.0       0.83      0.50      0.63      1323

    accuracy                           0.90      7809
   macro avg       0.87      0.74      0.79      7809
weighted avg       0.89      0.90      0.89      7809

offer
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      7777
         1.0       0.00      0.00      0.00        32

    accuracy                           1.00      7809
   macro avg       0.50      0.50      0.50      7809
weighted avg       0.99      1.00      0.99      780

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97      7152
         1.0       0.84      0.35      0.50       657

    accuracy                           0.94      7809
   macro avg       0.89      0.67      0.73      7809
weighted avg       0.93      0.94      0.93      7809

clothing
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7681
         1.0       0.80      0.03      0.06       128

    accuracy                           0.98      7809
   macro avg       0.89      0.52      0.53      7809
weighted avg       0.98      0.98      0.98      7809

money
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7635
         1.0       0.50      0.01      0.02       174

    accuracy                           0.98      7809
   macro avg       0.74      0.51      0.51      7809
weighted avg       0.97      0.98      0.97      7809

miss

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Providing a data frame in order to ease the model assessment

In [13]:
classification_reports = {}

In [14]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred.iloc[:,i]));

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
def df_from_sklearn_cl_reports(cl_reports):

    data_frame = pd.DataFrame()

    for feature in list(cl_reports.keys()):
        l = cl_reports[feature].split(' ')
        l = [x for x in l if x !='']
        l = [x for x in l if '\n' not in x]
        l = l[:l.index('accuracy')]
        columns = l[:3]
        l = [x for x in l if x not in columns]
        labels = []
        precisions = []
        recalls = []
        f1_scores = []
    
        for i, element in enumerate(l):
            if i == 0 or i % 4 == 0:
                labels.append(element)
                precisions.append(l[i+1])
                recalls.append(l[i+2])
                f1_scores.append(l[i+3])
            
        comunicates = [feature]*len(labels)
    
        if data_frame.shape[0] == 0:      
            data_frame['communicate'] = comunicates
            data_frame['label'] = labels
            data_frame['precisions'] = precisions
            data_frame['recalls'] = recalls
            data_frame['f1_scores'] = f1_scores
        
        else:
            auxiliary_df = pd.DataFrame()
            auxiliary_df['communicate'] = comunicates
            auxiliary_df['label'] = labels
            auxiliary_df['precisions'] = precisions
            auxiliary_df['recalls'] = recalls
            auxiliary_df['f1_scores'] = f1_scores 
        
            data_frame = pd.concat([data_frame, auxiliary_df])
        
            del auxiliary_df
    
    data_frame.set_index(["communicate"], inplace = True)
    
    #Chaning the data types
    data_frame.label = data_frame.label.astype("float")
    data_frame.label = data_frame.label.astype("int")
    data_frame.precisions = data_frame.precisions.astype("float")
    data_frame.recalls = data_frame.recalls.astype("float")
    data_frame.f1_scores = data_frame.f1_scores.astype("float")
    
    return data_frame    
        

    

In [38]:
Random_Forest = df_from_sklearn_cl_reports(classification_reports)

In [39]:
Random_Forest

Unnamed: 0_level_0,label,precisions,recalls,f1_scores
communicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
related,0,0.74,0.35,0.48
related,1,0.82,0.96,0.89
request,0,0.91,0.98,0.94
request,1,0.83,0.5,0.63
offer,0,1.0,1.0,1.0
offer,1,0.0,0.0,0.0
aid_related,0,0.8,0.85,0.82
aid_related,1,0.76,0.69,0.72
medical_help,0,0.93,1.0,0.96
medical_help,1,0.73,0.08,0.14


In [46]:
Random_Forest.loc[Random_Forest.label == 1].groupby(["communicate"])["precisions"].mean().sort_values()

communicate
aid_centers               0.00
other_infrastructure      0.00
offer                     0.00
hospitals                 0.00
shops                     0.00
fire                      0.00
tools                     0.00
infrastructure_related    0.00
security                  0.00
other_weather             0.50
money                     0.50
military                  0.60
search_and_rescue         0.67
transport                 0.70
medical_help              0.73
refugees                  0.73
other_aid                 0.73
buildings                 0.76
aid_related               0.76
storm                     0.78
cold                      0.78
direct_report             0.79
clothing                  0.80
related                   0.82
request                   0.83
electricity               0.83
medical_products          0.83
shelter                   0.84
weather_related           0.85
food                      0.85
water                     0.86
earthquake                0

In [49]:
Random_Forest.loc[Random_Forest.label == 1].groupby(["communicate"])["recalls"].mean().sort_values()

communicate
aid_centers               0.00
tools                     0.00
shops                     0.00
security                  0.00
other_infrastructure      0.00
offer                     0.00
infrastructure_related    0.00
fire                      0.00
hospitals                 0.00
other_weather             0.01
money                     0.01
missing_people            0.01
other_aid                 0.02
refugees                  0.03
electricity               0.03
military                  0.03
clothing                  0.03
cold                      0.04
medical_products          0.05
search_and_rescue         0.05
transport                 0.06
medical_help              0.08
buildings                 0.09
death                     0.12
shelter                   0.35
direct_report             0.37
water                     0.38
storm                     0.49
floods                    0.49
request                   0.50
food                      0.58
weather_related           0

In [42]:
print("Fitting the Random Forest Model took: {} seconds".format(t_delta))

Fitting the Random Forest Model took: 460 seconds


### There are too many classes for the random forest to be efficient on a single PC. Therefore, for the next iterations i will use other models the fiiting time of which will hopefully be shorter

### Moreover, especially the recall levels for correctly identified actual messages are extremely poor. Thus, I will also try to optimize the given data a bit before reaching for other classifiers and playing with their parameters.