# ML Pipeline Preparation

### 1. Importing libraries and loading data from the database

In [1]:
# Fetch the NLTK resources used by the tokenizer in this notebook:
# 'punkt' (word tokenizer models), 'stopwords' (stop-word lists) and 'wordnet' (lemmatizer data).
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import collections
import copy


from sqlalchemy import create_engine

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 

from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import classification_report

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

import joblib

### Optional display settings

In [3]:
# The 'seaborn-dark' style was renamed to 'seaborn-v0_8-dark' in matplotlib 3.6 and the
# old name was removed in later releases, so use whichever name this installation provides.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)
# Show at most 50 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 50)

In [4]:
# Read the whole 'Messages' table from the SQLite file 'disaster_database.db'
# (path relative to the working directory) and print its column overview.
engine = create_engine('sqlite:///disaster_database.db')
df = pd.read_sql_table('Messages', engine)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26028 entries, 0 to 26027
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      26028 non-null  int64  
 1   message                 26028 non-null  object 
 2   genre                   26028 non-null  object 
 3   related                 26028 non-null  float64
 4   request                 26028 non-null  float64
 5   offer                   26028 non-null  float64
 6   aid_related             26028 non-null  float64
 7   medical_help            26028 non-null  float64
 8   medical_products        26028 non-null  float64
 9   search_and_rescue       26028 non-null  float64
 10  security                26028 non-null  float64
 11  military                26028 non-null  float64
 12  water                   26028 non-null  float64
 13  food                    26028 non-null  float64
 14  shelter                 26028 non-null

### General division into labels and features

In [5]:
# Features: the raw message text (second column of the table).
X = df['message']
# Targets: every column after the three leading non-label columns (id, message, genre).
Y = df.drop(columns = ['id', 'message', 'genre'])

In [6]:
# Hold out 5 % of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.05, random_state = 42
)


### 2. Key functions

#### Tokenizer for processing the text data

In [7]:
# Filled on first use so the stop-word corpus is read only once per kernel session.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Return the English stop-word set and a shared lemmatizer, creating them on first use."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
        _TOKENIZER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TOKENIZER_RESOURCES['stop_words'], _TOKENIZER_RESOURCES['lemmatizer']


def tokenize(text):
    """
    Turn a raw message into a list of normalized tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter or digit with a space, split into words with NLTK's word_tokenize,
    drop English stop words and lemmatize the remaining words as verbs.

    Parameters
    ----------
    text : str
        The message to process.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # Previously the stop-word list was re-read and scanned linearly for every
    # token, and a new lemmatizer was built per word; both are now shared.
    stop_words, lemmatizer = _get_tokenizer_resources()

    # Normalize case and strip punctuation / special characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenization
    words = word_tokenize(text)

    # Stop-word removal and lemmatization in one pass
    return [lemmatizer.lemmatize(w, pos = "v") for w in words if w not in stop_words]

#### Since I want to try out many models, it is convenient to have a helper function that builds a pipeline from keyword arguments

In [8]:
def make_sklearn_pipeline(Pipeline = Pipeline, memory = None, verbose = False, **kwargs):
    """
    Build a pipeline from the keyword argument ``steps``.

    The first parameter is the pipeline class to instantiate; it defaults to
    sklearn's ``Pipeline`` and is not meant to be changed.

    The steps are given explicitly as a list of (name, estimator) tuples:

        example_pipeline = make_sklearn_pipeline(steps = [('name_1', transformer_1),
                                                          ('name_2', transformer_2),
                                                          ...,
                                                          ('name_n', classifier_n)],
                                                 verbose = ...,
                                                 memory = ...)

    ``memory`` (default None) and ``verbose`` (default False) are optional and
    are passed through unchanged. ``steps`` is required: a KeyError is raised
    when it is missing. Any other keyword argument is ignored.

    For more information visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    """
    steps = kwargs['steps']
    return Pipeline(steps = steps, memory = memory, verbose = verbose)

#### Function transforming the classification reports into a data frame with precision, recall and F1-score only

In [9]:
def _parse_class_rows(report):
    """Return (label, precision, recall, f1) string tuples for the per-class rows of one text report."""
    rows = []
    header_seen = False
    for line in report.splitlines():
        fields = line.split()
        if not fields:
            # A blank line after the class rows marks the start of the summary section.
            if rows:
                break
            continue
        if not header_seen:
            # The first non-empty line is the column header (precision, recall, ...).
            header_seen = True
            continue
        if fields[0] == 'accuracy':
            break
        if len(fields) < 5:
            raise ValueError("Unexpected classification report row: %r" % line)
        # The last four fields are precision, recall, f1-score and support;
        # whatever precedes them is the class label.
        label = ' '.join(fields[:-4])
        precision, recall, f1_score = fields[-4:-1]
        rows.append((label, precision, recall, f1_score))
    return rows


def df_from_sklearn_cl_reports(cl_reports):
    """
    Convert text classification reports into one tidy DataFrame.

    Parameters
    ----------
    cl_reports : dict
        Maps a target name to the text produced by sklearn's
        ``classification_report`` for that target.

    Returns
    -------
    pandas.DataFrame
        One row per (target, class label), indexed by ``communicate`` (the
        target name) with the columns ``label`` (int), ``precisions``,
        ``recalls`` and ``f1_scores`` (float). Support values and the summary
        rows (accuracy, macro avg, weighted avg) are not included. An empty
        dict yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a class row has fewer fields than expected or a label / metric
        cannot be converted to a number.
    """
    columns = ['communicate', 'label', 'precisions', 'recalls', 'f1_scores']

    # Collect all rows first and build the frame once instead of concatenating per target.
    records = [
        (feature,) + row
        for feature, report in cl_reports.items()
        for row in _parse_class_rows(report)
    ]

    data_frame = pd.DataFrame(records, columns = columns).set_index('communicate')

    # Labels are printed like '1.0', so go through float before casting to int.
    data_frame['label'] = data_frame['label'].astype('float').astype('int')
    for metric in ('precisions', 'recalls', 'f1_scores'):
        data_frame[metric] = data_frame[metric].astype('float')

    return data_frame

In [10]:
# Shared scratch dict: maps each target column name to the text classification
# report of the model currently being evaluated.
classification_reports = {}

### 3. Building a machine learning pipeline

#### 3.1 SVC

In [11]:
# Bag-of-words counts -> TF-IDF weights -> one class-balanced linear SVM per target column.
SVC = make_sklearn_pipeline(
    verbose = True,
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = LinearSVC(class_weight = "balanced", dual = False,
                                  max_iter = 10000, random_state = 42))),
    ],
)

In [12]:
# Start from an empty report dict so nothing from an earlier run is mixed in.
classification_reports.clear()

In [13]:
# Fit all three pipeline steps (vectorizer, TF-IDF, classifier) on the training split.
SVC.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.2min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   6.0s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000001C1B4F6AE50>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=LinearSVC(class_weight='balanced',
                                                           dual=False,
                                                           max_iter=10000,
                                                           random_state=42)))],
         verbose=True)

In [14]:
# Predict on the hold-out messages and label the prediction columns like the targets.
y_pred_svc = pd.DataFrame(SVC.predict(X_test), columns = list(Y.columns))
# Give the true labels the same 0..n-1 index as the prediction frame.
y_test = y_test.reset_index(drop = True)
# One text report per target column.
for i, var in enumerate(Y.columns):
    print(var, classification_report(y_test.iloc[:, i], y_pred_svc.iloc[:, i]), sep = "\n")

related
              precision    recall  f1-score   support

         0.0       0.56      0.71      0.63       321
         1.0       0.90      0.82      0.86       981

    accuracy                           0.79      1302
   macro avg       0.73      0.76      0.74      1302
weighted avg       0.81      0.79      0.80      1302

request
              precision    recall  f1-score   support

         0.0       0.95      0.90      0.92      1088
         1.0       0.59      0.73      0.66       214

    accuracy                           0.87      1302
   macro avg       0.77      0.82      0.79      1302
weighted avg       0.89      0.87      0.88      1302

offer
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1295
         1.0       0.00      0.00      0.00         7

    accuracy                           0.99      1302
   macro avg       0.50      0.50      0.50      1302
weighted avg       0.99      0.99      0.99      130

In [15]:
# Store the text report of every target column, keyed by column name, for later parsing.
classification_reports.update(
    (var, classification_report(y_test.iloc[:, i], y_pred_svc.iloc[:, i]))
    for i, var in enumerate(Y.columns)
)

In [16]:
# Parse the stored text reports into one DataFrame (one row per target column and class label).
svc_report = df_from_sklearn_cl_reports(classification_reports)

In [17]:
# Precision of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
svc_precisions = (
    svc_report
    .loc[svc_report.label == 1]
    .groupby(["communicate"])["precisions"]
    .mean()
    .sort_values(ascending = False)
)
svc_precisions.mean()

0.4259999999999999

In [18]:
# Recall of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
svc_recalls = (
    svc_report
    .loc[svc_report.label == 1]
    .groupby(["communicate"])["recalls"]
    .mean()
    .sort_values(ascending = False)
)
svc_recalls.mean()

0.5182857142857143

#### 3.2 DecisionTree

In [19]:
# Bag-of-words counts -> TF-IDF weights -> one class-balanced, depth-limited decision tree per target column.
Tree = make_sklearn_pipeline(
    verbose = True,
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = DecisionTreeClassifier(max_depth = 7, min_samples_leaf = 1,
                                               class_weight = "balanced",
                                               random_state = 42))),
    ],
)

In [20]:
# Remove the reports of the previous model before evaluating the decision tree.
classification_reports.clear()

In [21]:
# Fit all three pipeline steps (vectorizer, TF-IDF, classifier) on the training split.
Tree.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   8.0s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000001C1B4F6AE50>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                        max_depth=7,
                                                                        random_state=42)))],
         verbose=True)

In [22]:
# Predict on the hold-out messages and label the prediction columns like the targets.
y_pred_tree = pd.DataFrame(Tree.predict(X_test), columns = list(Y.columns))
# Give the true labels the same 0..n-1 index as the prediction frame.
y_test = y_test.reset_index(drop = True)
# One text report per target column.
for i, var in enumerate(Y.columns):
    print(var, classification_report(y_test.iloc[:, i], y_pred_tree.iloc[:, i]), sep = "\n")

related
              precision    recall  f1-score   support

         0.0       0.34      0.86      0.48       321
         1.0       0.91      0.44      0.59       981

    accuracy                           0.55      1302
   macro avg       0.62      0.65      0.54      1302
weighted avg       0.77      0.55      0.57      1302

request
              precision    recall  f1-score   support

         0.0       0.94      0.88      0.91      1088
         1.0       0.55      0.72      0.62       214

    accuracy                           0.86      1302
   macro avg       0.74      0.80      0.77      1302
weighted avg       0.88      0.86      0.86      1302

offer
              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96      1295
         1.0       0.02      0.29      0.04         7

    accuracy                           0.92      1302
   macro avg       0.51      0.61      0.50      1302
weighted avg       0.99      0.92      0.95      130

In [23]:
# Store the text report of every target column, keyed by column name, for later parsing.
classification_reports.update(
    (var, classification_report(y_test.iloc[:, i], y_pred_tree.iloc[:, i]))
    for i, var in enumerate(Y.columns)
)

In [24]:
# Parse the stored text reports into one DataFrame (one row per target column and class label).
tree_report = df_from_sklearn_cl_reports(classification_reports)

In [69]:
# Precision of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
decision_tree_precisions = (
    tree_report
    .loc[tree_report.label == 1]
    .groupby(["communicate"])["precisions"]
    .mean()
    .sort_values(ascending = False)
)
decision_tree_precisions.mean()

0.3871428571428573

In [68]:
# Recall of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
decision_tree_recalls = (
    tree_report
    .loc[tree_report.label == 1]
    .groupby(["communicate"])["recalls"]
    .mean()
    .sort_values(ascending = False)
)
decision_tree_recalls.mean()


0.5617142857142857

#### 3.3 RandomForest

In [27]:
# Remove the reports of the previous model before evaluating the random forest.
classification_reports.clear()

In [28]:
# Bag-of-words counts -> TF-IDF weights -> one class-balanced random forest (500 shallow trees) per target column.
forest = make_sklearn_pipeline(
    verbose = True,
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = RandomForestClassifier(n_estimators = 500, max_depth = 7,
                                               min_samples_leaf = 1,
                                               class_weight = "balanced",
                                               random_state = 42))),
    ],
)

In [29]:
# Fit all three pipeline steps (vectorizer, TF-IDF, classifier) on the training split.
forest.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.2min


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000001C1B4F6AE50>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                                        max_depth=7,
                                                                        n_estimators=500,
                                                                        random_state=42)))],
         verbose=True)

In [30]:
# Predict on the hold-out messages and label the prediction columns like the targets.
y_pred_forest = pd.DataFrame(forest.predict(X_test), columns = list(Y.columns))
# Give the true labels the same 0..n-1 index as the prediction frame.
y_test = y_test.reset_index(drop = True)
# One text report per target column.
for i, var in enumerate(Y.columns):
    print(var, classification_report(y_test.iloc[:, i], y_pred_forest.iloc[:, i]), sep = "\n")

related
              precision    recall  f1-score   support

         0.0       0.47      0.81      0.59       321
         1.0       0.92      0.70      0.79       981

    accuracy                           0.73      1302
   macro avg       0.69      0.76      0.69      1302
weighted avg       0.81      0.73      0.74      1302

request
              precision    recall  f1-score   support

         0.0       0.95      0.89      0.92      1088
         1.0       0.57      0.75      0.65       214

    accuracy                           0.87      1302
   macro avg       0.76      0.82      0.78      1302
weighted avg       0.89      0.87      0.87      1302

offer
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1295
         1.0       0.14      0.14      0.14         7

    accuracy                           0.99      1302
   macro avg       0.57      0.57      0.57      1302
weighted avg       0.99      0.99      0.99      130

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Store the text report of every target column, keyed by column name, for later parsing.
classification_reports.update(
    (var, classification_report(y_test.iloc[:, i], y_pred_forest.iloc[:, i]))
    for i, var in enumerate(Y.columns)
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Parse the stored text reports into one DataFrame (one row per target column and class label).
forest_report = df_from_sklearn_cl_reports(classification_reports)

In [33]:
# Precision of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
forest_precisions = (
    forest_report
    .loc[forest_report.label == 1]
    .groupby(["communicate"])["precisions"]
    .mean()
    .sort_values(ascending = False)
)
forest_precisions.mean()

0.38885714285714296

In [34]:
# Recall of the positive class (label 1) per target column, highest first;
# the cell output is the mean over all target columns.
forest_recalls = (
    forest_report
    .loc[forest_report.label == 1]
    .groupby(["communicate"])["recalls"]
    .mean()
    .sort_values(ascending = False)
)
forest_recalls.mean()

0.5308571428571429

### Stacking 

In [62]:

def get_stacking():
    """
    Build a stacking ensemble from the three pipelines defined in this notebook.

    Reads the notebook-level pipelines ``SVC``, ``Tree`` and ``forest`` and
    registers them as level-0 estimators under the names 'svc', 'tree' and
    'forest'. A LogisticRegression wrapped in MultiOutputClassifier is used as
    the final estimator.

    Returns
    -------
    StackingClassifier
        The configured ensemble; ``fit`` has not been called on it.
    """
    level0 = [("svc", SVC), ("tree", Tree), ("forest", forest)]
    final_estimator = MultiOutputClassifier(estimator = LogisticRegression())
    # NOTE(review): every level-0 estimator is a complete text pipeline with its own
    # CountVectorizer, so the messages are presumably tokenized again for each of
    # them when the ensemble is fitted - expect long run times; confirm before use.
    return StackingClassifier(estimators = level0, final_estimator = final_estimator)

'\n\ndef get_stacking():\n    level0 = list()\n    level0.append(("svc",SVC))\n    level0.append(("tree", Tree))\n    level0.append(("forest", forest))\n    model = StackingClassifier(estimators=level0)\n    return model\n'

In [63]:
# Create the stacking ensemble; this only builds the object, no fitting happens here.
model = get_stacking()

In [1]:
# Needs `model`, `X_train` and `y_train` from earlier cells: after a kernel restart run
# the notebook from the top, otherwise this cell fails with a NameError.
model.fit(X_train, y_train)

NameError: name 'model' is not defined