# ML Pipeline Preparation

## 1. Importing libraries and loading data from the database

In [1]:
#Natural Language Processing
import nltk
#Fetch the NLTK resources used by the tokenizer (tokenizer models, stop words, WordNet)
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

#Data Science Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Regex and pickle
import re
import pickle

#Database sql connectivity
from sqlalchemy import create_engine

#Natural Language Processing Libraries
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Machine Learning Libraries (sklearn)
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report


#### Optional display settings

In [3]:
#matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
#releases no longer accept the old names, so use whichever spelling is installed
#(falling back to the default style if neither is available).
dark_style = next(
    (name for name in ('seaborn-v0_8-dark', 'seaborn-dark') if name in plt.style.available),
    'default',
)
plt.style.use(dark_style)
pd.set_option('display.max_rows', 50)

In [4]:
#Load the 'Messages' table of the local SQLite database into a data frame
#and print a summary of its columns.
engine = create_engine('sqlite:///disaster_database.db')
df = pd.read_sql_table('Messages', engine)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26028 entries, 0 to 26027
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      26028 non-null  int64  
 1   message                 26028 non-null  object 
 2   related                 26028 non-null  float64
 3   request                 26028 non-null  float64
 4   offer                   26028 non-null  float64
 5   aid_related             26028 non-null  float64
 6   medical_help            26028 non-null  float64
 7   medical_products        26028 non-null  float64
 8   search_and_rescue       26028 non-null  float64
 9   security                26028 non-null  float64
 10  military                26028 non-null  float64
 11  water                   26028 non-null  float64
 12  food                    26028 non-null  float64
 13  shelter                 26028 non-null  float64
 14  clothing                26028 non-null

#### General division into labels and features

In [5]:
#Feature: the message text (column 1)
X = df.iloc[:, 1]
#Labels: every column from position 3 onwards
#NOTE(review): this skips column 2 ('related' in the df.info() listing) - confirm that leaving it out is intended
Y = df.iloc[:, 3:]

In [6]:
#Hold out 10% of the messages for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X,Y, 
                                                    test_size = 0.1, 
                                                    random_state = 42)


## 2. Key functions

#### Tokenizer for processing the text data

In [7]:
def tokenize(text): 
    """
    Turn a raw message into a list of normalized tokens.

    Parameters
    ----------
    text : str
        Message to process.

    Returns
    -------
    list of str
        Lower-cased tokens without English stop words, each lemmatized as a verb.
    """
    #Normalization - lowercase and punctuation removal:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #Tokenization:
    words = word_tokenize(text)
    #Stop words removal. The stop-word list is loaded once per call and kept
    #in a set; loading it again for every single token made the vectorizing
    #step much slower than necessary.
    stop_words = set(stopwords.words("english"))
    #Lemmatization with one lemmatizer instance shared by all tokens
    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(w, pos = "v") for w in words if w not in stop_words]
    
    return lemmed

#### Since I want to test out many models, it is convenient to have a function that builds a pipeline from keyword arguments

In [8]:
def make_sklearn_pipeline(Pipeline = Pipeline, memory = None, verbose = False, **kwargs):
    """
    Build an sklearn pipeline from keyword arguments.

    Parameters
    ----------
    Pipeline : the sklearn Pipeline class. It should be left at its default.
    memory : passed on to Pipeline unchanged (default None).
    verbose : passed on to Pipeline unchanged (default False).
    **kwargs : must contain ``steps``, the list of (name, estimator) tuples
        which define the pipeline. Other keywords are not used.

    Example
    -------
    example_pipeline = make_sklearn_pipeline(steps = [('name_1', transformer_1),
                                                      ('name_2', transformer_2),
                                                      ('name_n', classifier_n)],
                                             verbose = ...,
                                             memory = ...)

    Returns: the instantiated pipeline. A missing ``steps`` keyword raises KeyError.

    For more information visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    """
    pipeline_steps = kwargs["steps"]

    return Pipeline(steps = pipeline_steps, memory = memory, verbose = verbose)

#### Function transforming the classification reports into a data frame with precision, recall and f1-score only

In [9]:
def df_from_sklearn_cl_reports(cl_report):
    """
    Collect the per-class rows of several sklearn text classification reports
    in one data frame, so that the results for different categories are
    easier to compare.

    Parameters
    ----------
    cl_report : dict
        Maps a category name to the text returned by
        sklearn.metrics.classification_report for that category.
        The class labels must be numeric (e.g. "0.0" and "1.0").

    Returns
    -------
    pandas.DataFrame
        Indexed by "communicate" (the category name), one row per class, with
        the columns "label" (int), "precisions", "recalls" and "f1_scores"
        (float). An empty dict gives an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a class row does not consist of the five fields
        label, precision, recall, f1-score and support.
    """
    columns = ["communicate", "label", "precisions", "recalls", "f1_scores"]
    #First word of the summary rows which follow the per-class rows
    summary_rows = ("accuracy", "micro", "macro", "weighted", "samples")

    #All rows are gathered in a plain list and the data frame is built once,
    #instead of concatenating a new frame for every category
    rows = []
    for feature, report in cl_report.items():
        for line in report.splitlines():
            fields = line.split()
            #Blank lines and the "precision recall f1-score support" header
            if not fields or fields[0] == "precision":
                continue
            #The per-class rows end where the summary rows begin
            if fields[0] in summary_rows:
                break
            if len(fields) != 5:
                raise ValueError(
                    "Unexpected row in the classification report of '{}': {!r}"
                    .format(feature, line))
            #The last field (support) is not needed
            rows.append([feature] + fields[:4])

    data_frame = pd.DataFrame(rows, columns = columns).set_index(["communicate"])

    #Changing the data types. Labels look like "1.0", hence the detour over float
    data_frame["label"] = data_frame["label"].astype("float").astype("int")
    data_frame["precisions"] = data_frame["precisions"].astype("float")
    data_frame["recalls"] = data_frame["recalls"].astype("float")
    data_frame["f1_scores"] = data_frame["f1_scores"].astype("float")

    return data_frame 

In [10]:
classification_reports = {}

## 3. Building a machine learning pipeline

### 3.1 SVC

In [11]:
#Word counts -> TF-IDF weights -> multi-output wrapper around a balanced linear SVM
SVC = make_sklearn_pipeline(
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = LinearSVC(class_weight = "balanced",
                                  dual = False,
                                  max_iter = 100000,
                                  random_state = 42))),
    ],
    verbose = True,
)

In [12]:
classification_reports.clear()

In [13]:
SVC.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   5.3s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000129E0BFB3A0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=LinearSVC(class_weight='balanced',
                                                           dual=False,
                                                           max_iter=100000,
                                                           random_state=42)))],
         verbose=True)

#### Classification report

In [14]:
#Predict on the held-out messages and name the prediction columns like the label columns
y_pred_svc = pd.DataFrame(SVC.predict(X_test), columns = Y.columns.tolist())
#Positional index, so that y_test lines up row by row with the predictions
y_test = y_test.reset_index(drop = True)
#One text report per label column, printed under the column name
for position, category in enumerate(Y.columns):
    print(category)
    print(classification_report(y_test.iloc[:, position], y_pred_svc.iloc[:, position]))

request
              precision    recall  f1-score   support

         0.0       0.94      0.89      0.91      2175
         1.0       0.56      0.73      0.63       428

    accuracy                           0.86      2603
   macro avg       0.75      0.81      0.77      2603
weighted avg       0.88      0.86      0.87      2603

offer
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      2589
         1.0       0.08      0.07      0.08        14

    accuracy                           0.99      2603
   macro avg       0.54      0.53      0.54      2603
weighted avg       0.99      0.99      0.99      2603

aid_related
              precision    recall  f1-score   support

         0.0       0.81      0.78      0.80      1558
         1.0       0.69      0.73      0.71      1045

    accuracy                           0.76      2603
   macro avg       0.75      0.76      0.75      2603
weighted avg       0.76      0.76      0.76     

              precision    recall  f1-score   support

         0.0       0.97      0.92      0.95      2475
         1.0       0.22      0.42      0.29       128

    accuracy                           0.90      2603
   macro avg       0.59      0.67      0.62      2603
weighted avg       0.93      0.90      0.91      2603

direct_report
              precision    recall  f1-score   support

         0.0       0.91      0.84      0.88      2105
         1.0       0.50      0.66      0.57       498

    accuracy                           0.81      2603
   macro avg       0.71      0.75      0.72      2603
weighted avg       0.83      0.81      0.82      2603



In [15]:
#Store the text report of every label column under the column name
classification_reports.update({
    category: classification_report(y_test.iloc[:, position], y_pred_svc.iloc[:, position])
    for position, category in enumerate(Y.columns)
})

In [16]:
svc_report = df_from_sklearn_cl_reports(classification_reports)

### In my opinion the most important metric is recall for True values: it is essential that help reaches everyone who requires it. The second most important metric is precision for True values, so that the responding organisations do not waste their resources on unnecessary actions.

#### Mean precision for True over all categories

In [17]:
#Precision of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
svc_precisions = svc_report.loc[
    svc_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)
svc_precisions.mean()

0.39382352941176474

#### Precisions for True for each category

In [18]:
svc_precisions

communicate
earthquake                0.78
weather_related           0.75
aid_related               0.69
food                      0.68
storm                     0.67
water                     0.63
shelter                   0.59
floods                    0.56
request                   0.56
military                  0.55
death                     0.52
cold                      0.52
direct_report             0.50
fire                      0.47
electricity               0.45
clothing                  0.41
buildings                 0.41
medical_help              0.38
medical_products          0.36
money                     0.36
refugees                  0.32
other_aid                 0.29
missing_people            0.24
infrastructure_related    0.23
other_weather             0.22
transport                 0.22
hospitals                 0.21
shops                     0.20
other_infrastructure      0.18
search_and_rescue         0.14
aid_centers               0.12
security                  0

#### Mean recall for True over all categories

In [19]:
#Recall of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
svc_recalls = svc_report.loc[
    svc_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)
svc_recalls.mean()

0.48147058823529415

#### Recalls for True for each category

In [20]:
svc_recalls

communicate
food                      0.84
weather_related           0.82
earthquake                0.79
storm                     0.77
water                     0.74
request                   0.73
aid_related               0.73
shelter                   0.72
floods                    0.68
death                     0.68
direct_report             0.66
electricity               0.57
military                  0.56
medical_products          0.56
medical_help              0.55
cold                      0.52
money                     0.52
buildings                 0.52
clothing                  0.50
refugees                  0.49
other_aid                 0.47
fire                      0.42
other_weather             0.42
infrastructure_related    0.38
transport                 0.37
missing_people            0.31
other_infrastructure      0.31
hospitals                 0.20
search_and_rescue         0.17
security                  0.12
shops                     0.09
aid_centers               0

### 3.2 DecisionTree

In [21]:
#Word counts -> TF-IDF weights -> multi-output wrapper around a balanced, depth-limited decision tree
tree = make_sklearn_pipeline(
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = DecisionTreeClassifier(max_depth = 7,
                                               min_samples_leaf = 1,
                                               class_weight = "balanced",
                                               random_state = 42))),
    ],
    verbose = True,
)

In [22]:
classification_reports.clear()

In [23]:
tree.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   7.1s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000129E0BFB3A0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                        max_depth=7,
                                                                        random_state=42)))],
         verbose=True)

In [24]:
#Predict on the held-out messages and name the prediction columns like the label columns
y_pred_tree = pd.DataFrame(tree.predict(X_test), columns = Y.columns.tolist())
#Positional index, so that y_test lines up row by row with the predictions
y_test = y_test.reset_index(drop = True)
#One text report per label column, printed under the column name
for position, category in enumerate(Y.columns):
    print(category)
    print(classification_report(y_test.iloc[:, position], y_pred_tree.iloc[:, position]))

request
              precision    recall  f1-score   support

         0.0       0.94      0.88      0.91      2175
         1.0       0.54      0.70      0.61       428

    accuracy                           0.85      2603
   macro avg       0.74      0.79      0.76      2603
weighted avg       0.87      0.85      0.86      2603

offer
              precision    recall  f1-score   support

         0.0       1.00      0.91      0.95      2589
         1.0       0.02      0.29      0.03        14

    accuracy                           0.91      2603
   macro avg       0.51      0.60      0.49      2603
weighted avg       0.99      0.91      0.95      2603

aid_related
              precision    recall  f1-score   support

         0.0       0.74      0.82      0.77      1558
         1.0       0.67      0.57      0.62      1045

    accuracy                           0.72      2603
   macro avg       0.71      0.69      0.70      2603
weighted avg       0.71      0.72      0.71     

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2348
         1.0       0.82      0.80      0.81       255

    accuracy                           0.96      2603
   macro avg       0.90      0.89      0.89      2603
weighted avg       0.96      0.96      0.96      2603

cold
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      2540
         1.0       0.54      0.60      0.57        63

    accuracy                           0.98      2603
   macro avg       0.76      0.80      0.78      2603
weighted avg       0.98      0.98      0.98      2603

other_weather
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2475
         1.0       0.28      0.27      0.27       128

    accuracy                           0.93      2603
   macro avg       0.62      0.62      0.62      2603
weighted avg       0.93      0.93      0.93      2603



In [25]:
#Store the text report of every label column under the column name
classification_reports.update({
    category: classification_report(y_test.iloc[:, position], y_pred_tree.iloc[:, position])
    for position, category in enumerate(Y.columns)
})

In [26]:
tree_report = df_from_sklearn_cl_reports(classification_reports)

#### Mean precision for True over all categories

In [27]:
#Precision of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
decision_tree_precisions = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)
decision_tree_precisions.mean()

0.35558823529411776

#### Precisions for True for each category

In [28]:
decision_tree_precisions

communicate
weather_related           0.85
earthquake                0.82
food                      0.68
aid_related               0.67
storm                     0.66
floods                    0.64
shelter                   0.61
water                     0.58
cold                      0.54
request                   0.54
military                  0.51
direct_report             0.48
medical_help              0.46
death                     0.39
electricity               0.38
buildings                 0.38
other_aid                 0.34
clothing                  0.30
other_weather             0.28
security                  0.25
money                     0.25
medical_products          0.24
transport                 0.23
infrastructure_related    0.22
other_infrastructure      0.16
refugees                  0.15
fire                      0.14
aid_centers               0.10
search_and_rescue         0.08
hospitals                 0.08
missing_people            0.04
offer                     0

#### Mean recall for True over all categories

In [29]:
#Recall of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
decision_tree_recalls = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)

decision_tree_recalls.mean()

0.5388235294117647

#### Recalls for True for each category

In [30]:
decision_tree_recalls

communicate
water                     0.85
food                      0.83
earthquake                0.80
shelter                   0.76
death                     0.73
storm                     0.71
request                   0.70
money                     0.69
floods                    0.67
buildings                 0.66
refugees                  0.63
clothing                  0.61
weather_related           0.60
cold                      0.60
electricity               0.59
medical_products          0.59
aid_related               0.57
direct_report             0.56
missing_people            0.55
other_aid                 0.53
military                  0.50
aid_centers               0.47
infrastructure_related    0.45
fire                      0.45
other_infrastructure      0.44
search_and_rescue         0.44
hospitals                 0.40
transport                 0.39
medical_help              0.32
offer                     0.29
security                  0.28
other_weather             0

### 3.3 RandomForest

In [31]:
classification_reports.clear()

In [32]:
#Word counts -> TF-IDF weights -> multi-output wrapper around a balanced random forest of 500 shallow trees
forest = make_sklearn_pipeline(
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = RandomForestClassifier(n_estimators = 500,
                                               max_depth = 7,
                                               min_samples_leaf = 1,
                                               class_weight = "balanced",
                                               random_state = 42))),
    ],
    verbose = True,
)

In [33]:
forest.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.2min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.0min


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000129E0BFB3A0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                                        max_depth=7,
                                                                        n_estimators=500,
                                                                        random_state=42)))],
         verbose=True)

In [34]:
#Predict on the held-out messages and name the prediction columns like the label columns
y_pred_forest = pd.DataFrame(forest.predict(X_test), columns = Y.columns.tolist())
#Positional index, so that y_test lines up row by row with the predictions
y_test = y_test.reset_index(drop = True)
#One text report per label column, printed under the column name
for position, category in enumerate(Y.columns):
    print(category)
    print(classification_report(y_test.iloc[:, position], y_pred_forest.iloc[:, position]))

request
              precision    recall  f1-score   support

         0.0       0.95      0.89      0.92      2175
         1.0       0.56      0.74      0.64       428

    accuracy                           0.86      2603
   macro avg       0.75      0.81      0.78      2603
weighted avg       0.88      0.86      0.87      2603

offer
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      2589
         1.0       0.09      0.07      0.08        14

    accuracy                           0.99      2603
   macro avg       0.54      0.53      0.54      2603
weighted avg       0.99      0.99      0.99      2603

aid_related
              precision    recall  f1-score   support

         0.0       0.79      0.83      0.81      1558
         1.0       0.73      0.66      0.69      1045

    accuracy                           0.76      2603
   macro avg       0.76      0.75      0.75      2603
weighted avg       0.76      0.76      0.76     

              precision    recall  f1-score   support

         0.0       0.97      0.89      0.93      2475
         1.0       0.18      0.47      0.26       128

    accuracy                           0.87      2603
   macro avg       0.58      0.68      0.60      2603
weighted avg       0.93      0.87      0.90      2603

direct_report
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88      2105
         1.0       0.50      0.63      0.56       498

    accuracy                           0.81      2603
   macro avg       0.70      0.74      0.72      2603
weighted avg       0.83      0.81      0.82      2603



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
#Store the text report of every label column under the column name
classification_reports.update({
    category: classification_report(y_test.iloc[:, position], y_pred_forest.iloc[:, position])
    for position, category in enumerate(Y.columns)
})

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
forest_report = df_from_sklearn_cl_reports(classification_reports)

#### Mean precision for True over all categories

In [37]:
#Precision of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
forest_precisions = forest_report.loc[forest_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)
forest_precisions.mean()

0.3735294117647059

#### Precisions for True for each category

In [38]:
forest_precisions

communicate
fire                      0.78
aid_related               0.73
weather_related           0.73
earthquake                0.70
food                      0.57
request                   0.56
storm                     0.54
direct_report             0.50
tools                     0.50
floods                    0.49
shelter                   0.49
water                     0.47
cold                      0.44
medical_help              0.44
buildings                 0.42
military                  0.40
death                     0.37
medical_products          0.36
other_aid                 0.33
missing_people            0.30
money                     0.28
electricity               0.28
refugees                  0.27
transport                 0.23
infrastructure_related    0.21
clothing                  0.20
search_and_rescue         0.19
other_weather             0.18
security                  0.18
other_infrastructure      0.17
hospitals                 0.17
aid_centers               0

#### Mean recall for True over all categories

In [39]:
#Recall of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
forest_recalls = forest_report.loc[forest_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)
forest_recalls.mean()

0.5082352941176472

#### Recalls for True for each category

In [40]:
forest_recalls

communicate
food                      0.82
water                     0.76
shelter                   0.76
request                   0.74
floods                    0.73
military                  0.71
medical_products          0.70
weather_related           0.69
earthquake                0.68
storm                     0.68
aid_related               0.66
buildings                 0.65
refugees                  0.64
direct_report             0.63
death                     0.63
medical_help              0.60
electricity               0.57
cold                      0.57
clothing                  0.56
other_infrastructure      0.56
infrastructure_related    0.55
other_aid                 0.52
transport                 0.51
other_weather             0.47
money                     0.45
missing_people            0.34
search_and_rescue         0.29
security                  0.21
fire                      0.21
hospitals                 0.17
aid_centers               0.09
offer                     0

### Grid Search for the best model (decision tree) 

In [41]:
#Grid of decision tree settings to try. The 'clf__estimator__' prefix addresses the
#estimator wrapped by the MultiOutputClassifier in the 'clf' step of the pipeline.
parameters = {
    'clf__estimator__max_depth': [5,7,9,20],
    'clf__estimator__criterion': ["gini", "entropy"]    
}

In [42]:
#NOTE(review): no `scoring` is passed, so the candidates are presumably ranked by the
#pipeline's default score and not by the recall/precision of the True class which
#this notebook treats as most important - confirm that this is the intended criterion.
tree_opt = GridSearchCV(tree, param_grid = parameters)

In [43]:
classification_reports.clear()

In [44]:
#Attention! This cell takes a long time to run: the whole pipeline, including the
#slow 'vect' step, is fitted again for every parameter combination and fold.
tree_opt.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.6min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.8s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.6min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.8s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.6min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   4.2s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.7min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.9s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.7min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipel

[Pipeline] ............... (step 3 of 3) Processing clf, total=  29.2s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.7min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  27.0s
[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  33.2s


GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x00000129E0BFB3A0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                                               max_depth=7,
                                                                                               random_state=42)))],
                                verbose=True),
             param_grid={'clf__estimator__criterion': ['gini', 'entropy'],
                         'clf__estimator__max_depth': [5, 7, 9, 20]})

In [45]:
tree_opt.cv_results_


{'mean_fit_time': array([162.75935116, 170.94305186, 168.73900099, 184.78123236,
        172.30614152, 170.79023337, 169.48390012, 191.00139642]),
 'std_fit_time': array([ 3.45169985,  8.5733746 ,  1.91539448,  2.88525028,  6.26343151,
         4.14690175,  1.38919186, 10.55712749]),
 'mean_score_time': array([40.24747219, 40.93305717, 39.95875807, 40.66876884, 40.59716311,
        39.66671414, 39.68406019, 41.73592234]),
 'std_score_time': array([0.79366467, 1.5253647 , 1.15818795, 0.8086596 , 0.85017281,
        0.49761043, 0.60251688, 2.78967555]),
 'param_clf__estimator__criterion': masked_array(data=['gini', 'gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy', 'entropy'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__max_depth': masked_array(data=[5, 7, 9, 20, 5, 7, 9, 20],
              mask=[False, False, False, False, False, False, False, False],
  

#### The best rank seems to be for criterion "entropy" and max depth 20. It doesn't necessarily mean better recalls or precisions for our categories

In [46]:
#Same decision tree pipeline, now with criterion "entropy" and max_depth 20.
#This rebinds the name `tree`.
tree = make_sklearn_pipeline(
    steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            estimator = DecisionTreeClassifier(criterion = "entropy",
                                               max_depth = 20,
                                               min_samples_leaf = 1,
                                               class_weight = "balanced",
                                               random_state = 42))),
    ],
    verbose = True,
)

In [47]:
classification_reports.clear()

In [48]:
tree.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  34.1s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000129E0BFB3A0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                        criterion='entropy',
                                                                        max_depth=20,
                                                                        random_state=42)))],
         verbose=True)

In [50]:
#Predict on the held-out messages and name the prediction columns like the label columns
y_pred_tree = pd.DataFrame(tree.predict(X_test), columns = Y.columns.tolist())
#Positional index, so that y_test lines up row by row with the predictions
y_test = y_test.reset_index(drop = True)
#One text report per label column, printed under the column name
for position, category in enumerate(Y.columns):
    print(category)
    print(classification_report(y_test.iloc[:, position], y_pred_tree.iloc[:, position]))

request
              precision    recall  f1-score   support

         0.0       0.94      0.88      0.91      2175
         1.0       0.53      0.70      0.60       428

    accuracy                           0.85      2603
   macro avg       0.73      0.79      0.75      2603
weighted avg       0.87      0.85      0.86      2603

offer
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      2589
         1.0       0.02      0.07      0.04        14

    accuracy                           0.98      2603
   macro avg       0.51      0.53      0.51      2603
weighted avg       0.99      0.98      0.98      2603

aid_related
              precision    recall  f1-score   support

         0.0       0.77      0.80      0.78      1558
         1.0       0.68      0.64      0.66      1045

    accuracy                           0.74      2603
   macro avg       0.73      0.72      0.72      2603
weighted avg       0.73      0.74      0.73     

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97      2540
         1.0       0.25      0.57      0.34        63

    accuracy                           0.95      2603
   macro avg       0.62      0.76      0.66      2603
weighted avg       0.97      0.95      0.96      2603

other_weather
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95      2475
         1.0       0.25      0.42      0.31       128

    accuracy                           0.91      2603
   macro avg       0.61      0.68      0.63      2603
weighted avg       0.93      0.91      0.92      2603

direct_report
              precision    recall  f1-score   support

         0.0       0.91      0.82      0.86      2105
         1.0       0.46      0.64      0.54       498

    accuracy                           0.79      2603
   macro avg       0.68      0.73      0.70      2603
weighted avg       0.82      0.79      0.80   

In [51]:
#Store the text report of every label column under the column name
classification_reports.update({
    category: classification_report(y_test.iloc[:, position], y_pred_tree.iloc[:, position])
    for position, category in enumerate(Y.columns)
})

In [52]:
tree_report = df_from_sklearn_cl_reports(classification_reports)

In [53]:
#Precision of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
decision_tree_precisions = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)
decision_tree_precisions.mean()

0.3405882352941176

In [54]:
#Recall of the label-1 rows, grouped per category and sorted from highest to lowest;
#the last line displays the mean over the categories.
decision_tree_recalls = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)
decision_tree_recalls.mean()

0.4947058823529413

#### As we can see, the results aren't better than for the initial decision tree (in terms of the considered metrics). Therefore the model will be left as it was before the grid search (max_depth = 7 and criterion "gini"). Such a decision tree will be used in train_classifier.py