# ML Pipeline Preparation

### 1. Importing libraries and loading data from data base

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\x\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import collections
import copy


from sqlalchemy import create_engine

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 

from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report





import joblib

### Optional display settings

In [3]:
plt.style.use('seaborn-dark')
pd.set_option('display.max_rows', 50)

In [4]:
engine = create_engine('sqlite:///Pipeline_Project.db')
df = pd.read_sql_table('Messages', engine)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26028 entries, 0 to 26027
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      26028 non-null  int64  
 1   message                 26028 non-null  object 
 2   related                 26028 non-null  float64
 3   request                 26028 non-null  float64
 4   offer                   26028 non-null  float64
 5   aid_related             26028 non-null  float64
 6   medical_help            26028 non-null  float64
 7   medical_products        26028 non-null  float64
 8   search_and_rescue       26028 non-null  float64
 9   security                26028 non-null  float64
 10  military                26028 non-null  float64
 11  water                   26028 non-null  float64
 12  food                    26028 non-null  float64
 13  shelter                 26028 non-null  float64
 14  clothing                26028 non-null

In [73]:
main_List = []

In [81]:
list_of_categories

['cold',
 'electricity',
 'military',
 'fire',
 'buildings',
 'money',
 'medical_help',
 'medical_products',
 'refugees',
 'other_aid',
 'missing_people',
 'hospitals',
 'other_weather',
 'transport',
 'infrastructure_related',
 'search_and_rescue',
 'aid_centers',
 'other_infrastructure',
 'security',
 'offer',
 'shops',
 'tools',
 'direct_report',
 'clothing']

In [74]:
for column in df[list_of_categories]:
    for message in df.loc[df[column] == 1]["message"]:
        message = message.lower()
        words = word_tokenize(message)
        for word in words:
            main_List.append(word)

In [75]:
words = [w for w in main_List if w not in stopwords.words("english")]

In [76]:
words = [w for w in words if len(w) > 2]

In [77]:
series = pd.Series(words)

In [78]:
series.value_counts().head(50)

people        5232
water         4630
help          4294
food          3950
need          3279
please        2468
earthquake    2174
said          2166
areas         1829
also          1759
n't           1727
relief        1688
aid           1482
health        1440
government    1422
affected      1412
two           1360
one           1307
destroyed     1239
many          1226
sandy         1197
http          1181
would         1168
area          1166
emergency     1137
supplies      1131
country       1126
heavy         1121
children      1096
like          1061
since         1058
rains         1041
floods        1040
tents         1012
million       1002
power         1001
house          997
flood          995
houses         970
including      965
medical        956
disaster       927
homes          917
region         901
due            899
local          861
assistance     853
rain           849
find           848
hit            847
dtype: int64

In [79]:
important_words = list(series.value_counts().drop(["n't", 
                            "said", "would", "like", 
                            "two", "one", "please", "three", 
                            "http","including", "also", 
                           "still", "since", "many", "thousands", "millions", "years"]).head(100).index)

In [80]:
important_words

['people',
 'water',
 'help',
 'food',
 'need',
 'earthquake',
 'areas',
 'relief',
 'aid',
 'health',
 'government',
 'affected',
 'destroyed',
 'sandy',
 'area',
 'emergency',
 'supplies',
 'country',
 'heavy',
 'children',
 'rains',
 'floods',
 'tents',
 'million',
 'power',
 'house',
 'flood',
 'houses',
 'medical',
 'disaster',
 'homes',
 'region',
 'due',
 'local',
 'assistance',
 'rain',
 'find',
 'hit',
 'roads',
 'shelter',
 'families',
 'international',
 'victims',
 'haiti',
 'province',
 'well',
 'get',
 'storm',
 'damaged',
 'last',
 'new',
 'know',
 'city',
 'caused',
 'support',
 'villages',
 'living',
 'flooding',
 'damage',
 'village',
 'could',
 'red',
 'weather',
 'district',
 'rescue',
 'state',
 'days',
 'family',
 'winter',
 'river',
 'work',
 'hurricane',
 'pakistan',
 'tsunami',
 'killed',
 'hospital',
 'provide',
 'efforts',
 'may',
 'good',
 'left',
 'least',
 'thank',
 'year',
 'first',
 'live',
 'needs',
 'time',
 'road',
 'care',
 'humanitarian',
 'north',
 

### General division into labels and features

In [5]:
X = df.iloc[:, 1]
Y = df.iloc[:, 2:]

In [None]:
### Weights adjustments

In [None]:
#class_weights = []

In [None]:
"""for feature in Y:
    dictionary = {(Y[feature].value_counts()/len(Y)).index[0]: (Y[feature].value_counts()/len(Y)).values[1],
                  (Y[feature].value_counts()/len(Y)).index[1]: (Y[feature].value_counts()/len(Y)).values[0]}

    class_weights.append(dictionary)
"""    

In [None]:
"""for dictionary in copy.copy(class_weights):
    dictionary = {k: dictionary[k] for k in sorted(dictionary)}
    
    class_weights.pop(0)
    class_weights.append(dictionary)
"""

In [None]:
"""
for element in class_weights:
    for key, value in element.items():
        print(type(key), type(value))
        
"""

In [None]:
#compute_sample_weight(class_weights, Y)

### 2. Key functions

#### Tokenizer for processing the text data

In [6]:
def tokenize(text):
    #Normalization - lowercase  - no punctuation removal - the nltk.tokenize should interpret them by itself
    text = text.lower()
    ######text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #Toeknization
    ######words = text.split()
    words = word_tokenize(text)
    #Stop words removal
    words = [w for w in words if w not in stopwords.words("english")]
    
    #Named Entities??
    # Perhaps
    
    #Lemmatization
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
    
    
    return lemmed

#### Since I want to test out many models, it seems reasonable to write a function which will make pipeline with keyword arguments

In [7]:
def make_sklearn_pipeline(Pipeline = Pipeline, memory = None, verbose = False, **kwargs):
    """
    Firt argument is sklearn Pipepline class. It should not be changed.
    
    Definition of pipeline steps happens EXPLICITLY within the instantiation!
    Definition of typical pipeline looks like following:
    example_pipeline = make_sklearn_pipeline(steps = [('name#1',function#1),
                                                      ('name#2',function#2),
                                                      ('name#3',function#3),
                                                      ('name#4',function#4)....], verbose = ...,
                                                      memory = ...)
    
    It is not required to give values for ´verbose´ and ´memory´. They have default values as False
    and None respectively. For more information visit: 
    ´https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html´
    """
    pipeline = Pipeline(steps = kwargs['steps'], memory = memory, verbose = verbose)
    
    return pipeline

#### Function transforming the classification report into data frame with recall, precision and accuracy only 

In [8]:
def df_from_sklearn_cl_reports(cl_reports):

    data_frame = pd.DataFrame()

    for feature in list(cl_reports.keys()):
        l = cl_reports[feature].split(' ')
        l = [x for x in l if x !='']
        l = [x for x in l if '\n' not in x]
        l = l[:l.index('accuracy')]
        columns = l[:3]
        l = [x for x in l if x not in columns]
        labels = []
        precisions = []
        recalls = []
        f1_scores = []
    
        for i, element in enumerate(l):
            if i == 0 or i % 4 == 0:
                labels.append(element)
                precisions.append(l[i+1])
                recalls.append(l[i+2])
                f1_scores.append(l[i+3])
            
        comunicates = [feature]*len(labels)
    
        if data_frame.shape[0] == 0:      
            data_frame['communicate'] = comunicates
            data_frame['label'] = labels
            data_frame['precisions'] = precisions
            data_frame['recalls'] = recalls
            data_frame['f1_scores'] = f1_scores
        
        else:
            auxiliary_df = pd.DataFrame()
            auxiliary_df['communicate'] = comunicates
            auxiliary_df['label'] = labels
            auxiliary_df['precisions'] = precisions
            auxiliary_df['recalls'] = recalls
            auxiliary_df['f1_scores'] = f1_scores 
        
            data_frame = pd.concat([data_frame, auxiliary_df])
        
            del auxiliary_df
    
    data_frame.set_index(["communicate"], inplace = True)
    
    #Chaning the data types
    data_frame.label = data_frame.label.astype("float")
    data_frame.label = data_frame.label.astype("int")
    data_frame.precisions = data_frame.precisions.astype("float")
    data_frame.recalls = data_frame.recalls.astype("float")
    data_frame.f1_scores = data_frame.f1_scores.astype("float")
    
    return data_frame    

### 3. Building a machine learning pipeline

#### 3.1 AdaBoost Classifier

##### Definition and fitting

In [None]:
adaboost_with_transformer = make_sklearn_pipeline(verbose = True, steps = [('vect', CountVectorizer(tokenizer = tokenize)),
                           ('tfidf', TfidfTransformer()),                                              
                           ('clf', MultiOutputClassifier(estimator = AdaBoostClassifier(
                           DecisionTreeClassifier(class_weight = "balanced"))))])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)


In [None]:
adaboost_with_transformer.fit(X_train, y_train)

##### Predicting and classification report

In [None]:
y_pred_ada = adaboost_with_transformer.predict(X_test)
y_pred_ada = pd.DataFrame(y_pred_ada, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_ada.iloc[:,i]))

##### Providing a data frame for easier results assessment

In [9]:
classification_reports = {}

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_ada.iloc[:,i]));

In [None]:
adaboost_with_transformer_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
adaboost_with_transformer_report_df.loc[
    adaboost_with_transformer_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
adaboost_with_transformer_report_df.loc[
    adaboost_with_transformer_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)

In [None]:
adaboost_with_transformer_report_df.loc[
    adaboost_with_transformer_report_df.label == 1].groupby(
    ["communicate"])["f1_scores"].mean().sort_values(ascending = False).mean()

In [None]:
RandomForest_with_transformer = make_sklearn_pipeline(verbose = True, steps = [('vect', CountVectorizer(tokenizer = tokenize)),
                           ('tfidf', TfidfTransformer()),                                              
                           ('clf', MultiOutputClassifier(estimator = RandomForestClassifier
                                                         (class_weight = "balanced")))])

In [None]:
RandomForest_with_transformer.fit(X_train, y_train)

In [None]:
y_pred_forest = RandomForest_with_transformer.predict(X_test)
y_pred_forest = pd.DataFrame(y_pred_forest, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_forest.iloc[:,i]))

In [None]:
classification_reports.clear()

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_forest.iloc[:,i]));

In [None]:
forest_with_transformer_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
forest_with_transformer_report_df.loc[
    forest_with_transformer_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
forest_with_transformer_report_df.loc[
    forest_with_transformer_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()

##### Trying to ehence the model's performance via GridSearch

In [None]:
parameters = {
    'clf__estimator__n_estimators': [100,1000]    
}

In [None]:
RandomForest_optimized = GridSearchCV(RandomForest_with_transformer, param_grid = parameters)

In [None]:
#Attention! This cell may take a bit time to perform (around 15 minutes on typical PC)
RandomForest_optimized.fit(X_train, y_train)

In [None]:
y_pred_opt = RandomForest_optimized.predict(X_test)
y_pred_opt = pd.DataFrame(y_pred_opt, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_opt.iloc[:,i]))

In [None]:
classification_reports.clear()

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_opt.iloc[:,i]));

In [None]:
RandomForest_optimized_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
RandomForest_optimized_report_df.loc[
    RandomForest_optimized_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
RandomForest_optimized_report_df.loc[
    RandomForest_optimized_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()

In [None]:
classification_reports.clear()

In [None]:
dt_depth_2 = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1, class_weight = "balanced")
dt_depth_9 = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, class_weight = "balanced")

In [None]:
parameters = {
    'clf__estimator__base_estimator': [dt_depth_2,dt_depth_9],
    'clf__estimator__n_estimators': [50,100]    
}

In [None]:
adaboost_transf_optimized = GridSearchCV(adaboost_with_transformer, param_grid = parameters)

In [None]:
#Attention! This cell may take a bit time to perform (around 15 minutes on typical PC)
adaboost_transf_optimized.fit(X_train, y_train)

In [None]:
y_pred_opt_ada = adaboost_transf_optimized.predict(X_test)
y_pred_opt_ada = pd.DataFrame(y_pred_opt_ada, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_opt_ada.iloc[:,i]))

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_opt_ada.iloc[:,i]));

In [None]:
adaboost_optimized_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
adaboost_optimized_report_df.loc[
    adaboost_optimized_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
adaboost_optimized_report_df.loc[
    adaboost_optimized_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()


>Undersampling still gives poor results - escpecially for recall which is very important in disaster response system. Undersampling may not necesserilly work good in multioutput problems, because of possible noise beeing added to the data set.
Moreover, if we really want to balance all of the classes via undersampling we will end up with very small data set (some imbalanced classes are much less numerous than the better balanced ones). Hence the next iterration will be conducted with **´class weights´** adjustments and without any reduction of the data set size.

In [None]:
adaboost_optimized_report_df.to_csv("Adaboost_DT_GridSearch_depth2and9_estimators_50and100.csv", 
                                    index = True)

In [None]:
classification_reports.clear()

In [None]:
dt_splitter_random = DecisionTreeClassifier(splitter="random",class_weight = "balanced")
dt_splitter_best = DecisionTreeClassifier(class_weight = "balanced")

In [None]:
parameters = {
    'clf__estimator__base_estimator': [dt_splitter_random,dt_splitter_best],
    'clf__estimator__n_estimators': [50,100]    
}

In [None]:
adaboost_transf_optimized = GridSearchCV(adaboost_with_transformer, param_grid = parameters)

In [None]:
#Attention! This cell may take a bit time to perform (around 15 minutes on typical PC)
adaboost_transf_optimized.fit(X_train, y_train)

In [None]:
y_pred_opt_ada = adaboost_transf_optimized.predict(X_test)
y_pred_opt_ada = pd.DataFrame(y_pred_opt_ada, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_opt_ada.iloc[:,i]))

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_opt_ada.iloc[:,i]));

In [None]:
adaboost_optimized_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
adaboost_optimized_report_df.loc[
    adaboost_optimized_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)

In [None]:
adaboost_optimized_report_df.loc[
    adaboost_optimized_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()

In [None]:
adaboost_optimized_report_df.loc[
    adaboost_optimized_report_df.label == 1].groupby(
    ["communicate"])["f1_scores"].mean().sort_values(ascending = False)

In [None]:
adaboost_optimized_report_df.groupby(
    ["communicate"])["f1_scores"].mean().sort_values(ascending = False)


>Undersampling still gives poor results - escpecially for recall which is very important in disaster response system. Undersampling may not necesserilly work good in multioutput problems, because of possible noise beeing added to the data set.
Moreover, if we really want to balance all of the classes via undersampling we will end up with very small data set (some imbalanced classes are much less numerous than the better balanced ones). Hence the next iterration will be conducted with **´class weights´** adjustments and without any reduction of the data set size.

In [None]:
adaboost_optimized_report_df.to_csv("Adaboost_DT_GridSearch_splitter_random_best_estimators_50and100.csv", 
                                    index = True)

In [None]:
filename = "AdaBoostClassifier_opt"

In [None]:
joblib.dump(adaboost_transf_optimized, filename)

## SVC

In [11]:
SVC = make_sklearn_pipeline(verbose = True, steps = [('vect', CountVectorizer(tokenizer = tokenize)),
                           ('tfidf', TfidfTransformer()),                                              
                           ('clf', MultiOutputClassifier(estimator = LinearSVC
                                                         (class_weight = "balanced", dual = False, max_iter = 10000)))])

In [12]:
classification_reports.clear()

In [13]:
SVC.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.2min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   5.4s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000227E6689A60>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=LinearSVC(class_weight='balanced',
                                                           dual=False,
                                                           max_iter=10000)))],
         verbose=True)

In [14]:
y_pred_svc = SVC.predict(X_test)
y_pred_svc = pd.DataFrame(y_pred_svc, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_svc.iloc[:,i]))

related
              precision    recall  f1-score   support

         0.0       0.59      0.69      0.63      1221
         1.0       0.90      0.85      0.88      3985

    accuracy                           0.81      5206
   macro avg       0.74      0.77      0.76      5206
weighted avg       0.83      0.81      0.82      5206

request
              precision    recall  f1-score   support

         0.0       0.94      0.89      0.91      4311
         1.0       0.58      0.74      0.65       895

    accuracy                           0.86      5206
   macro avg       0.76      0.81      0.78      5206
weighted avg       0.88      0.86      0.87      5206

offer
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5182
         1.0       0.00      0.00      0.00        24

    accuracy                           0.99      5206
   macro avg       0.50      0.50      0.50      5206
weighted avg       0.99      0.99      0.99      520

In [15]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_svc.iloc[:,i]));

In [16]:
svc_report = df_from_sklearn_cl_reports(classification_reports)

In [48]:
svc_precisions = svc_report.loc[
    svc_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)

['cold',
 'electricity',
 'military',
 'fire',
 'buildings',
 'money',
 'medical_help',
 'medical_products',
 'refugees',
 'other_aid',
 'missing_people',
 'hospitals',
 'other_weather',
 'transport',
 'infrastructure_related',
 'search_and_rescue',
 'aid_centers',
 'other_infrastructure',
 'security',
 'offer',
 'shops',
 'tools']

In [47]:
svc_recalls = svc_report.loc[
    svc_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)

['money',
 'refugees',
 'cold',
 'transport',
 'fire',
 'other_weather',
 'infrastructure_related',
 'search_and_rescue',
 'hospitals',
 'other_infrastructure',
 'missing_people',
 'aid_centers',
 'security',
 'shops',
 'tools',
 'offer']

['cold',
 'electricity',
 'military',
 'fire',
 'buildings',
 'money',
 'medical_help',
 'medical_products',
 'refugees',
 'other_aid',
 'missing_people',
 'hospitals',
 'other_weather',
 'transport',
 'infrastructure_related',
 'search_and_rescue',
 'aid_centers',
 'other_infrastructure',
 'security',
 'offer',
 'shops',
 'tools',
 'money',
 'refugees',
 'cold',
 'transport',
 'fire',
 'other_weather',
 'infrastructure_related',
 'search_and_rescue',
 'hospitals',
 'other_infrastructure',
 'missing_people',
 'aid_centers',
 'security',
 'shops',
 'tools',
 'offer']


>Undersampling still gives poor results - escpecially for recall which is very important in disaster response system. Undersampling may not necesserilly work good in multioutput problems, because of possible noise beeing added to the data set.
Moreover, if we really want to balance all of the classes via undersampling we will end up with very small data set (some imbalanced classes are much less numerous than the better balanced ones). Hence the next iterration will be conducted with **´class weights´** adjustments and without any reduction of the data set size.

## Adaboost SVC

In [None]:
classification_reports.clear()

In [None]:
adaSVC = make_sklearn_pipeline(verbose = True, steps = [('vect', CountVectorizer(tokenizer = tokenize)),
                           ('tfidf', TfidfTransformer()),                                              
                       ('clf', MultiOutputClassifier(estimator = AdaBoostClassifier(
                           LinearSVC(class_weight = "balanced"),algorithm="SAMME")))])

In [None]:
adaSVC.fit(X_train, y_train)

##### Predicting and classification report

In [None]:
y_pred_ada_svc = adaSVC.predict(X_test)
y_pred_ada_svc = pd.DataFrame(y_pred_ada_svc, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_ada_svc.iloc[:,i]))

##### Providing a data frame for easier results assessment

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_ada_svc.iloc[:,i]));

In [None]:
ada_svc_report_df = df_from_sklearn_cl_reports(classification_reports)

In [None]:
ada_svc_report_df.loc[
    ada_svc_report_df.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
ada_svc_report_df.loc[
    ada_svc_report_df.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()

## Decision Tree

In [19]:
Tree = make_sklearn_pipeline(verbose = True, steps = [('vect', CountVectorizer(tokenizer = tokenize)),
                           ('tfidf', TfidfTransformer()),                                              
                           ('clf', MultiOutputClassifier(estimator = DecisionTreeClassifier
                                                         (max_depth=9, min_samples_leaf=1, class_weight = "balanced")))])

In [20]:
classification_reports.clear()

In [21]:
Tree.fit(X_train, y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 3.1min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  10.5s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000227E6689A60>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                        max_depth=9)))],
         verbose=True)

In [22]:
y_pred_tree = Tree.predict(X_test)
y_pred_tree = pd.DataFrame(y_pred_tree, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_tree.iloc[:,i]))

related
              precision    recall  f1-score   support

         0.0       0.43      0.57      0.49      1221
         1.0       0.85      0.76      0.81      3985

    accuracy                           0.72      5206
   macro avg       0.64      0.67      0.65      5206
weighted avg       0.75      0.72      0.73      5206

request
              precision    recall  f1-score   support

         0.0       0.92      0.86      0.89      4311
         1.0       0.50      0.66      0.57       895

    accuracy                           0.83      5206
   macro avg       0.71      0.76      0.73      5206
weighted avg       0.85      0.83      0.84      5206

offer
              precision    recall  f1-score   support

         0.0       1.00      0.90      0.95      5182
         1.0       0.01      0.29      0.03        24

    accuracy                           0.90      5206
   macro avg       0.51      0.60      0.49      5206
weighted avg       0.99      0.90      0.94      520

In [23]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_tree.iloc[:,i]));

In [24]:
tree_report = df_from_sklearn_cl_reports(classification_reports)

In [42]:
decision_tree_precisions = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False)

decision_tree_precisions.loc[decision_tree_precisions < 0.5]

communicate
medical_help              0.47
military                  0.46
direct_report             0.45
buildings                 0.39
electricity               0.37
other_aid                 0.33
medical_products          0.32
other_weather             0.30
transport                 0.29
money                     0.28
cold                      0.27
clothing                  0.22
infrastructure_related    0.18
other_infrastructure      0.16
refugees                  0.14
aid_centers               0.09
fire                      0.09
search_and_rescue         0.08
hospitals                 0.08
missing_people            0.07
security                  0.02
shops                     0.01
offer                     0.01
tools                     0.01
Name: precisions, dtype: float64

In [43]:
decision_tree_recalls = tree_report.loc[tree_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False)

decision_tree_recalls.loc[decision_tree_recalls < 0.5]

communicate
security                  0.48
search_and_rescue         0.46
medical_help              0.44
fire                      0.41
aid_centers               0.40
military                  0.37
hospitals                 0.37
other_weather             0.36
transport                 0.36
other_infrastructure      0.31
offer                     0.29
missing_people            0.28
infrastructure_related    0.27
shops                     0.14
tools                     0.09
Name: recalls, dtype: float64

In [63]:
series_of_categories = pd.Series(list(svc_precisions.loc[svc_precisions < 0.5].index) \
+ list(svc_recalls.loc[svc_recalls < 0.5].index) \
+ list(decision_tree_precisions.loc[decision_tree_precisions < 0.5].index) \
+ list(decision_tree_recalls.loc[decision_tree_recalls < 0.5].index))

In [69]:
list_of_categories = list(series_of_categories.drop_duplicates().values)


>Undersampling still gives poor results - escpecially for recall which is very important in disaster response system. Undersampling may not necesserilly work good in multioutput problems, because of possible noise beeing added to the data set.
Moreover, if we really want to balance all of the classes via undersampling we will end up with very small data set (some imbalanced classes are much less numerous than the better balanced ones). Hence the next iterration will be conducted with **´class weights´** adjustments and without any reduction of the data set size.

In [None]:
parameters = {
    'clf__estimator__max_depth': [2,4,5,7,9,11]    
}

In [None]:
Tree_opt = GridSearchCV(Tree, param_grid = parameters)

In [None]:
#Attention! This cell may take a bit time to perform (around 15 minutes on typical PC)
Tree_opt.fit(X_train, y_train)

In [None]:
y_pred_tree_opt = Tree_opt.predict(X_test)
y_pred_tree_opt = pd.DataFrame(y_pred_tree_opt, columns = list(Y.columns))
y_test = y_test.reset_index(drop = True)
for i, var in enumerate(Y):
    print(var)
    print(classification_report(y_test.iloc[:,i], y_pred_tree_opt.iloc[:,i]))

In [None]:
classification_reports.clear()

In [None]:
for i, var in enumerate(Y):
    classification_reports[var] = (classification_report(y_test.iloc[:,i], y_pred_tree_opt.iloc[:,i]));

In [None]:
tree_opt_df_report = df_from_sklearn_cl_reports(classification_reports)

In [None]:
tree_opt_df_report.loc[
    tree_opt_df_report.label == 1].groupby(
    ["communicate"])["precisions"].mean().sort_values(ascending = False).mean()

In [None]:
tree_opt_df_report.loc[
    tree_opt_df_report.label == 1].groupby(
    ["communicate"])["recalls"].mean().sort_values(ascending = False).mean()

### Linear SVC