In [34]:
import os
from typing import List

import pandas as pd
import spacy
from scipy.stats import loguniform
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

from preprocessing.InputDataset import FramingArticleDataset
from preprocessing.BOWPipeline import BOWPipeline, basic_tokenizing_and_cleaning

DATA_DIR = os.path.join('..', 'data')

## Load the Data

In [2]:
en_train = FramingArticleDataset(data_dir=DATA_DIR, language='en', subtask=2, split='train',
                                 load_preprocessed_units_of_analysis=True, units_of_analysis_dir= os.path.join(DATA_DIR, 'preprocessed'))

433it [00:00, 20058.24it/s]


## Binarize the labels

In [3]:
labels = ('fairness_and_equality', 'security_and_defense', 'crime_and_punishment', 'morality', 'policy_prescription_and_evaluation',
          'capacity_and_resources', 'economic', 'cultural_identity', 'health_and_safety', 'quality_of_life', 'legality_constitutionality_and_jurisprudence',
          'political', 'public_opinion', 'external_regulation_and_reputation')

In [4]:
mlb = MultiLabelBinarizer()
mlb.fit([labels])

In [5]:
mlb.classes_

array(['capacity_and_resources', 'crime_and_punishment',
       'cultural_identity', 'economic',
       'external_regulation_and_reputation', 'fairness_and_equality',
       'health_and_safety',
       'legality_constitutionality_and_jurisprudence', 'morality',
       'policy_prescription_and_evaluation', 'political',
       'public_opinion', 'quality_of_life', 'security_and_defense'],
      dtype=object)

In [6]:
y_train = mlb.transform(en_train.df.frames.str.lower().str.split(','))

## Vectorize the documents

In [7]:
en_nlp = spacy.load('en_core_web_sm')

In [8]:
vectorizing_pipeline = BOWPipeline(
        tokenizer=lambda string: basic_tokenizing_and_cleaning(string, spacy_nlp_model=en_nlp),
        use_tfidf=True,
        min_df=0.05,
        max_df=0.95,
        ngram_range=(1, 1),
        max_features=1000
    )

vectorizing_pipeline.add_low_var_threshold(min_var=1e-3)
vectorizing_pipeline.add_corr_filter(corr_threshold=0.9)

In [9]:
X_train = vectorizing_pipeline.pipeline.fit_transform(en_train.df.title_and_first_paragraph)

## Test training multi-output classifier

In [10]:
clfs = {'svm': SVC(), 'linear_svm': LinearSVC(), 'log_reg': LogisticRegression(), 'ridge': RidgeClassifier(),
        'rf': RandomForestClassifier, 'xgboost': XGBClassifier(), 'xgboostRF': XGBRFClassifier()}

### Independent multilabel

#### Nested cross validation

In [105]:
# Params of fitting
param_combinations = 10
search_loop_cv = 2
scoring_function = 'f1_micro', 'f1_macro', 'accuracy', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro'
model_selection_loop_cv = 10

In [106]:
# Wrap as a multilabel classifier
mlo_clf = MultiOutputClassifier(clfs['svm'], n_jobs=2)

# Define search space of the model
params = {
    'estimator__C': loguniform(1e-2, 1e3),
    'estimator__gamma': loguniform(1e-4, 1e-1)
}

# Define inner loop search
search_routine = RandomizedSearchCV(
    estimator=mlo_clf,
    param_distributions=params,
    n_iter=param_combinations,
    cv=search_loop_cv,
    scoring=scoring_function,
    refit='f1_micro'
)

# Define Outer-loop routine and execute it
model_performance = cross_validate(
    search_routine,
    X_train,
    y_train,
    scoring=scoring_function,
    cv=model_selection_loop_cv,
    return_estimator=True
)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [107]:
model_performance

{'fit_time': array([0.98427534, 0.96864033, 1.04488873, 1.00687385, 1.05006814,
        1.04519653, 1.06866026, 1.06120825, 1.00300288, 1.01657963]),
 'score_time': array([0.02020073, 0.01956248, 0.01957321, 0.01884341, 0.02410173,
        0.01908326, 0.02055025, 0.02115512, 0.02114248, 0.01933861]),
 'estimator': [RandomizedSearchCV(cv=2,
                     estimator=MultiOutputClassifier(estimator=SVC(), n_jobs=2),
                     param_distributions={'estimator__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f241c19da00>,
                                          'estimator__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f241e2d2bb0>},
                     refit='f1_micro',
                     scoring=('f1_micro', 'f1_macro', 'accuracy',
                              'precision_micro', 'precision_macro',
                              'recall_micro', 'recall_macro')),
  RandomizedSearchCV(cv=2,
                     estim

In [75]:
model_performance['test_f1_micro'].mean()

0.5376282602091063

In [76]:
model_performance['test_f1_micro'].std()

0.030491075625106688

In [110]:
pd.DataFrame(model_performance['estimator'][0].cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__C,param_estimator__gamma,params,split0_test_f1_micro,split1_test_f1_micro,mean_test_f1_micro,...,split0_test_recall_micro,split1_test_recall_micro,mean_test_recall_micro,std_test_recall_micro,rank_test_recall_micro,split0_test_recall_macro,split1_test_recall_macro,mean_test_recall_macro,std_test_recall_macro,rank_test_recall_macro
0,0.024796,0.000499,0.026907,0.001548,0.226226,0.041095,"{'estimator__C': 0.22622573283031963, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
1,0.022196,0.00096,0.023957,0.001274,730.080254,0.000381,"{'estimator__C': 730.0802536296363, 'estimator...",0.441968,0.521589,0.481779,...,0.321188,0.45142,0.386304,0.065116,1,0.183038,0.252862,0.21795,0.034912,1
2,0.019843,0.000333,0.024559,0.001668,0.225737,0.006832,"{'estimator__C': 0.22573658804182628, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
3,0.023337,0.000921,0.029217,0.002111,8.631638,0.022478,"{'estimator__C': 8.631637502243715, 'estimator...",0.421965,0.512642,0.467303,...,0.295547,0.439462,0.367504,0.071958,2,0.167477,0.240315,0.203896,0.036419,2
4,0.022016,6.4e-05,0.027822,0.002454,2.995993,0.026783,"{'estimator__C': 2.995992829648104, 'estimator...",0.368526,0.508099,0.438312,...,0.249663,0.445441,0.347552,0.097889,3,0.137677,0.241263,0.18947,0.051793,3
5,0.019391,0.000604,0.023609,0.000736,0.216176,0.003754,"{'estimator__C': 0.2161763105544836, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
6,0.017673,0.000115,0.022433,0.000376,0.308084,0.000688,"{'estimator__C': 0.3080838358481395, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
7,0.020107,0.001608,0.025043,0.001235,0.074103,0.000896,"{'estimator__C': 0.07410334600553543, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
8,0.017614,0.000649,0.024648,0.000934,0.010271,0.007508,"{'estimator__C': 0.010270589297712174, 'estima...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
9,0.018714,0.001204,0.025016,0.001513,1.56193,0.000564,"{'estimator__C': 1.5619304706549475, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4


In [109]:
for estimator in model_performance['estimator']:
    print(estimator.best_params_)

{'estimator__C': 730.0802536296363, 'estimator__gamma': 0.00038140351982047637}
{'estimator__C': 10.73383078686751, 'estimator__gamma': 0.043109846869528234}
{'estimator__C': 481.0264305619139, 'estimator__gamma': 0.05647762841969143}
{'estimator__C': 821.2257586107107, 'estimator__gamma': 0.00846631455555496}
{'estimator__C': 323.8403487917489, 'estimator__gamma': 0.002624289875121625}
{'estimator__C': 88.04901633915712, 'estimator__gamma': 0.010059309419473828}
{'estimator__C': 146.55709189215872, 'estimator__gamma': 0.003552611559285617}
{'estimator__C': 11.528593234385763, 'estimator__gamma': 0.0740845762543288}
{'estimator__C': 92.68982904403184, 'estimator__gamma': 0.012639295954616034}
{'estimator__C': 654.6650492886708, 'estimator__gamma': 0.006010025633541435}


##### We are going to track performance for

 * Average performance across outer folds
 * Perfomance on holdout of best hyper-params of each outer fold
 * Average performance across inner folds of each hyper-param sample in each outer fold

In [141]:
import mlflow
from mlflow import log_metric, log_param

# Average performance across outer folds
with mlflow.start_run() as run:
    log_param('model_type', 'SVM')
    log_param('multilabel_type', 'independent')
    log_param('analysis_level', 'model_wide')
    for metric in [key for key in model_performance.keys() if 'test_' in key]:
        log_metric(f'{metric.split("test_")[1]}_mean', model_performance[metric].mean())
        log_metric(f'{metric.split("test_")[1]}_std', model_performance[metric].std())
        print(f'{metric.split("test_")[1]}_mean', model_performance[metric].mean())
        print(f'{metric.split("test_")[1]}_std', model_performance[metric].std())
mlflow.end_run()

f1_micro_mean 0.5357396628265825
f1_micro_std 0.040584181119694314
f1_macro_mean 0.3160875053852997
f1_macro_std 0.025650922175171075
accuracy_mean 0.0394291754756871
accuracy_std 0.020897373142693826
precision_micro_mean 0.6477434048874537
precision_micro_std 0.09006295631796474
precision_macro_mean 0.40472463068025444
precision_macro_std 0.06497796647781344
recall_micro_mean 0.46839462669974957
recall_micro_std 0.06826704915116316
recall_macro_mean 0.29547230499844773
recall_macro_std 0.028297922313826863


In [155]:
# Hyperparam choice performance measured across outer folds
for fold_i in range(len(model_performance['estimator'])):
    with mlflow.start_run() as run:
        log_param('model_type', 'SVM')
        log_param('multilabel_type', 'independent')
        log_param('analysis_level', 'outer_cv')

        # Log the best hyper-paras found for fold_i values
        for key, value in model_performance['estimator'][fold_i].best_params_.items():
            log_param(key.split('__')[-1], value)

        # Log the metrics
        for metric in [key for key in model_performance.keys() if 'test_' in key]:
            log_metric(metric.split('test_')[1], model_performance[metric][fold_i])

    mlflow.end_run()


In [181]:
# Average performance across inner folds of the different hyper-param samples
for hyperparam_search in model_performance['estimator']:
        results_df = pd.DataFrame(hyperparam_search.cv_results_)
        metrics_mean_cols = [col for col in results_df.columns if 'mean_test' in col]
        metrics_std_cols = [col for col in results_df.columns if 'std_test' in col]

        for sample_i, hyperparam_sample_results in results_df.iterrows():
            with mlflow.start_run() as run:
                log_param('model_type', 'SVM')
                log_param('multilabel_type', 'independent')
                log_param('analysis_level', 'inner_cv')

                # Log the hyperparameters
                for param, param_value in hyperparam_sample_results.params.items():
                    log_param(param.split('__')[-1], param_value)

                # Log the metrics
                for mean_metric, std_metric in zip(metrics_mean_cols, metrics_std_cols):
                    log_metric(mean_metric, hyperparam_sample_results[mean_metric])
                    log_metric(std_metric, hyperparam_sample_results[std_metric])

            mlflow.end_run()

C 0.22622573283031963
gamma 0.041094912743885305
mean_test_f1_micro 0.4025615475460471
std_test_f1_micro 0.042951272966773424
mean_test_f1_macro 0.11818996594717426
std_test_f1_macro 0.02007753638745465
mean_test_accuracy 0.0025906735751295338
std_test_accuracy 0.0025906735751295338
mean_test_precision_micro 0.501667022773
std_test_precision_micro 0.02152885351565978
mean_test_precision_macro 0.08881450929819071
std_test_precision_macro 0.014072241256953617
mean_test_recall_micro 0.34474985324643104
std_test_recall_micro 0.07079573718705182
mean_test_recall_macro 0.17857142857142855
std_test_recall_macro 0.03571428571428571



C 730.0802536296363
gamma 0.00038140351982047637
mean_test_f1_micro 0.4817786886428912
std_test_f1_micro 0.039810257816521616
mean_test_f1_macro 0.23096107291008283
std_test_f1_macro 0.013699488563011258
mean_test_accuracy 0.02843063938892153
std_test_accuracy 0.0026574435126328726
mean_test_precision_micro 0.6629601226993865
std_test_precision_micro 0.0453732106

In [157]:
model_performance['estimator'][0].cv_results_

{'mean_fit_time': array([0.02479637, 0.02219582, 0.01984251, 0.02333748, 0.02201629,
        0.01939094, 0.01767337, 0.02010703, 0.01761436, 0.01871419]),
 'std_fit_time': array([4.98652458e-04, 9.60111618e-04, 3.32713127e-04, 9.21368599e-04,
        6.36577606e-05, 6.03795052e-04, 1.15036964e-04, 1.60789490e-03,
        6.49452209e-04, 1.20425224e-03]),
 'mean_score_time': array([0.02690673, 0.02395713, 0.02455914, 0.02921712, 0.02782238,
        0.02360904, 0.02243257, 0.02504301, 0.02464771, 0.02501631]),
 'std_score_time': array([0.00154757, 0.00127351, 0.00166762, 0.00211132, 0.00245416,
        0.00073588, 0.00037646, 0.00123453, 0.00093412, 0.00151277]),
 'param_estimator__C': masked_array(data=[0.22622573283031963, 730.0802536296363,
                    0.22573658804182628, 8.631637502243715,
                    2.995992829648104, 0.2161763105544836,
                    0.3080838358481395, 0.07410334600553543,
                    0.010270589297712174, 1.5619304706549475],
     

In [176]:
a = pd.DataFrame(model_performance['estimator'][0].cv_results_)
a

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__C,param_estimator__gamma,params,split0_test_f1_micro,split1_test_f1_micro,mean_test_f1_micro,...,split0_test_recall_micro,split1_test_recall_micro,mean_test_recall_micro,std_test_recall_micro,rank_test_recall_micro,split0_test_recall_macro,split1_test_recall_macro,mean_test_recall_macro,std_test_recall_macro,rank_test_recall_macro
0,0.024796,0.000499,0.026907,0.001548,0.226226,0.041095,"{'estimator__C': 0.22622573283031963, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
1,0.022196,0.00096,0.023957,0.001274,730.080254,0.000381,"{'estimator__C': 730.0802536296363, 'estimator...",0.441968,0.521589,0.481779,...,0.321188,0.45142,0.386304,0.065116,1,0.183038,0.252862,0.21795,0.034912,1
2,0.019843,0.000333,0.024559,0.001668,0.225737,0.006832,"{'estimator__C': 0.22573658804182628, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
3,0.023337,0.000921,0.029217,0.002111,8.631638,0.022478,"{'estimator__C': 8.631637502243715, 'estimator...",0.421965,0.512642,0.467303,...,0.295547,0.439462,0.367504,0.071958,2,0.167477,0.240315,0.203896,0.036419,2
4,0.022016,6.4e-05,0.027822,0.002454,2.995993,0.026783,"{'estimator__C': 2.995992829648104, 'estimator...",0.368526,0.508099,0.438312,...,0.249663,0.445441,0.347552,0.097889,3,0.137677,0.241263,0.18947,0.051793,3
5,0.019391,0.000604,0.023609,0.000736,0.216176,0.003754,"{'estimator__C': 0.2161763105544836, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
6,0.017673,0.000115,0.022433,0.000376,0.308084,0.000688,"{'estimator__C': 0.3080838358481395, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
7,0.020107,0.001608,0.025043,0.001235,0.074103,0.000896,"{'estimator__C': 0.07410334600553543, 'estimat...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
8,0.017614,0.000649,0.024648,0.000934,0.010271,0.007508,"{'estimator__C': 0.010270589297712174, 'estima...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4
9,0.018714,0.001204,0.025016,0.001513,1.56193,0.000564,"{'estimator__C': 1.5619304706549475, 'estimato...",0.35961,0.445513,0.402562,...,0.273954,0.415546,0.34475,0.070796,4,0.142857,0.214286,0.178571,0.035714,4


In [168]:
a.iloc[0].params

{'estimator__C': 0.22622573283031963, 'estimator__gamma': 0.041094912743885305}

In [179]:
metrics_mean_cols = [col for col in results_df.columns if 'mean' in col]
metrics_std_cols = [col for col in results_df.columns if 'std' in col]

In [180]:
for mean_metric, std_metric in zip(metrics_mean_cols, metrics_std_cols):
    print(mean_metric, std_metric)

mean_fit_time std_fit_time
mean_score_time std_score_time
mean_test_f1_micro std_test_f1_micro
mean_test_f1_macro std_test_f1_macro
mean_test_accuracy std_test_accuracy
mean_test_precision_micro std_test_precision_micro
mean_test_precision_macro std_test_precision_macro
mean_test_recall_micro std_test_recall_micro
mean_test_recall_macro std_test_recall_macro



### Classifier Chain

### Classifiers that supports multilabel

In [None]:
clfs = {'ridge': RidgeClassifier(), 'rf': RandomForestClassifier()}

In [None]:
clf = RidgeClassifier()

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.predict(X_train)