In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
import dask_ml.model_selection as dcv
from dask.diagnostics import ProgressBar
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score, hamming_loss, make_scorer, accuracy_score

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split


In [2]:
# Directory and file name of the processed parquet input.
DATA_DIR = "../../data/processed/"
INPUT_FILE_NAME = 'final_squash15_with_pos_ner.parquet'

# DATA_DIR already ends with a slash, so the two pieces join directly.
df = pd.read_parquet(f"{DATA_DIR}{INPUT_FILE_NAME}")
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,sim_tags,squash15_tags,pos_sequence,ner_sequence
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...,"cars,solar system,energy,culture,politics,scie...","culture,politics,science,global issues,technology",NOUN PUNCT NOUN PUNCT NOUN PUNCT ADJ NOUN PUNC...,
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...,"macarthur grant,simplicity,design,solar system...","design,global issues",NOUN PUNCT ADJ NOUN,
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...,"corruption,inequality,science,investment,war,c...","science,culture,politics,global issues,business",NOUN PUNCT NOUN PUNCT NOUN PUNCT ADJ NOUN PUNC...,
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...,"flight,design,nasa,science,invention,entrepren...","design,science,business",NOUN PUNCT NOUN PUNCT NOUN,
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"cars,design,transportation,invention,technolog...","design,technology,business,science",NOUN PUNCT NOUN PUNCT NOUN PUNCT NOUN,


In [3]:
# Discard rows with no squash15 tag string, then renumber the index from zero.
df = df.dropna(subset=['squash15_tags']).reset_index(drop=True)

# Summarise only the first ten columns.
df.iloc[:, :10].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2313 entries, 0 to 2312
Data columns (total 10 columns):
speaker                    2313 non-null object
headline                   2313 non-null object
description                2313 non-null object
duration                   2313 non-null object
tags                       2313 non-null object
transcript                 2313 non-null object
WC                         2313 non-null float64
clean_transcript           2313 non-null object
clean_transcript_string    2313 non-null object
sim_tags                   2313 non-null object
dtypes: float64(1), object(9)
memory usage: 180.8+ KB


In [4]:
# Model input: the cleaned transcript text, one string per row.
X = df.loc[:, 'clean_transcript_string']
# Targets are kept as a one-column DataFrame of comma-separated tag strings.
labels = df.loc[:, ['squash15_tags']]

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

# Every row holds its tags as one comma-separated string; split it into a set per row.
y = [set(tag_string.split(',')) for tag_string in labels['squash15_tags']]

# Fit the binarizer on the tag sets and get the indicator matrix back.
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [6]:
# Sanity check: first encoded row, its width, and the first ten rows decoded back to tags.
print(
    encoded_y[0],
    len(encoded_y[0]),
    mlb.inverse_transform(encoded_y)[:10],
    sep='\n',
)

[0 0 0 0 1 0 0 0 1 0 0 0 1 1 1]
15
[('culture', 'global issues', 'politics', 'science', 'technology'), ('design', 'global issues'), ('business', 'culture', 'global issues', 'politics', 'science'), ('business', 'design', 'science'), ('business', 'design', 'science', 'technology'), ('biodiversity', 'biomechanics', 'science', 'technology'), ('design', 'entertainment', 'media', 'science', 'technology'), ('culture', 'design'), ('business', 'culture', 'design', 'global issues', 'media', 'science', 'technology'), ('culture', 'global issues', 'science')]


In [7]:
from skmultilearn.model_selection import iterative_train_test_split

# The transcripts are handed to the splitter as an (n, 1) column array.
X_train, y_train, X_test, y_test = iterative_train_test_split(
    X.values.reshape(-1, 1), encoded_y, test_size=0.2
)

# Turn each (n, 1) array back into a 1-D Series of text (column 0).
X_train, X_test = (pd.DataFrame(split)[0] for split in (X_train, X_test))

In [8]:
# Positive count per label column: training split first, test split second.
print(y_train.sum(axis=0), y_test.sum(axis=0), sep='\n')

[ 174  176  279  142  915  382  227  177  543  297  162  231  159 1174
  622]
[ 44  44  70  43 240  95  58  41 136  88  55  48  40 293 165]


## Gridsearch for the best single model for all labels

### References 
http://scikit.ml/api/skmultilearn.problem_transform.br.html

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

http://scikit.ml/stratification.html

https://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier/12637528#12637528

### Binary Relevance

In [12]:
# TODO:
# 1. Is TfidfTransformer(use_idf=False) equivalent to a plain CountVectorizer, or are
#    there other settings that would also need to be switched off?
# 2. Get a scoring function to work (hamming?) -- partly done
# 3. Balanced class labels
# 4. Set better param ranges
# 5. Drop the vectorizer step once the better option is known, then feed a sparse CSR
#    matrix directly and hopefully train faster

# Value ranges shared by the grids below.
# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = list(range(1, 6))
param_range_lr = [1.0, 0.5, 0.1]

# Feature-extraction options; un-comment an entry to include it in the search.
vectorizer_params = dict(
    # vectorizer__min_df=np.linspace(0.005, 0.05, 5),
    # vectorizer__ngram_range=[(1, 1), (1, 2)],  # bigrams blow up memory usage
    # tfidf__norm=('l1', 'l2'),
    tfidf__use_idf=[True, False],
)

# One grid per base classifier. The `clf__classifier` prefix targets the estimator
# wrapped by the BinaryRelevance step named 'clf'.
lr_params = dict(
    clf__classifier=[LogisticRegression()],
    clf__classifier__penalty=['l1', 'l2'],
    clf__classifier__C=[0.01, 0.1, 1, 10, 100],
    clf__classifier__solver=['liblinear'],
    clf__classifier__class_weight=['balanced'],
)

svc_params = dict(
    clf__classifier=[SVC()],
    clf__classifier__kernel=['linear', 'rbf'],
    clf__classifier__C=param_range,  # np.logspace(-1, 2, 10),
    clf__classifier__gamma=['auto'],  # np.logspace(-1, 1, 10)
    clf__classifier__probability=[True],
    clf__classifier__class_weight=['balanced'],
)

rf_params = dict(
    clf__classifier=[RandomForestClassifier()],
    clf__classifier__criterion=['gini', 'entropy'],
    clf__classifier__min_samples_leaf=param_range,
    clf__classifier__max_depth=param_range,
    clf__classifier__min_samples_split=param_range[1:],
    clf__classifier__n_estimators=[10],
    clf__classifier__class_weight=['balanced'],
)

mnb_params = dict(
    clf__classifier=[MultinomialNB()],
    clf__classifier__alpha=[0.7, 1.0, 1.5],
)

# Combine the vectorizer options with each enabled classifier grid.
# Comment a classifier in or out to change what gets searched.
parameters = [
    dict(vectorizer_params, **clf_params)
    for clf_params in (
        lr_params,
        # svc_params,
        # rf_params,
        mnb_params,
    )
]

br_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BinaryRelevance()),
])

# Grid-search settings. Scorers tried so far:
#   make_scorer(f1_score, average='micro')   ('f1_micro' / 'f1_macro' also possible)
#   'f1_micro'
#   make_scorer(hamming_loss)
#   'neg_log_loss'
scoring = 'f1_samples'
folds = 4
njobs = -1

br_model = dcv.GridSearchCV(
    br_pipeline,
    parameters,
    scoring=scoring,
    cv=folds,
    n_jobs=njobs,
)

In [13]:
%%time

with ProgressBar():
    br_model.fit(X_train, y_train)
print(br_model.best_params_, br_model.best_score_)
pd.DataFrame(br_model.cv_results_)
filename = f'best_br_model.joblib'
joblib.dump(br_model, filename)

[####                                    ] | 10% Completed | 23.7s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 24.6s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 25.3s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 26.2s

  'precision', 'predicted', average, warn_for)


[########                                ] | 22% Completed |  1min 53.1s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 27% Completed |  2min 24.5s

  'precision', 'predicted', average, warn_for)


[###########                             ] | 29% Completed |  2min 27.5s

  'precision', 'predicted', average, warn_for)


[############                            ] | 31% Completed |  2min 32.4s

  'precision', 'predicted', average, warn_for)


[############                            ] | 31% Completed |  2min 32.6s

  'precision', 'predicted', average, warn_for)


[############                            ] | 32% Completed |  2min 34.5s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  2min 35.0s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 34% Completed |  2min 37.6s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##############                          ] | 36% Completed |  2min 39.8s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 36% Completed |  2min 41.2s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  2min 43.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  2min 45.3s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  2min 46.2s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  2min 47.8s

  'precision', 'predicted', average, warn_for)


[################                        ] | 40% Completed |  2min 48.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  2min 49.6s

  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  2min 52.1s

  'precision', 'predicted', average, warn_for)


[##################                      ] | 46% Completed |  3min 21.8s

  'precision', 'predicted', average, warn_for)


[###################                     ] | 48% Completed |  3min 25.9s

  'precision', 'predicted', average, warn_for)


[###################                     ] | 48% Completed |  3min 28.5s

  'precision', 'predicted', average, warn_for)


[#####################                   ] | 52% Completed |  3min 34.5s

  'precision', 'predicted', average, warn_for)


[#####################                   ] | 53% Completed |  3min 36.6s

  'precision', 'predicted', average, warn_for)


[######################                  ] | 55% Completed |  3min 39.5s

  'precision', 'predicted', average, warn_for)


[######################                  ] | 55% Completed |  3min 40.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################                  ] | 57% Completed |  3min 43.1s

  'precision', 'predicted', average, warn_for)


[#######################                 ] | 58% Completed |  3min 46.4s

  'precision', 'predicted', average, warn_for)


[#######################                 ] | 58% Completed |  3min 47.1s

  'precision', 'predicted', average, warn_for)


[#######################                 ] | 59% Completed |  3min 49.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[########################                ] | 60% Completed |  3min 49.7s

  'precision', 'predicted', average, warn_for)


[########################                ] | 61% Completed |  3min 50.2s

  'precision', 'predicted', average, warn_for)


[########################                ] | 61% Completed |  3min 51.6s

  'precision', 'predicted', average, warn_for)


[##########################              ] | 66% Completed |  4min 22.3s

  'precision', 'predicted', average, warn_for)


[##########################              ] | 66% Completed |  4min 24.8s

  'precision', 'predicted', average, warn_for)


[##########################              ] | 67% Completed |  4min 26.1s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 68% Completed |  4min 29.2s

  'precision', 'predicted', average, warn_for)


[############################            ] | 70% Completed |  4min 32.7s

  'precision', 'predicted', average, warn_for)


[############################            ] | 72% Completed |  4min 35.3s

  'precision', 'predicted', average, warn_for)


[#############################           ] | 73% Completed |  4min 37.7s

  'precision', 'predicted', average, warn_for)


[#############################           ] | 74% Completed |  4min 40.6s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 75% Completed |  4min 42.1s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 75% Completed |  4min 42.9s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  4min 44.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###############################         ] | 77% Completed |  4min 46.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###############################         ] | 79% Completed |  4min 51.8s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 52.1s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 52.3s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 55.4s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 56.8s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 59.0s

  'precision', 'predicted', average, warn_for)


[##################################      ] | 85% Completed |  5min 21.1s

  'precision', 'predicted', average, warn_for)


[##################################      ] | 86% Completed |  5min 23.7s

  'precision', 'predicted', average, warn_for)


[##################################      ] | 86% Completed |  5min 24.4s

  'precision', 'predicted', average, warn_for)


[##################################      ] | 86% Completed |  5min 25.4s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 87% Completed |  5min 27.3s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 88% Completed |  5min 29.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  5min 32.1s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  5min 33.2s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  5min 33.8s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  5min 41.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  6min  1.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  6min  3.9s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  6min  4.9s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  6min  5.7s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  6min  6.9s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 99% Completed |  6min  7.4s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  6min 12.9s
[########################################] | 100% Completed | 12.1s
{'clf__classifier': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__classifier__C': 1, 'clf__classifier__class_weight': 'balanced', 'clf__classifier__penalty': 'l2', 'clf__classifier__solver': 'liblinear', 'tfidf__use_idf': True} 0.5404493684821554
Wall time: 6min 26s


['best_br_model.joblib']

In [14]:
# Per-label probabilities from the refit best Binary Relevance pipeline.
y_pred_prob = br_model.predict_proba(X_test)

# A label counts as predicted when its probability reaches the threshold.
t = 0.5  # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

score = f1_score(y_true=y_test, y_pred=y_pred_new, average="micro")
print(f"Binary relevance best model's f1-score {score}")

Binary relevance best model's f1-score 0.5979784805999347


'public dewey long ago observe constitute discussion debate tyranny assumption question avoid doxa realm unquestioned will subject assumption debate discussion spirit join discussion critical issue time mobilize different form capital project state build assumption clearly capitalism year acceptable democracy look world look map capitalist economy democratic polity rare exception norm question form capitalism type democratic participation acknowledge moment bring rare consensus assumption provide grind type action consensus moment allow act necessary matter fragile provisional consensus able forward majority world benefit capitalism democratic system globe experience state repressive organization concern denial right denial justice provision term experience capitalism aspect rest globe experience extractive industry blood diamond smuggle emerald timber cut right poor second technical assistance technical assistance shock bad form today ugly face develop world develop country ten billio

In [22]:
# Predict the test sample labelled 0 and decode its indicator row back to tag names.
mlb.inverse_transform(
    br_model.predict(X_test.loc[[0]])
)

[('business', 'culture', 'global issues', 'politics')]

### OneVsRest

In [None]:
# TODO:
# 1. Is TfidfTransformer(use_idf=False) equivalent to a plain CountVectorizer, or are
#    there other settings that would also need to be switched off?
# 2. Use a proper scoring function - ideally, predicting relevant labels should matter
#    more than predicting irrelevant ones
# 3. Balanced class labels
# 4. Set better param ranges

# Value ranges shared by the grids below.
# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = [1, 2, 3, 4, 5]
param_range_lr = [1.0, 0.5, 0.1]

# Feature-extraction options; un-comment an entry to include it in the search.
vectorizer_params = {
#     'vectorizer__min_df': np.linspace(0.005, 0.05, 5),
#     'vectorizer__ngram_range': [(1, 1), (1, 2)], # bigrams blow up memory usage
#     'tfidf__norm': ('l1', 'l2'),
    'tfidf__use_idf': [True, False],
}

# One grid per base classifier. The `clf__estimator` prefix targets the estimator
# wrapped by the OneVsRestClassifier step named 'clf'.
lr_params = {
    'clf__estimator': [LogisticRegression()],
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__C': param_range_lr,
        'clf__estimator__solver': ['liblinear'],
        'clf__estimator__class_weight': ['balanced'],
}

svc_params = {
    'clf__estimator': [SVC()],
        'clf__estimator__kernel': ['linear', 'rbf'],
        'clf__estimator__C': param_range, # np.logspace(-1, 2, 10),
        'clf__estimator__gamma': ['auto'], # np.logspace(-1, 1, 10)
        'clf__estimator__probability': [True],
        'clf__estimator__class_weight': ['balanced'],
}

rf_params = {
    'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__criterion': ['gini', 'entropy'],
        'clf__estimator__min_samples_leaf': param_range,
        'clf__estimator__max_depth': param_range,
        'clf__estimator__min_samples_split': param_range[1:],
        'clf__estimator__n_estimators': [10],
        'clf__estimator__class_weight': ['balanced'],
}

mnb_params = {
    'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': [0.7, 1.0],
}

# Combine the vectorizer options with each enabled classifier grid.
parameters = [
    {**vectorizer_params, **lr_params},
#     {**vectorizer_params, **svc_params},
#     {**vectorizer_params, **rf_params},
    {**vectorizer_params, **mnb_params}
]

ovr_pipeline = Pipeline([('vectorizer', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', OneVsRestClassifier(LogisticRegression())),
                        ]
                       )

# Grid-search settings
# scoring = make_scorer(f1_score, average='micro') # possible scorings 'f1_micro' 'f1_macro'
scoring = 'f1_micro'
# scoring = make_scorer(hamming_loss) # hamming gives equal weighting to both relevant and irrelevant?
# maybe use precision somewhere
folds = 3
njobs = -1

# sklearn's GridSearchCV is not imported in this notebook (the import is commented
# out), so a bare `GridSearchCV` raises NameError. Use the dask-ml implementation,
# as the other grid searches in this notebook do.
ovr_model = dcv.GridSearchCV(ovr_pipeline, parameters, scoring=scoring, cv=folds, n_jobs=njobs)

In [None]:
%%time
ovr_model.fit(X_train,y_train)
print(ovr_model.best_params_, ovr_model.best_score_)
pd.DataFrame(ovr_model.cv_results_)

In [None]:
# Per-label probabilities from the refit best One-vs-Rest pipeline.
y_pred_prob = ovr_model.predict_proba(X_test)

# A label counts as predicted when its probability reaches the threshold.
t = 0.1  # threshold value
y_pred_new = np.where(y_pred_prob >= t, 1, 0)

score = f1_score(y_true=y_test, y_pred=y_pred_new, average="micro")
print(f"One vs Rest best model's f1-score {score}")

## Gridsearch best model for each tag

https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search



In [9]:
# Value ranges shared by the grids below.
# param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range = list(range(1, 6))
param_range_lr = [1.0, 0.5, 0.1]

# Feature-extraction options; un-comment an entry to include it in the search.
vectorizer_params = dict(
    # vectorizer__min_df=np.linspace(0.005, 0.05, 5),
    # vectorizer__ngram_range=[(1, 1), (1, 2)],  # bigrams blow up memory usage
    # tfidf__norm=('l1', 'l2'),
    tfidf__use_idf=[True, False],
)

# Binary-classifier grids. To try another model, define its grid here and add it to
# the tuple that builds `parameters` below.
lr_params = dict(
    clf=[LogisticRegression()],
    clf__penalty=['l1', 'l2'],
    clf__C=[0.01, 0.1, 1, 10],
    clf__solver=['liblinear'],
    clf__class_weight=['balanced'],
)

svc_params = dict(
    clf=[SVC()],
    clf__kernel=['linear', 'rbf'],
    clf__C=param_range,  # np.logspace(-1, 2, 10),
    clf__gamma=['auto'],  # np.logspace(-1, 1, 10)
    clf__probability=[True],
    clf__class_weight=['balanced'],
)

rf_params = dict(
    clf=[RandomForestClassifier()],
    clf__criterion=['gini', 'entropy'],
    clf__min_samples_leaf=[2, 4, 8],
    # clf__max_depth=np.linspace(10, 50, num = 5),
    clf__min_samples_split=[2, 5, 10],
    clf__n_estimators=[60, 100, 200],
    clf__class_weight=['balanced'],
)

mnb_params = dict(
    clf=[MultinomialNB()],
    clf__alpha=[0.7, 1.0],
)

# Combine the vectorizer options with each enabled classifier grid.
parameters = [
    dict(vectorizer_params, **clf_params)
    for clf_params in (
        lr_params,
        # svc_params,
        rf_params,
        # mnb_params,
    )
]

# The 'clf' step is a placeholder: every grid above replaces it through its 'clf' key.
per_tag_pipe = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ],
    verbose=True,
)

# Scorer for the per-tag search. Alternatives tried:
#   make_scorer(hamming_loss), 'f1_micro', 'balanced_accuracy', 'precision'
scoring = 'f1'
folds = 4
njobs = -1

per_tag_model = dcv.GridSearchCV(
    per_tag_pipe,
    parameters,
    scoring=scoring,
    cv=folds,
    n_jobs=njobs,
)

In [10]:
# Tag names in the column order of the indicator matrix. Reading them from the fitted
# binarizer avoids hard-coding the number of labels.
tags = mlb.classes_.tolist()
print(tags)
tags.index('technology')

['biodiversity', 'biomechanics', 'business', 'communication', 'culture', 'design', 'entertainment', 'future', 'global issues', 'history', 'humanity', 'media', 'politics', 'science', 'technology']


14

In [11]:
%%time
for index in range(len(tags)):
    print(f"Processing {tags[index]}")
    with ProgressBar():
        per_tag_model.fit(X_train, y_train[:, index])
#     display(pd.DataFrame(per_tag_model.cv_results_))
    t = 0.5 #threshold value
    prediction_prob = per_tag_model.predict_proba(X_test)
    prediction = (prediction_prob[:, 1] >= t).astype(int)
    # save model or model params somewhere
    print(f'tag {index}: {tags[index]} best model {per_tag_model.best_params_}')
    print(f'tag {index}: {tags[index]} counts - predicted: {sum(prediction)}, actual: {sum(y_test[:, index])}')
    print(f'tag {index}: {tags[index]} test f1-score is {f1_score(y_test[:, index], prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} test accuracy is {accuracy_score(y_test[:, index], prediction)}')
    filename = f'best_{tags[index]}_model.joblib'
    joblib.dump(per_tag_model, filename)
    print('--------------------------')

Processing future
[#                                       ] | 3% Completed |  5.6s

  'precision', 'predicted', average, warn_for)


[#                                       ] | 4% Completed |  7.9s

  'precision', 'predicted', average, warn_for)


[##                                      ] | 5% Completed |  9.6s

  'precision', 'predicted', average, warn_for)


[##                                      ] | 5% Completed | 10.1s

  'precision', 'predicted', average, warn_for)


[##                                      ] | 6% Completed | 11.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##                                      ] | 6% Completed | 11.7s

  'precision', 'predicted', average, warn_for)


[##                                      ] | 6% Completed | 12.4s

  'precision', 'predicted', average, warn_for)


[##                                      ] | 6% Completed | 13.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##                                      ] | 7% Completed | 15.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###                                     ] | 7% Completed | 16.0s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 8% Completed | 17.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###                                     ] | 8% Completed | 18.7s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 8% Completed | 19.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###                                     ] | 9% Completed | 21.0s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 22.9s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 24.1s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 24.5s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 25.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 26.8s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 27.4s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 27.8s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 12% Completed | 30.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####                                    ] | 12% Completed | 30.7s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 12% Completed | 31.6s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 12% Completed | 32.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 33.7s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 34.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 34.6s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 36.4s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 37.8s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 38.7s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 39.4s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 40.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 42.5s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 16% Completed | 43.8s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 19% Completed | 56.3s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 27% Completed |  1min 13.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###########                             ] | 27% Completed |  1min 14.9s

  'precision', 'predicted', average, warn_for)


[############                            ] | 31% Completed |  1min 29.4s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  1min 32.4s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  1min 33.0s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  1min 33.8s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  1min 34.5s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 33% Completed |  1min 34.8s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 34% Completed |  1min 35.9s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 34% Completed |  1min 36.7s

  'precision', 'predicted', average, warn_for)


[#############                           ] | 34% Completed |  1min 37.6s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 38.3s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 38.8s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 39.7s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 40.1s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 40.5s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 36% Completed |  1min 41.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##############                          ] | 36% Completed |  1min 42.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##############                          ] | 36% Completed |  1min 43.3s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 37% Completed |  1min 44.0s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 37% Completed |  1min 44.9s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  1min 46.1s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  1min 47.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  1min 48.4s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 38% Completed |  1min 48.8s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  1min 49.9s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  1min 50.4s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  1min 51.1s

  'precision', 'predicted', average, warn_for)


[################                        ] | 40% Completed |  1min 52.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[################                        ] | 40% Completed |  1min 54.1s

  'precision', 'predicted', average, warn_for)


[################                        ] | 40% Completed |  1min 54.3s

  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  1min 55.0s

  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  1min 55.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  1min 56.4s

  'precision', 'predicted', average, warn_for)


[################                        ] | 41% Completed |  1min 57.5s

  'precision', 'predicted', average, warn_for)


[################                        ] | 42% Completed |  1min 58.4s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 43% Completed |  1min 59.7s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 43% Completed |  2min  0.1s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 43% Completed |  2min  1.0s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 43% Completed |  2min  1.2s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 43% Completed |  2min  2.4s

  'precision', 'predicted', average, warn_for)


[#################                       ] | 44% Completed |  2min  4.5s

  'precision', 'predicted', average, warn_for)


[####################                    ] | 51% Completed |  2min 35.4s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  3min 28.8s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  3min 29.5s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  3min 42.1s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 88% Completed |  4min  3.3s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min  5.5s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min  7.1s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min  8.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min  9.6s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 10.4s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 11.1s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 12.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 13.1s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  4min 14.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 15.0s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 16.2s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 16.8s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 17.1s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 18.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 20.6s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 21.9s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 23.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 23.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 24.7s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 25.3s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 25.9s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 27.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 27.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 28.5s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  4min 29.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 31.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 32.7s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 33.5s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  4min 37.2s
[                                        ] | 0% Completed |  1.1s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[                                        ] | 0% Completed |  1.2s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
[########################################] | 100% Completed |  1.3s
tag 7: future best model {'clf': LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__use_idf': False}
tag 7: future counts -

  'precision', 'predicted', average, warn_for)


[#                                       ] | 3% Completed |  5.0s

  'precision', 'predicted', average, warn_for)


[########                                ] | 20% Completed |  1min 16.3s

  'precision', 'predicted', average, warn_for)


[##################                      ] | 47% Completed |  2min 38.5s

  'precision', 'predicted', average, warn_for)


[#######################                 ] | 59% Completed |  3min 15.9s

  'precision', 'predicted', average, warn_for)


[############################            ] | 72% Completed |  3min 56.7s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  5min 10.9s
[                                        ] | 0% Completed |  1.2s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.3s
[                                        ] | 0% Completed |  1.3s[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[                                        ] | 0% Completed |  1.4s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
[########################################] | 100% Completed |  1.5s
tag 8: global issues best model {'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__s

  'precision', 'predicted', average, warn_for)


[#                                       ] | 3% Completed |  6.1s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 8% Completed | 27.4s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 8% Completed | 28.1s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 9% Completed | 29.1s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 9% Completed | 29.9s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 9% Completed | 30.1s

  'precision', 'predicted', average, warn_for)


[###                                     ] | 9% Completed | 31.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 33.2s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 34.0s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 35.0s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 36.4s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 37.6s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 38.1s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 12% Completed | 40.2s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 42.6s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 44.4s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 46.3s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 46.9s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 48.9s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 49.4s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 50.1s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 51.1s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 51.8s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 15% Completed | 52.0s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 16% Completed | 53.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######                                  ] | 16% Completed | 55.6s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 17% Completed | 56.1s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#######                                 ] | 18% Completed | 58.5s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 57.6s

  'precision', 'predicted', average, warn_for)


[###################                     ] | 47% Completed |  2min 35.6s

  'precision', 'predicted', average, warn_for)


[########################                ] | 60% Completed |  3min 11.5s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min  8.1s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 31.2s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 31.6s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 33.0s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 34.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 37.0s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 37.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 37.9s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  4min 39.4s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 40.9s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 42.3s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 45.0s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 45.4s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 47.2s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 48.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 49.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 50.0s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 51.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 51.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 52.4s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  4min 54.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  4min 56.4s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  4min 57.0s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 59.6s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  5min  2.9s
[                                        ] | 0% Completed |  1.0s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.1s
[                                        ] | 0% Completed |  1.1s[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[                                        ] | 0% Completed |  1.2s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
[########################################] | 100% Completed |  1.3s
tag 9: history best model {'clf': LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__sol

  'precision', 'predicted', average, warn_for)


[####                                    ] | 10% Completed | 38.4s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 19% Completed | 58.5s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 26% Completed |  1min 30.8s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  1min 50.6s

  'precision', 'predicted', average, warn_for)


[###############                         ] | 39% Completed |  2min  5.3s

  'precision', 'predicted', average, warn_for)


[######################                  ] | 56% Completed |  2min 43.0s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#######################                 ] | 59% Completed |  2min 57.1s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  3min 39.9s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  3min 52.7s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 88% Completed |  4min 11.7s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 13.0s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 14.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 15.9s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 16.6s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 17.1s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 17.9s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 18.4s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 19.6s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 19.9s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 20.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  4min 21.8s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 22.2s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 23.2s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 24.1s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 24.6s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 27.1s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 27.6s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 28.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 30.0s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 30.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 30.4s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 31.4s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 31.9s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 32.5s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 33.9s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 34.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  4min 35.5s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  4min 35.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  4min 37.4s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  4min 37.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 39.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 40.2s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  4min 43.1s
[                                        ] | 0% Completed |  1.1s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.2s
[                                        ] | 0% Completed |  1.2s[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[                                        ] | 0% Completed |  1.3s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.2s
[########################################] | 100% Completed |  1.4s
tag 10: humanity best model {'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solve

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[###                                     ] | 7% Completed | 24.6s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 17% Completed | 47.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#######                                 ] | 17% Completed | 47.5s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 17% Completed | 48.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#######                                 ] | 18% Completed | 50.5s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 18% Completed | 52.4s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 19% Completed | 54.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[########                                ] | 20% Completed | 57.5s

  'precision', 'predicted', average, warn_for)


[########                                ] | 20% Completed | 57.8s

  'precision', 'predicted', average, warn_for)


[########                                ] | 20% Completed |  1min  1.0s

  'precision', 'predicted', average, warn_for)


[########                                ] | 22% Completed |  1min  5.5s

  'precision', 'predicted', average, warn_for)


[#########                               ] | 22% Completed |  1min  8.9s

  'precision', 'predicted', average, warn_for)


[#########                               ] | 23% Completed |  1min  9.4s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[#########                               ] | 23% Completed |  1min 10.6s

  'precision', 'predicted', average, warn_for)


[#########                               ] | 23% Completed |  1min 11.8s

  'precision', 'predicted', average, warn_for)


[#########                               ] | 24% Completed |  1min 13.9s

  'precision', 'predicted', average, warn_for)


[#########                               ] | 24% Completed |  1min 15.0s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 25% Completed |  1min 18.4s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 27% Completed |  1min 23.4s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  2min  0.3s

  'precision', 'predicted', average, warn_for)


[####################                    ] | 51% Completed |  2min 49.4s

  'precision', 'predicted', average, warn_for)


[####################                    ] | 51% Completed |  2min 49.6s

  'precision', 'predicted', average, warn_for)


[#######################                 ] | 59% Completed |  3min 24.9s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 69% Completed |  3min 48.8s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 69% Completed |  3min 49.4s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 69% Completed |  3min 49.8s

  'precision', 'predicted', average, warn_for)


[############################            ] | 70% Completed |  3min 55.1s

  'precision', 'predicted', average, warn_for)


[############################            ] | 70% Completed |  3min 56.4s

  'precision', 'predicted', average, warn_for)


[############################            ] | 70% Completed |  3min 58.3s

  'precision', 'predicted', average, warn_for)


[############################            ] | 71% Completed |  4min  1.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[############################            ] | 71% Completed |  4min  2.7s

  'precision', 'predicted', average, warn_for)


[############################            ] | 72% Completed |  4min  4.7s

  'precision', 'predicted', average, warn_for)


[#############################           ] | 72% Completed |  4min  6.2s

  'precision', 'predicted', average, warn_for)


[#############################           ] | 73% Completed |  4min  7.5s

  'precision', 'predicted', average, warn_for)


[#############################           ] | 74% Completed |  4min 10.9s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 75% Completed |  4min 14.0s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 75% Completed |  4min 14.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[##############################          ] | 75% Completed |  4min 16.8s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  4min 18.5s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  4min 20.2s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  4min 20.6s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 76% Completed |  4min 21.6s

  'precision', 'predicted', average, warn_for)


[##############################          ] | 77% Completed |  4min 22.5s

  'precision', 'predicted', average, warn_for)


[###############################         ] | 77% Completed |  4min 24.3s

  'precision', 'predicted', average, warn_for)


[###############################         ] | 79% Completed |  4min 26.8s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 52.8s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 54.5s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 55.6s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 56.6s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 58.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 59.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  5min  1.5s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  5min  1.7s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  5min  4.4s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  5min  8.1s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  5min 14.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  5min 17.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  5min 18.2s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  5min 18.7s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  5min 20.1s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  5min 21.0s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  5min 21.8s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  5min 23.2s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  5min 23.5s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  5min 24.5s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  5min 25.2s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  5min 27.1s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  5min 27.6s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  5min 29.9s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  5min 32.8s
[                                        ] | 0% Completed |  1.0s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.1s
[                                        ] | 0% Completed |  1.1s[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[                                        ] | 0% Completed |  1.2s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
[########################################] | 100% Completed |  1.3s
tag 11: media best model {'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver':

  'precision', 'predicted', average, warn_for)


[#                                       ] | 3% Completed |  7.3s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 39.5s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 40.2s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 40.8s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 12% Completed | 42.7s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 44.0s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 13% Completed | 45.4s

  'precision', 'predicted', average, warn_for)


[#####                                   ] | 14% Completed | 47.4s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 16% Completed | 53.0s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 16% Completed | 54.0s

  'precision', 'predicted', average, warn_for)


[######                                  ] | 17% Completed | 55.3s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 18% Completed | 59.9s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 19% Completed |  1min  2.1s

  'precision', 'predicted', average, warn_for)


[#######                                 ] | 19% Completed |  1min  2.6s

  'precision', 'predicted', average, warn_for)


[##########                              ] | 27% Completed |  1min 24.7s

  'precision', 'predicted', average, warn_for)


[###################                     ] | 47% Completed |  2min 15.5s

  'precision', 'predicted', average, warn_for)


[########################                ] | 60% Completed |  2min 52.7s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  3min 44.2s

  'precision', 'predicted', average, warn_for)


[###################################     ] | 89% Completed |  4min 23.7s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 25.2s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  4min 26.7s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 30.2s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 30.7s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 91% Completed |  4min 31.4s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 92% Completed |  4min 33.4s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 92% Completed |  4min 34.7s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 93% Completed |  4min 36.3s

  'precision', 'predicted', average, warn_for)


[#####################################   ] | 94% Completed |  4min 39.8s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 42.3s

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 43.3s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 44.3s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 95% Completed |  4min 45.2s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 46.3s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 48.4s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 48.7s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 96% Completed |  4min 49.6s

  'precision', 'predicted', average, warn_for)


[######################################  ] | 97% Completed |  4min 50.2s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 97% Completed |  4min 54.2s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 54.8s

  'precision', 'predicted', average, warn_for)


[####################################### ] | 98% Completed |  4min 58.6s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  5min  2.4s
[                                        ] | 0% Completed |  1.2s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[                                        ] | 0% Completed |  1.4s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.2s
[########################################] | 100% Completed |  1.5s
tag 12: politics best model {'clf': LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 10, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__use_idf': True}
tag 12: politics count

  'precision', 'predicted', average, warn_for)


[#########                               ] | 23% Completed |  1min 46.5s

  'precision', 'predicted', average, warn_for)


[##################                      ] | 47% Completed |  3min 21.2s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 67% Completed |  4min 31.6s

  'precision', 'predicted', average, warn_for)


[###############################         ] | 79% Completed |  5min 21.5s

  'precision', 'predicted', average, warn_for)


[####################################    ] | 90% Completed |  6min 14.9s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  6min 43.9s
[                                        ] | 0% Completed |  1.1s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[                                        ] | 0% Completed |  4.5s[Pipeline] ............... (step 3 of 3) Processing clf, total=   3.4s
[########################################] | 100% Completed |  4.7s
tag 13: science best model {'clf': RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=4,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=200, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0

  'precision', 'predicted', average, warn_for)


[#                                       ] | 3% Completed |  9.8s

  'precision', 'predicted', average, warn_for)


[####                                    ] | 11% Completed | 55.4s

  'precision', 'predicted', average, warn_for)


[##############                          ] | 35% Completed |  2min 17.8s

  'precision', 'predicted', average, warn_for)


[###################                     ] | 47% Completed |  3min  5.3s

  'precision', 'predicted', average, warn_for)


[###########################             ] | 68% Completed |  4min  4.4s

  'precision', 'predicted', average, warn_for)


[################################        ] | 80% Completed |  4min 46.3s

  'precision', 'predicted', average, warn_for)


[########################################] | 100% Completed |  5min 55.2s
[                                        ] | 0% Completed |  1.2s[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   1.2s
[                                        ] | 0% Completed |  1.3s[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[                                        ] | 0% Completed |  1.4s[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
[########################################] | 100% Completed |  1.5s
tag 14: technology best model {'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__sol

In [71]:
%%time
the_br_model = joblib.load('best_br_model.joblib')
y_pred_new = the_br_model.predict(X_test)
for index in range(len(tags)):
    print(f"Processing {tags[index]}")
    prediction = y_pred_new[:, index].A
    
    if (sum(y_train[:, index]) / len(y_train)) < 0.5:
        baseline_prediction = np.zeros(len(y_test)).astype(int)
    else:
        baseline_prediction = np.ones(len(y_test)).astype(int)
        
    print(f'tag {index}: {tags[index]} counts - predicted: {sum(prediction)}, actual: {sum(y_test[:, index])}')
    print(f'tag {index}: {tags[index]} baseline f1-score is {f1_score(y_test[:, index], baseline_prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} test f1-score is {f1_score(y_test[:, index], prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} baseline accuracy is {accuracy_score(y_test[:, index], baseline_prediction)}')
    print(f'tag {index}: {tags[index]} test accuracy is {accuracy_score(y_test[:, index], prediction)}')
    print('--------------------------')

Processing biodiversity
tag 0: biodiversity counts - predicted: [66], actual: 44
tag 0: biodiversity baseline f1-score is 0.0
tag 0: biodiversity test f1-score is 0.5454545454545454
tag 0: biodiversity baseline accuracy is 0.9089026915113871
tag 0: biodiversity test accuracy is 0.8964803312629399
--------------------------
Processing biomechanics
tag 1: biomechanics counts - predicted: [59], actual: 44
tag 1: biomechanics baseline f1-score is 0.0
tag 1: biomechanics test f1-score is 0.5825242718446602
tag 1: biomechanics baseline accuracy is 0.9089026915113871
tag 1: biomechanics test accuracy is 0.9109730848861284
--------------------------
Processing business
tag 2: business counts - predicted: [83], actual: 70
tag 2: business baseline f1-score is 0.0
tag 2: business test f1-score is 0.5490196078431373
tag 2: business baseline accuracy is 0.855072463768116
tag 2: business test accuracy is 0.8571428571428571
--------------------------
Processing communication
tag 3: communication coun

  'precision', 'predicted', average, warn_for)


In [31]:
import glob

In [40]:
# List the saved model artifacts in the working directory, leaving out the
# combined binary-relevance model so that only the other artifacts are counted.
# Filtering (instead of list.remove) keeps the cell from raising ValueError when
# 'best_br_model.joblib' has not been written yet; sorting makes the listing
# order deterministic, because glob returns matches in arbitrary order.
per_tag_models = sorted(
    path for path in glob.glob("*.joblib") if path != 'best_br_model.joblib'
)
len(per_tag_models)

15

In [53]:
# Size of the larger class (positives vs. negatives) for tag index 3 in the test split.
# NOTE(review): the output recorded under this cell is a fraction (0.9109...), not a
# count — it looks like the cell last ran with a division by len(y_test); confirm
# which form was intended.
# assumes y_test is a 2-D NumPy label array — TODO confirm
max(
    y_test[:, 3].sum(),
    y_test.shape[0] - y_test[:, 3].sum(),
)

0.9109730848861284

In [65]:
# Fraction of training rows that carry the label at tag index 3.
# assumes y_train is a 2-D NumPy label array — TODO confirm
y_train[:, 3].sum() / y_train.shape[0]

0.07759562841530054

In [69]:
%%time
for index in range(len(tags)):
    themodel = joblib.load(f"best_{tags[index]}_model.joblib")
    print(themodel.best_params_)
    print(f"Processing {tags[index]}")
    prediction = themodel.predict(X_test)
    
    if (sum(y_train[:, index]) / len(y_train)) < 0.5:
        baseline_prediction = np.zeros(len(y_test)).astype(int)
    else:
        baseline_prediction = np.ones(len(y_test)).astype(int)
        
    print(f'tag {index}: {tags[index]} counts - predicted: {sum(prediction)}, actual: {sum(y_test[:, index])}')
    print(f'tag {index}: {tags[index]} baseline f1-score is {f1_score(y_test[:, index], baseline_prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} test f1-score is {f1_score(y_test[:, index], prediction, average="binary")}')
    print(f'tag {index}: {tags[index]} baseline accuracy is {accuracy_score(y_test[:, index], baseline_prediction)}')
    print(f'tag {index}: {tags[index]} test accuracy is {accuracy_score(y_test[:, index], prediction)}')
    print('--------------------------')

{'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__use_idf': False}
Processing biodiversity
tag 0: biodiversity counts - predicted: 77, actual: 44
tag 0: biodiversity baseline f1-score is 0.0
tag 0: biodiversity test f1-score is 0.5950413223140496
tag 0: biodiversity baseline accuracy is 0.9089026915113871
tag 0: biodiversity test accuracy is 0.8985507246376812
--------------------------


  'precision', 'predicted', average, warn_for)


{'clf': LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__use_idf': False}
Processing biomechanics
tag 1: biomechanics counts - predicted: 69, actual: 44
tag 1: biomechanics baseline f1-score is 0.0
tag 1: biomechanics test f1-score is 0.6017699115044248
tag 1: biomechanics baseline accuracy is 0.9089026915113871
tag 1: biomechanics test accuracy is 0.906832298136646
--------------------------
{'clf': LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, pe