In [1]:
import os

import mlflow
from mlflow import log_metric, log_param, MlflowClient
from mlflow.entities import ViewType
from IPython.display import display, Markdown
import pandas as pd

## Get list of experiments

In [2]:
exp_ids = os.listdir('./mlruns')

In [3]:
len(exp_ids)

2009

# Model Wide Metrics

## Loop over experiments, and get their model wide metrics

Extract the following things
 
 - Name of the experiment
 - Model type it belongs to
 - metrics
 - run name

In [4]:
client =  MlflowClient()
analysis_level = 'model_wide'
query = f"params.analysis_level = '{analysis_level}'"

Retrieve all metrics and params of 'model_wide' runs and save it in a csv file

In [5]:
rows = []
for exp_i, exp_id in enumerate(exp_ids):
    # Retrieve 'model_wide' runs of the experiment
    try:
        runs = client.search_runs(experiment_ids=exp_id, filter_string=query)
    except:
        print(f'problems with exp_id: {exp_id} in position {exp_i}')
        continue
        
    # Add relevant parameters to the row
    for run in runs:
        row_dict = {'exp_id': exp_id, 'run_uuid': run.info.run_uuid, 'runName': run.data.tags['mlflow.runName'],
                    **run.data.metrics, **run.data.params}
        rows.append(row_dict)
        
    if exp_i % 100 == 0 and exp_i!=0:
        pd.DataFrame(rows).set_index(['run_uuid', 'model_name', 'runName']).sort_index().to_csv('all_model_wide_results.csv')
    
pd.DataFrame(rows).set_index(['run_uuid', 'model_name', 'runName']).sort_index().to_csv('all_model_wide_results.csv')

problems with exp_id: 168812780760165522 in position 1942


## Group types of models (experiment type and model type) and pick best performing in terms of f1-score per unit of analysis
 
 * Create dictionary for experiment types and model types
 * Create to dataframes (for each of the previous groups) and pick the ones with max test_f1_score

In [6]:
mw_df = pd.read_csv('all_model_wide_results.csv', index_col=['run_uuid', 'model_name', 'runName'])

In [7]:
m = {
    'Naive Bayes': ['NaiveBayes',
                     'NaiveBayes_BorderlineSMOTE_v1',
                     'NaiveBayes_BorderlineSMOTE_v2',
                     'NaiveBayes_BorderlineSMOTE_v3',
                     'NaiveBayes_ROS_v1',
                     'NaiveBayes_ROS_v2',
                     'NaiveBayes_SMOTE_v1',
                     'NaiveBayes_SMOTE_v2',
                     'NaiveBayes_SMOTE_v3',],
    
    'Complement Naive Bayes': ['ComplementNaiveBayes',
                              'ComplementNaiveBayes_BorderlineSMOTE_v1',
                              'ComplementNaiveBayes_BorderlineSMOTE_v2',
                              'ComplementNaiveBayes_BorderlineSMOTE_v3',
                              'ComplementNaiveBayes_ROS_v1',
                              'ComplementNaiveBayes_SMOTE_v1',
                              'ComplementNaiveBayes_SMOTE_v2'],
     
     'LogisticRegression': ['LogisticRegression',
                           'LogisticRegression_BorderlineSMOTE',
                             'LogisticRegression_BorderlineSMOTE_v2',
                             'LogisticRegression_ROS_v1',
                             'LogisticRegression_ROS_v2',
                             'LogisticRegression_ROS_v3',
                             'LogisticRegression_SMOTE',
                             'LogisticRegression_SMOTE_v2',
                             'LogisticRegression_SVMSMOTE',
                             'LogisticRegression_SVMSMOTE_v2'],
     
     'Logistic Regression Lasso': ['LogisticRegressionLassoV1',
                                     'LogisticRegressionLassoV2',
                                     'LogisticRegressionLasso_BorderlineSMOTE_v1',
                                     'LogisticRegressionLasso_BorderlineSMOTE_v2',
                                     'LogisticRegressionLasso_BorderlineSMOTE_v3',
                                     'LogisticRegressionLasso_SMOTE_v1',
                                     'LogisticRegressionLasso_SMOTE_v2',
                                     'LogisticRegressionLasso_SMOTE_v3',
                                     'LogisticRegressionLasso_SVMSMOTE_v1',],
     
     'Logistic Regression ElasticNet': ['LogisticRegressionElasticNetV1',
                                        'LogisticRegressionElasticNetV2',
                                        'LogisticRegressionElasticNetV3',
                                         'LogisticRegressionElasticNet_BorderlineSMOTE_v1',
                                         'LogisticRegressionElasticNet_BorderlineSMOTE_v2',
                                         'LogisticRegressionElasticNet_BorderlineSMOTE_v3',
                                         'LogisticRegressionElasticNet_ROS_v1',
                                         'LogisticRegressionElasticNet_ROS_v2',
                                         'LogisticRegressionElasticNet_ROS_v3',
                                         'LogisticRegressionElasticNet_SMOTE_v1',
                                         'LogisticRegressionElasticNet_SMOTE_v2',
                                         'LogisticRegressionElasticNet_SMOTE_v3',
                                         'LogisticRegressionElasticNet_SVMSMOTE_v1',
                                         'LogisticRegressionElasticNet_SVMSMOTE_v2',
                                         'LogisticRegressionElasticNet_SVMSMOTE_v3'],
     
     'Logistic Regression Ridge': ['LogisticRegressionRidgeDual',
                                     'LogisticRegressionRidgeDual_BorderlineSMOTE_v1',
                                     'LogisticRegressionRidgeDual_BorderlineSMOTE_v2',
                                     'LogisticRegressionRidgeDual_BorderlineSMOTE_v3',
                                     'LogisticRegressionRidgeDual_ROS_v1',
                                     'LogisticRegressionRidgeDual_SMOTE_v1',
                                     'LogisticRegressionRidgeDual_SMOTE_v2',
                                     'LogisticRegressionRidgeDual_SMOTE_v3',
                                     'LogisticRegressionRidgeDual_SVMSMOTE_v1',
                                     'LogisticRegressionRidgeDual_SVMSMOTE_v2',
                                     'LogisticRegressionRidgeDual_SVMSMOTE_v3',
                                     'LogisticRegressionRidgeV1',
                                     'LogisticRegressionRidgeV2',
                                     'LogisticRegressionRidge_BorderlineSMOTE_v1',
                                     'LogisticRegressionRidge_BorderlineSMOTE_v2',
                                     'LogisticRegressionRidge_BorderlineSMOTE_v3',
                                     'LogisticRegressionRidge_ROS_v1',
                                     'LogisticRegressionRidge_ROS_v2',
                                     'LogisticRegressionRidge_ROS_v3',
                                     'LogisticRegressionRidge_SMOTE_v1',
                                     'LogisticRegressionRidge_SMOTE_v2',
                                     'LogisticRegressionRidge_SMOTE_v3',
                                     'LogisticRegressionRidge_SVMSMOTE_v1',
                                     'LogisticRegressionRidge_SVMSMOTE_v2',
                                     'LogisticRegressionRidge_SVMSMOTE_v3'],
    
    'RidgeClassifier' :['RidgeClassifierV1',
                         'RidgeClassifierV2',
                         'RidgeClassifier_BorderlineSMOTE_v1',
                         'RidgeClassifier_BorderlineSMOTE_v2',
                         'RidgeClassifier_BorderlineSMOTE_v3',
                         'RidgeClassifier_ROS_v1',
                         'RidgeClassifier_ROS_v2',
                         'RidgeClassifier_ROS_v3',
                         'RidgeClassifier_SMOTE_v1',
                         'RidgeClassifier_SMOTE_v2',
                         'RidgeClassifier_SMOTE_v3',
                         'RidgeClassifier_SVMSMOTE_v1',
                         'RidgeClassifier_SVMSMOTE_v2',
                         'RidgeClassifier_SVMSMOTE_v3',],
    
    
    'SVM': ['SVM_rbf_BorderlineSMOTE_v1',
             'SVM_rbf_BorderlineSMOTE_v2',
             'SVM_rbf_BorderlineSMOTE_v3',
             'SVM_rbf_ROS_v1',
             'SVM_rbf_ROS_v2',
             'SVM_rbf_ROS_v3',
             'SVM_rbf_SMOTE_v1',
             'SVM_rbf_SMOTE_v2',
             'SVM_rbf_SMOTE_v3',
             'SVM_rbf_V1',
             'SVM_rbf_V2',
             'SVM_rbf_V3',
             'SVM_rbf_V4',
             'SVM_sigmoid_ROS_v1',
             'SVM_sigmoid_ROS_v2',
             'SVM_sigmoid_V3',
             'SVM_sigmoid_V4',
             'SVM_sigmoid_broader_gamma',
             'SVM_sigmoid_narrow_gamma',],
    
    'Random Forest': ['RandomForestSK_V1',
                     'RandomForestSK_V2',
                     'RandomForestSK_V3',
                     'RandomForest_BorderlineSMOTE_v0',
                     'RandomForest_BorderlineSMOTE_v1',
                     'RandomForest_BorderlineSMOTE_v2',
                     'RandomForest_BorderlineSMOTE_v3',
                     'RandomForest_ROS_v0',
                     'RandomForest_ROS_v1',
                     'RandomForest_ROS_v2',
                     'RandomForest_ROS_v3',
                     'RandomForest_SMOTE_v0',
                     'RandomForest_SMOTE_v1',
                     'RandomForest_SMOTE_v2',
                     'RandomForest_SMOTE_v3',],
    
    'XGBoost': [ 'XGBoost_broad_BorderlineSMOTE',
                 'XGBoost_broad_ROS',
                 'XGBoost_broad_SMOTE',
                 'XGBoost_broad_SVMSMOTE',
                 'XGBoost_narrow_BorderlineSMOTE',
                 'XGBoost_narrow_ROS',
                 'XGBoost_narrow_SMOTE',
                 'XGBoost_narrow_SVMSMOTE']
     
    }

recode_dict = {v: k for k,vv in m.items() for v in vv}

In [8]:
mw_df['model_type'] = mw_df.index.get_level_values(1).map(recode_dict)

In [9]:
best_models_df = mw_df.reset_index().loc[mw_df.reset_index().groupby(['model_type', 'unit_of_analysis'])['test_f1_micro_mean'].idxmax()]\
    .set_index(['model_type', 'unit_of_analysis'])

Unnamed: 0_level_0,Unnamed: 1_level_0,run_uuid,model_name,runName,exp_id,train_recall_macro_std,train_recall_micro_mean,test_accuracy_mean,train_precision_micro_std,test_f1_macro_mean,test_recall_macro_std,...,tfidf,language,corr_threshold,analysis_level,spacy_model_used,max_df,vectorizer_max_features,is_count_vectorizer,n_gram_range_start,n_gram_range_end
model_type,unit_of_analysis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Complement Naive Bayes,raw_text,975db7d9d8944a51b21f06b77b4f3b60,ComplementNaiveBayes_SMOTE_v2,unruly-panda-298,403974574172768364,0.027166,0.828485,0.021009,0.020932,0.39052,0.025887,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Complement Naive Bayes,title,35312937e998481ebf8d6a8c7e1a5d88,ComplementNaiveBayes_ROS_v1,fun-pig-957,782066207554034527,0.027493,0.737889,0.0,0.02348,0.289899,0.039814,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Complement Naive Bayes,title_and_10_sentences,54184d6e3c304c56856fb85ed82d95fc,ComplementNaiveBayes_BorderlineSMOTE_v1,polite-sow-560,259607648085163492,0.056189,0.775995,0.010585,0.057042,0.341736,0.111935,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Complement Naive Bayes,title_and_5_sentences,21b61557bb104a94be5ca70a021d95af,ComplementNaiveBayes_SMOTE_v2,monumental-bird-839,337802502687668798,0.091816,0.663523,0.0,0.043877,0.324353,0.079888,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Complement Naive Bayes,title_and_first_paragraph,7467af49fff94e71b7cdfcb5a7b8d2c4,ComplementNaiveBayes_SMOTE_v2,grandiose-panda-725,144028333125583488,0.04215,0.733376,0.0,0.009499,0.309241,0.062317,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Complement Naive Bayes,title_and_first_sentence_each_paragraph,3e59f64df8bc4c3199605be1c144dd46,ComplementNaiveBayes_ROS_v1,resilient-fly-950,953344357930397000,0.067579,0.88303,0.005128,0.065913,0.356836,0.088434,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Logistic Regression ElasticNet,raw_text,f580a57de7c14c30a237ad46ae271dd4,LogisticRegressionElasticNet_SVMSMOTE_v2,puzzled-stag-256,739444040876616155,0.021316,0.779288,0.042178,0.010008,0.379513,0.057405,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Logistic Regression ElasticNet,title,28a85e5540f443d79e40bfca4b20e4f8,LogisticRegressionElasticNet_ROS_v1,hilarious-bass-662,670379349728290292,0.0,1.0,0.010256,0.001469,0.304981,0.058314,...,True,ru,0.9,model_wide,ru_core_news_sm,0.771987,285,False,1,1
Logistic Regression ElasticNet,title_and_10_sentences,ee36956330ae4165818b3c821a036e86,LogisticRegressionElasticNet_SMOTE_v2,nimble-bear-769,561868998222439389,0.024976,0.702553,0.015793,0.022926,0.345985,0.069844,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3
Logistic Regression ElasticNet,title_and_5_sentences,1dacdf3ed85a4ea9a5436b3894a70a85,LogisticRegressionElasticNet_SMOTE_v2,spiffy-gnu-327,413353278508159409,0.006283,0.630134,0.010585,0.026523,0.318428,0.019542,...,True,ru,0.9,model_wide,ru_core_news_sm,0.4,10000,False,1,3


### Generate the tables to report

In [10]:
def display_performance_table(df, metric, display_=True):
    report_table = df.reset_index().copy()
    report_table['result'] = report_table[f'test_{metric}_mean'].map(lambda x: f'{x:.2f}') + \
    ' $\pm$ ' + report_table[f'test_{metric}_std'].map(lambda x: f'{x:.2f}')
    report_table['col_title'] = report_table.unit_of_analysis.str.split('_').str.join(' ') 
    report_table['col_title'] = pd.Categorical(
        report_table.col_title,
        categories=['title', 'title and first paragraph', 'title and 5 sentences', 'title and 10 sentences',
                    'title and first sentence each paragraph', 'raw text'],
        ordered=True)
    report_table = report_table[['model_type', 'col_title', 'result']]\
        .pivot_table(index='model_type', columns=['col_title'], values=['result'], aggfunc='first', fill_value=0)\
        .droplevel(0, axis=1)

    report_table.columns.names = [None]

    # Highlight best scoring models according to their average
    mean_perf_arr = report_table.applymap(lambda x: float(str(x).split(' ')[0])).to_numpy()
    highlight_mask = mean_perf_arr == mean_perf_arr.max()
    report_table_arr = report_table.to_numpy()  # Note it passes the array by reference
    report_table_arr[highlight_mask] = '**' + report_table_arr[highlight_mask] + '**'

    if display_:
        display(Markdown(report_table.to_markdown()))
    else:
        return report_table.to_markdown()

In [11]:
display_performance_table(df=best_models_df, metric='f1_micro', display_=True)

| model_type                     | title           | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text            |
|:-------------------------------|:----------------|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:--------------------|
| Complement Naive Bayes         | 0.35 $\pm$ 0.01 | 0.37 $\pm$ 0.00             | 0.39 $\pm$ 0.03         | 0.41 $\pm$ 0.02          | 0.42 $\pm$ 0.01                           | 0.45 $\pm$ 0.01     |
| Logistic Regression ElasticNet | 0.35 $\pm$ 0.02 | 0.39 $\pm$ 0.03             | 0.38 $\pm$ 0.02         | 0.41 $\pm$ 0.02          | 0.43 $\pm$ 0.02                           | 0.43 $\pm$ 0.01     |
| Logistic Regression Lasso      | 0.34 $\pm$ 0.02 | 0.39 $\pm$ 0.02             | 0.36 $\pm$ 0.03         | 0.39 $\pm$ 0.00          | 0.40 $\pm$ 0.01                           | 0.42 $\pm$ 0.02     |
| Logistic Regression Ridge      | 0.37 $\pm$ 0.01 | 0.42 $\pm$ 0.01             | 0.38 $\pm$ 0.03         | 0.41 $\pm$ 0.01          | 0.44 $\pm$ 0.01                           | **0.46 $\pm$ 0.03** |
| LogisticRegression             | 0.34 $\pm$ 0.03 | 0.38 $\pm$ 0.01             | 0.40 $\pm$ 0.02         | 0.40 $\pm$ 0.03          | 0.40 $\pm$ 0.02                           | 0.43 $\pm$ 0.02     |
| Naive Bayes                    | 0.33 $\pm$ 0.01 | 0.36 $\pm$ 0.02             | 0.39 $\pm$ 0.03         | 0.41 $\pm$ 0.02          | 0.43 $\pm$ 0.03                           | **0.46 $\pm$ 0.03** |
| Random Forest                  | 0.28 $\pm$ 0.03 | 0.31 $\pm$ 0.02             | 0.35 $\pm$ 0.03         | 0.38 $\pm$ 0.02          | 0.36 $\pm$ 0.02                           | 0.43 $\pm$ 0.03     |
| RidgeClassifier                | 0.36 $\pm$ 0.03 | 0.40 $\pm$ 0.02             | 0.39 $\pm$ 0.01         | 0.41 $\pm$ 0.01          | 0.43 $\pm$ 0.03                           | 0.45 $\pm$ 0.02     |
| SVM                            | 0.35 $\pm$ 0.02 | 0                           | 0                       | 0                        | 0                                         | 0                   |
| XGBoost                        | 0.36 $\pm$ 0.01 | 0.37 $\pm$ 0.01             | 0.37 $\pm$ 0.01         | 0.38 $\pm$ 0.00          | 0.39 $\pm$ 0.02                           | 0.44 $\pm$ 0.02     |

In [12]:
display_performance_table(df=best_models_df, metric='precision_micro', display_=True)

| model_type                     | title           | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text            |
|:-------------------------------|:----------------|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:--------------------|
| Complement Naive Bayes         | 0.28 $\pm$ 0.01 | 0.30 $\pm$ 0.01             | 0.32 $\pm$ 0.04         | 0.32 $\pm$ 0.03          | 0.32 $\pm$ 0.03                           | 0.38 $\pm$ 0.02     |
| Logistic Regression ElasticNet | 0.31 $\pm$ 0.02 | 0.43 $\pm$ 0.03             | 0.34 $\pm$ 0.02         | 0.38 $\pm$ 0.01          | 0.42 $\pm$ 0.05                           | 0.42 $\pm$ 0.02     |
| Logistic Regression Lasso      | 0.27 $\pm$ 0.02 | 0.35 $\pm$ 0.01             | 0.32 $\pm$ 0.04         | 0.36 $\pm$ 0.03          | 0.37 $\pm$ 0.02                           | 0.39 $\pm$ 0.02     |
| Logistic Regression Ridge      | 0.30 $\pm$ 0.00 | 0.34 $\pm$ 0.02             | 0.35 $\pm$ 0.03         | 0.34 $\pm$ 0.04          | 0.42 $\pm$ 0.02                           | 0.36 $\pm$ 0.03     |
| LogisticRegression             | 0.27 $\pm$ 0.02 | 0.33 $\pm$ 0.02             | 0.40 $\pm$ 0.01         | 0.40 $\pm$ 0.04          | 0.37 $\pm$ 0.02                           | **0.51 $\pm$ 0.03** |
| Naive Bayes                    | 0.29 $\pm$ 0.01 | 0.32 $\pm$ 0.02             | 0.36 $\pm$ 0.03         | 0.38 $\pm$ 0.03          | 0.36 $\pm$ 0.02                           | 0.38 $\pm$ 0.04     |
| Random Forest                  | 0.28 $\pm$ 0.04 | 0.32 $\pm$ 0.06             | 0.35 $\pm$ 0.01         | 0.32 $\pm$ 0.02          | 0.35 $\pm$ 0.02                           | 0.39 $\pm$ 0.03     |
| RidgeClassifier                | 0.31 $\pm$ 0.02 | 0.36 $\pm$ 0.01             | 0.34 $\pm$ 0.01         | 0.37 $\pm$ 0.02          | 0.45 $\pm$ 0.04                           | 0.48 $\pm$ 0.04     |
| SVM                            | 0.28 $\pm$ 0.00 | 0                           | 0                       | 0                        | 0                                         | 0                   |
| XGBoost                        | 0.25 $\pm$ 0.01 | 0.26 $\pm$ 0.01             | 0.26 $\pm$ 0.01         | 0.30 $\pm$ 0.02          | 0.31 $\pm$ 0.02                           | 0.36 $\pm$ 0.04     |

In [13]:
display_performance_table(df=best_models_df, metric='recall_micro', display_=True)

| model_type                     | title           | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text        |
|:-------------------------------|:----------------|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:----------------|
| Complement Naive Bayes         | 0.54 $\pm$ 0.01 | 0.63 $\pm$ 0.07             | 0.53 $\pm$ 0.06         | 0.63 $\pm$ 0.09          | 0.65 $\pm$ 0.07                           | 0.61 $\pm$ 0.02 |
| Logistic Regression ElasticNet | 0.40 $\pm$ 0.06 | 0.39 $\pm$ 0.03             | 0.48 $\pm$ 0.02         | 0.49 $\pm$ 0.06          | 0.47 $\pm$ 0.02                           | 0.47 $\pm$ 0.04 |
| Logistic Regression Lasso      | 0.55 $\pm$ 0.04 | 0.52 $\pm$ 0.04             | 0.47 $\pm$ 0.04         | 0.46 $\pm$ 0.04          | 0.48 $\pm$ 0.02                           | 0.48 $\pm$ 0.02 |
| Logistic Regression Ridge      | 0.53 $\pm$ 0.04 | 0.61 $\pm$ 0.08             | 0.47 $\pm$ 0.05         | 0.57 $\pm$ 0.07          | 0.52 $\pm$ 0.05                           | 0.68 $\pm$ 0.04 |
| LogisticRegression             | 0.53 $\pm$ 0.07 | 0.52 $\pm$ 0.07             | 0.41 $\pm$ 0.04         | 0.43 $\pm$ 0.02          | 0.44 $\pm$ 0.02                           | 0.39 $\pm$ 0.02 |
| Naive Bayes                    | 0.44 $\pm$ 0.04 | 0.50 $\pm$ 0.06             | 0.47 $\pm$ 0.06         | 0.48 $\pm$ 0.04          | 0.56 $\pm$ 0.05                           | 0.63 $\pm$ 0.03 |
| Random Forest                  | 0.36 $\pm$ 0.08 | 0.46 $\pm$ 0.09             | 0.41 $\pm$ 0.02         | 0.51 $\pm$ 0.04          | 0.40 $\pm$ 0.04                           | 0.51 $\pm$ 0.03 |
| RidgeClassifier                | 0.46 $\pm$ 0.07 | 0.54 $\pm$ 0.05             | 0.51 $\pm$ 0.05         | 0.49 $\pm$ 0.02          | 0.46 $\pm$ 0.04                           | 0.45 $\pm$ 0.04 |
| SVM                            | 0.56 $\pm$ 0.11 | 0                           | 0                       | 0                        | 0                                         | 0               |
| XGBoost                        | 0.68 $\pm$ 0.05 | **0.71 $\pm$ 0.03**         | 0.66 $\pm$ 0.01         | 0.59 $\pm$ 0.07          | 0.55 $\pm$ 0.06                           | 0.63 $\pm$ 0.09 |

In [14]:
display_performance_table(df=best_models_df, metric='accuracy', display_=True)

| model_type                     | title           | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text            |
|:-------------------------------|:----------------|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:--------------------|
| Complement Naive Bayes         | 0.00 $\pm$ 0.00 | 0.00 $\pm$ 0.00             | 0.00 $\pm$ 0.00         | 0.01 $\pm$ 0.01          | 0.01 $\pm$ 0.01                           | 0.02 $\pm$ 0.01     |
| Logistic Regression ElasticNet | 0.01 $\pm$ 0.01 | 0.05 $\pm$ 0.05             | 0.01 $\pm$ 0.01         | 0.02 $\pm$ 0.01          | 0.03 $\pm$ 0.01                           | 0.04 $\pm$ 0.02     |
| Logistic Regression Lasso      | 0.01 $\pm$ 0.01 | 0.00 $\pm$ 0.00             | 0.03 $\pm$ 0.01         | 0.03 $\pm$ 0.02          | 0.01 $\pm$ 0.01                           | 0.03 $\pm$ 0.01     |
| Logistic Regression Ridge      | 0.00 $\pm$ 0.00 | 0.03 $\pm$ 0.02             | 0.02 $\pm$ 0.01         | 0.01 $\pm$ 0.01          | 0.03 $\pm$ 0.03                           | 0.02 $\pm$ 0.01     |
| LogisticRegression             | 0.01 $\pm$ 0.01 | 0.01 $\pm$ 0.01             | 0.04 $\pm$ 0.02         | 0.03 $\pm$ 0.01          | 0.03 $\pm$ 0.01                           | **0.11 $\pm$ 0.02** |
| Naive Bayes                    | 0.01 $\pm$ 0.01 | 0.00 $\pm$ 0.00             | 0.01 $\pm$ 0.01         | 0.03 $\pm$ 0.02          | 0.01 $\pm$ 0.01                           | 0.03 $\pm$ 0.02     |
| Random Forest                  | 0.01 $\pm$ 0.01 | 0.01 $\pm$ 0.01             | 0.02 $\pm$ 0.02         | 0.01 $\pm$ 0.01          | 0.02 $\pm$ 0.01                           | 0.02 $\pm$ 0.01     |
| RidgeClassifier                | 0.01 $\pm$ 0.01 | 0.01 $\pm$ 0.01             | 0.01 $\pm$ 0.02         | 0.02 $\pm$ 0.02          | 0.03 $\pm$ 0.03                           | 0.05 $\pm$ 0.01     |
| SVM                            | 0.01 $\pm$ 0.01 | 0                           | 0                       | 0                        | 0                                         | 0                   |
| XGBoost                        | 0.01 $\pm$ 0.01 | 0.01 $\pm$ 0.01             | 0.01 $\pm$ 0.01         | 0.03 $\pm$ 0.02          | 0.02 $\pm$ 0.02                           | 0.03 $\pm$ 0.02     |

## Retrieve the best experiments runs and write them in a single experiment

In [None]:
os.make_dir('post_processed_runs', exists_ok=True)
os.chdir('post_processed_runs')
client =  MlflowClient()

In [27]:
for unit_of_analysis, df in best_models_df.reset_index().groupby(['unit_of_analysis']):
    print(f"best_performing_models_{unit_of_analysis}")
    new_exp_id = client.create_experiment(f"best_performing_models_{unit_of_analysis}")
    
    try:
        # Add run information to the experiment
        for idx, row in df.iterrows():
            with mlflow.start_run(experiment_id=new_exp_id):
                for col in df.columns:
                    if 'train'in col or 'test' in col: 
                        log_metric(col, row[col])

                    else:
                        log_param(col, row[col])

            mlflow.end_run()
    except Exception as e:
        print(e)
        pass

best_performing_models_raw_text
best_performing_models_title
best_performing_models_title_and_10_sentences
best_performing_models_title_and_5_sentences
best_performing_models_title_and_first_paragraph
best_performing_models_title_and_first_sentence_each_paragraph


# Outer CV Metrics

Retrieve all outer CV Metrics, (subset the top performing ones), and write them to new experiments where they are well grouped per unit of analysis and model type

In [16]:
analysis_level = 'outer_cv'
query = f"params.analysis_level = '{analysis_level}'"

rows = []
for exp_i, exp_id in enumerate(exp_ids):
    # Retrieve 'model_wide' runs of the experiment
    try:
        runs = client.search_runs(experiment_ids=exp_id, filter_string=query)
    except:
        print(f'problems with exp_id: {exp_id} in position {exp_i}')
        continue
        
    # Add relevant parameters to the row
    for run in runs:
        row_dict = {'exp_id': exp_id, 'run_uuid': run.info.run_uuid, 'runName': run.data.tags['mlflow.runName'],
                    **run.data.metrics, **run.data.params}
        rows.append(row_dict)
        
    if exp_i % 100 == 0 and exp_i!=0:
        pd.DataFrame(rows).set_index(['run_uuid', 'model_name', 'runName']).sort_index().to_csv('all_outer_cv_results.csv')
    
pd.DataFrame(rows).set_index(['run_uuid', 'model_name', 'runName']).sort_index().to_csv('all_outer_cv_results.csv')

problems with exp_id: 168812780760165522 in position 1942


In [28]:
ocv_df = pd.read_csv('all_outer_cv_results.csv', index_col=['exp_id','run_uuid', 'model_name', 'runName'])

In [29]:
ocv_df['model_type'] = ocv_df.index.get_level_values(2).map(recode_dict)

In [32]:
ocv_df.model_type.value_counts()

Logistic Regression Ridge         3982
Logistic Regression ElasticNet    2333
RidgeClassifier                   2093
LogisticRegression                1857
Logistic Regression Lasso         1608
Random Forest                      540
Naive Bayes                        324
Complement Naive Bayes             252
XGBoost                            166
SVM                                111
Name: model_type, dtype: int64

## Post process results of experiments

In [None]:
os.make_dir('post_processed_runs', exists_ok=True)
os.chdir('post_processed_runs')
client =  MlflowClient()

### Create one experiment per model type

In [None]:
for model_type, df in ocv_df.reset_index().groupby(['model_type']):
    new_exp_name = f"all_runs_of_{model_type}"
    print(new_exp_name)
    new_exp_id = client.create_experiment(new_exp_name)
    
    try:
        # Add run information to the experiment
        for idx, row in df.iterrows():
            with mlflow.start_run(experiment_id=new_exp_id):
                for col in df.columns:
                    if 'train'in col or 'test' in col: 
                        log_metric(col, row[col])

                    else:
                        log_param(col, row[col])

            mlflow.end_run()
    except Exception as e:
        print(e)
        pass

all_runs_of_Complement Naive Bayes


### Create one experiment per unit of analysis

In [None]:
for unit_of_analysis, df in ocv_df.reset_index().groupby(['unit_of_analysis']):
    new_exp_name = f"all_runs_of_{unit_of_analysis}"
    print(new_exp_name)
    new_exp_id = client.create_experiment(new_exp_name)
    
    try:
        # Add run information to the experiment
        for idx, row in df.iterrows():
            with mlflow.start_run(experiment_id=new_exp_id):
                for col in df.columns:
                    if 'train'in col or 'test' in col: 
                        log_metric(col, row[col])

                    else:
                        log_param(col, row[col])

            mlflow.end_run()
    except Exception as e:
        print(e)
        pass

### Create one experiment per model type per unit of analysis

In [None]:
for (model_type, unit_of_analysis), df in ocv_df.reset_index().groupby(['model_type','unit_of_analysis']):
    new_exp_name = f"all_runs_of_{model_type}_{unit_of_analysis}"
    print(new_exp_name)
    new_exp_id = client.create_experiment(new_exp_name)
    
    try:
        # Add run information to the experiment
        for idx, row in df.iterrows():
            with mlflow.start_run(experiment_id=new_exp_id):
                for col in df.columns:
                    if 'train'in col or 'test' in col: 
                        log_metric(col, row[col])

                    else:
                        log_param(col, row[col])

            mlflow.end_run()
    except Exception as e:
        print(e)
        pass

# Extra stuff

In [29]:
test_id = run_ids[1]
test_query = "params.analysis_level = '{analysis_level}'"

In [30]:
runs = client.search_runs(
    experiment_ids=[test_id],
    filter_string=test_query.format(analysis_level=analysis_level))

In [None]:
len(runs)

In [44]:
runs[0].data.metrics

{'train_recall_macro': 0.717208310100105,
 'train_accuracy': 0.17054263565891473,
 'train_f1_macro': 0.5289355975824069,
 'test_f1_macro': 0.2727723308052808,
 'test_precision_macro': 0.24911713731277244,
 'train_precision_micro': 0.49831346568861945,
 'train_precision_macro': 0.44566856489109563,
 'test_precision_micro': 0.3358132789723725,
 'test_accuracy': 0.03225806451612903,
 'test_recall_macro': 0.34139809415202177,
 'train_recall_micro': 0.7371794871794872,
 'train_f1_micro': 0.5736494831238187,
 'test_recall_micro': 0.4267515923566879,
 'test_f1_micro': 0.3623497081652117}

In [45]:
runs[0].data.params

{'min_df': '0.0001',
 'multilabel_type': 'chain',
 'base_estimator_name': "Pipeline(steps=[('up', BorderlineSMOTE()),\n                ('preproc', StandardScaler(with_mean=False)),\n                ('model', RidgeClassifier(max_iter=100000))])",
 'model_name': 'RidgeClassifier_BorderlineSMOTE_v2',
 'k_neighbors': '2',
 'kind': 'borderline-2',
 'min_var': '0.001',
 'sampling_strategy': 'not majority',
 'n_features': '37',
 'tfidf': 'True',
 'class_weight': 'balanced',
 'language': 'ru',
 'corr_threshold': '0.9',
 'analysis_level': 'outer_cv',
 'spacy_model_used': 'ru_core_news_sm',
 'max_df': '0.4',
 'with_std': 'True',
 'vectorizer_max_features': '10000',
 'is_count_vectorizer': 'False',
 'alpha': '301.93720962923425',
 'unit_of_analysis': 'title_and_10_sentences',
 'n_gram_range_start': '1',
 'm_neighbors': '7',
 'n_gram_range_end': '3'}

In [41]:
runs = client.search_runs(
    experiment_ids='656382851946062557', 
    filter_string=test_query.format(analysis_level='outer_cv'),
    run_view_type=ViewType.ALL)

In [49]:
runs[0].info

<RunInfo: artifact_uri='/scratch/jberme/tune_russian/mlruns/656382851946062557/9847672f5c574ca6ae4963c15a7851a5/artifacts', end_time=1674322806703, experiment_id='656382851946062557', lifecycle_stage='active', run_id='9847672f5c574ca6ae4963c15a7851a5', run_uuid='9847672f5c574ca6ae4963c15a7851a5', start_time=1674322805914, status='FINISHED', user_id='jberme'>

In [55]:
runs[0].data.tags

{'mlflow.source.name': 'benchmark_subtask_2.py',
 'mlflow.user': 'jberme',
 'mlflow.runName': 'honorable-wolf-618',
 'mlflow.source.git.commit': '3dc0c6739468e9b07b8dd09a5611b105d391ef5a',
 'mlflow.source.type': 'LOCAL'}

In [48]:
runs[0].info.run_uuid

'9847672f5c574ca6ae4963c15a7851a5'

In [57]:
runs[0]

<Run: data=<RunData: metrics={'test_accuracy': 0.03225806451612903,
 'test_f1_macro': 0.2727723308052808,
 'test_f1_micro': 0.3623497081652117,
 'test_precision_macro': 0.24911713731277244,
 'test_precision_micro': 0.3358132789723725,
 'test_recall_macro': 0.34139809415202177,
 'test_recall_micro': 0.4267515923566879,
 'train_accuracy': 0.17054263565891473,
 'train_f1_macro': 0.5289355975824069,
 'train_f1_micro': 0.5736494831238187,
 'train_precision_macro': 0.44566856489109563,
 'train_precision_micro': 0.49831346568861945,
 'train_recall_macro': 0.717208310100105,
 'train_recall_micro': 0.7371794871794872}, params={'alpha': '301.93720962923425',
 'analysis_level': 'outer_cv',
 'base_estimator_name': "Pipeline(steps=[('up', BorderlineSMOTE()),\n"
                        "                ('preproc', "
                        'StandardScaler(with_mean=False)),\n'
                        "                ('model', "
                        'RidgeClassifier(max_iter=100000))])',
 'class_

In [28]:
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]

In [None]:
all_experiments