In [1]:
import os
import glob

from IPython.display import display, Markdown
import pandas as pd

# Group the experiment results by model category (experiment type and model type), pick the
# best-performing model per unit of analysis according to the micro-averaged test F1-score,
# and report the results in tables.


In [2]:
# Collect every per-experiment results file. The matches are sorted because glob returns
# them in arbitrary (filesystem-dependent) order; a fixed order keeps the row order of the
# concatenated table, and therefore the winner of any tie when picking the best model,
# reproducible across machines and re-runs.
df_filepaths = sorted(glob.glob('./experiment_results/*.csv'))

In [3]:
# Read each results file and stack the frames row-wise into a single table.
# Each file keeps its own row labels, so labels repeat across files.
exps_df = pd.concat(list(map(pd.read_csv, df_filepaths)))

In [4]:
# Preview the combined results table (one row per trained model).
exps_df

Unnamed: 0,language,unit_of_analysis,model_type,model_subtype,model_name,train_f1_micro,train_f1_macro,train_accuracy,train_precision_micro,train_precision_macro,train_recall_micro,train_recall_macro,test_f1_micro,test_f1_macro,test_accuracy,test_precision_micro,test_precision_macro,test_recall_micro,test_recall_macro
0,ru,title_and_first_paragraph,Dummy Classifier,No Upsampling,DummyProbSampling,0.232641,0.177376,0.000000,0.232823,0.178356,0.234987,0.178365,0.188977,0.134921,0.000000,0.184040,0.125521,0.209302,0.162075
1,ru,title_and_first_paragraph,Dummy Classifier,No Upsampling,DummyUniformSampling,0.312392,0.263638,0.000000,0.239906,0.189308,0.522193,0.531628,0.272470,0.215776,0.000000,0.210352,0.155106,0.476744,0.446939
2,ru,title_and_first_paragraph,Dummy Classifier,No Upsampling,DummyMostFrequent,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ru,title_and_first_paragraph,KNN,BorderlineSMOTE,kNN_BorderlineSMOTE,0.411710,0.357731,0.000000,0.280950,0.249591,0.934726,0.864706,0.347942,0.254417,0.000000,0.224870,0.160999,0.837209,0.689031
4,ru,title_and_first_paragraph,KNN,No Upsampling,kNN,0.237011,0.146046,0.150327,0.369253,0.279661,0.232376,0.139411,0.140206,0.085776,0.026316,0.281469,0.202948,0.104651,0.060629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,en,title_and_5_sentences,XGBoost,BorderlineSMOTE,XGBoost_narrow_BorderlineSMOTE,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.603388,0.392899,0.038835,0.659811,0.450338,0.567237,0.361105
57,en,title_and_5_sentences,XGBoost,No Upsampling,XGBoost_narrow,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.602439,0.387458,0.048544,0.660854,0.451528,0.574572,0.360803
58,en,title_and_5_sentences,XGBoost,Random Oversampling,XGBoost_narrow_ROS,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.610316,0.404632,0.019417,0.655220,0.470681,0.589242,0.384710
59,en,title_and_5_sentences,XGBoost,SMOTE,XGBoost_narrow_SMOTE,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.606402,0.404920,0.038835,0.651281,0.475135,0.586797,0.380137


In [5]:
# Remove any leading/trailing pilcrow characters from the model type labels
# (presumably left over from copied notebook headings - confirm against the results files).
exps_df['model_type'] = exps_df['model_type'].str.strip('¶')

In [6]:
# Sanity check: (rows, columns) of the combined results table.
exps_df.shape

(2156, 19)

In [7]:
def get_best_models(df, grouping_criterion, metric='test_f1_micro'):
    """Return the best-scoring row of ``df`` for every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Experiment results. Must contain the ``grouping_criterion`` columns and ``metric``.
    grouping_criterion : list of str
        Columns that define a group; they become the index of the returned frame.
    metric : str, default 'test_f1_micro'
        Column whose maximum selects the winning row of each group.

    Returns
    -------
    pandas.DataFrame
        One row per group (the first row that reaches the group's maximum), indexed by
        ``grouping_criterion``. The previous index of ``df`` is kept as a regular column.
        Groups for which no maximum can be determined are left out.
    """
    # Reset the index once: the incoming frame may carry repeated row labels, and
    # idxmax must return labels that identify exactly one row each.
    flat_df = df.reset_index()
    best_row_labels = flat_df.groupby(grouping_criterion)[metric].idxmax().dropna()
    return flat_df.loc[best_row_labels].set_index(grouping_criterion)

### Generate the tables to report

In [8]:
def display_performance_table(df, metric, index_cols=['model_type'], display_=True):
    """Build (and optionally display) a table of one test metric per unit of analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Results to report. After ``reset_index()`` it must provide the ``index_cols``
        columns, ``unit_of_analysis`` and ``test_<metric>``.
    metric : str
        Metric suffix; the values are read from the ``test_<metric>`` column.
    index_cols : list of str, default ['model_type']
        Columns that identify a row of the table.
    display_ : bool, default True
        Whether to render the table as markdown in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per ``index_cols`` combination and one column per unit of analysis
        (in the fixed reporting order below). Cells hold the score formatted with three
        decimals; every cell equal to the table-wide maximum is wrapped in ``**``.
        Combinations without a result are filled with 0.
    """
    index_cols = list(index_cols)
    report_table = df.reset_index()
    report_table['result'] = report_table[f'test_{metric}'].map(lambda x: f'{x:.3f}')
    report_table['col_title'] = report_table.unit_of_analysis.str.split('_').str.join(' ')
    # An ordered categorical fixes the column order of the pivoted table. Units of analysis
    # that are not listed here become missing titles and do not appear in the table.
    report_table['col_title'] = pd.Categorical(
        report_table.col_title,
        categories=['title', 'title and first paragraph', 'title and 5 sentences', 'title and 10 sentences',
                    'title and first sentence each paragraph', 'raw text'],
        ordered=True)
    report_table = report_table[index_cols + ['col_title', 'result']]\
        .pivot_table(index=index_cols, columns=['col_title'], values=['result'], aggfunc='first', fill_value=0)\
        .droplevel(0, axis=1)

    report_table.columns.names = [None]

    # Highlight every cell that reaches the best (rounded) score of the whole table.
    # The highlighted cells are written into an explicit copy and the frame is rebuilt,
    # instead of relying on ``to_numpy()`` handing back a writable view of the frame.
    scores = report_table.astype(float).to_numpy()
    if scores.size:
        highlight_mask = scores == scores.max()
        cells = report_table.to_numpy(dtype=object, copy=True)
        cells[highlight_mask] = [f'**{cell}**' for cell in cells[highlight_mask]]
        report_table = pd.DataFrame(cells, index=report_table.index, columns=report_table.columns)

    if display_:
        # disable_numparse keeps the three-decimal strings exactly as formatted
        # (otherwise tabulate re-parses them as numbers and drops trailing zeros).
        display(Markdown(report_table.to_markdown(disable_numparse=True)))

    return report_table

### Generate tables for all languages

In [9]:
# Metric suffixes to report; the tables read each one from the `test_<metric>` results column.
metrics_to_report = ['f1_micro', 'recall_micro', 'precision_micro', 'accuracy'] 

In [10]:
# Maps the codes found in the `language` column to the names used in headings and file names.
# NOTE(review): 'po' and 'ge' are not the ISO 639-1 codes (pl, de); presumably they match the
# codes stored in the results files - confirm before changing them.
language_dict = {'en': 'English', 'it': 'Italian', 'fr': 'French', 'po': 'Polish', 'ru': 'Russian', 'ge': 'German'}

In [11]:
def _export_report_table(report_table, output_dir, metric, file_stem):
    """Write ``report_table`` as markdown, LaTeX and csv under ``<output_dir>/<metric>/<format>/``."""
    output_dir_markdown = os.path.join(output_dir, metric, 'markdown')
    output_dir_latex = os.path.join(output_dir, metric, 'latex')
    output_dir_csv = os.path.join(output_dir, metric, 'csv')
    for format_dir in (output_dir_markdown, output_dir_latex, output_dir_csv):
        os.makedirs(format_dir, exist_ok=True)

    flat_table = report_table.reset_index()

    # Context managers close the files even if an export raises.
    with open(os.path.join(output_dir_markdown, f"{file_stem}.md"), "w") as markdown_file:
        # disable_numparse keeps the three-decimal strings exactly as formatted.
        flat_table.to_markdown(markdown_file, index=False, disable_numparse=True)

    with open(os.path.join(output_dir_latex, f"{file_stem}.tex"), "w") as latex_file:
        flat_table.to_latex(latex_file, index=False)

    # The csv keeps the table's index (written as leading columns).
    report_table.to_csv(os.path.join(output_dir_csv, f"{file_stem}.csv"))


def display_metrics_and_write_to_file(df, grouping_criterion, output_dir):
    """Display and export the best-model tables per language and for all languages together.

    For every language in ``df`` the best model per ``grouping_criterion`` and unit of
    analysis is selected with ``get_best_models``. For every metric in
    ``metrics_to_report`` a table is displayed and written to
    ``<output_dir>/<metric>/{markdown,latex,csv}/``. The selected runs of all languages
    are stored in ``<output_dir>/best_exp_params_metrics_per_language.csv``, and a table
    stacking all languages is displayed and exported per metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Experiment results with a ``language`` column, a ``unit_of_analysis`` column,
        the ``grouping_criterion`` columns and the ``test_<metric>`` columns.
    grouping_criterion : list of str
        Columns that identify a row of the report tables.
    output_dir : str
        Directory that receives the exported files; created if missing.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)
    best_model_dfs_list = []
    report_tables_dfs_dict = {metric: [] for metric in metrics_to_report}

    for language, results_df in df.groupby('language'):
        # Fall back to the raw code so an unmapped language does not abort the whole report.
        language_name = language_dict.get(language, language)

        best_models_df = get_best_models(results_df, grouping_criterion + ['unit_of_analysis'])
        best_model_dfs_list.append(best_models_df.copy())

        display(Markdown(f'# {language_name}'))

        for metric in metrics_to_report:
            display(Markdown(f'## {metric}'))

            report_table = display_performance_table(df=best_models_df, index_cols=grouping_criterion, metric=metric, display_=True)
            _export_report_table(report_table, output_dir, metric, f"{language_name}_{metric}")

            # Stack all languages into single table (built on a new frame so the table
            # that was just exported is not modified).
            stacked_table = report_table.reset_index()\
                .assign(language=language)\
                .set_index(['language'] + grouping_criterion)
            report_tables_dfs_dict[metric].append(stacked_table)

    # Store a csv with all the metrics and parameters of the best runs per language per grouping_criterion
    pd.concat(best_model_dfs_list)\
        .reset_index().set_index(['language'] + grouping_criterion).sort_index()\
        .to_csv(os.path.join(output_dir, 'best_exp_params_metrics_per_language.csv'))

    # Report and store the unified table. The heading and the "all_6_languages" file stem
    # are fixed strings, kept unchanged so existing output paths stay valid.
    display(Markdown('# All 6 Languages'))
    for metric in metrics_to_report:
        display(Markdown(f'## {metric}'))
        multi_language_report_table_metric = pd.concat(report_tables_dfs_dict[metric])
        display(Markdown(
            multi_language_report_table_metric.reset_index().to_markdown(index=False, disable_numparse=True)))

        _export_report_table(multi_language_report_table_metric, output_dir, metric, f"all_6_languages_{metric}")

# Per model type

In [12]:
# Display and export the tables with one row per model type; files go to ./per_model_type_tables.
display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type'], output_dir='per_model_type_tables')

# English

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.427 |                       0.459 |                   0.437 |                    0.433 |                                     0.43  | 0.435      |
| KNN                          |   0.49  |                       0.54  |                   0.527 |                    0.478 |                                     0.466 | 0.543      |
| LinearSVM                    |   0.593 |                       0.612 |                   0.639 |                    0.651 |                                     0.649 | 0.679      |
| LogisticRegression           |   0.594 |                       0.618 |                   0.628 |                    0.653 |                                     0.659 | 0.694      |
| LogisticRegressionElasticNet |   0.604 |                       0.612 |                   0.638 |                    0.64  |                                     0.669 | 0.685      |
| LogisticRegressionLasso      |   0.555 |                       0.615 |                   0.624 |                    0.624 |                                     0.655 | 0.645      |
| LogisticRegressionRidge      |   0.61  |                       0.629 |                   0.643 |                    0.669 |                                     0.659 | 0.700      |
| NaiveBayes                   |   0.631 |                       0.695 |                   0.718 |                    0.712 |                                     0.713 | **0.719**  |
| RandomForest                 |   0.581 |                       0.616 |                   0.632 |                    0.654 |                                     0.663 | 0.656      |
| RidgeClassifier              |   0.602 |                       0.64  |                   0.636 |                    0.66  |                                     0.656 | 0.689      |
| SVM                          |   0.467 |                       0.472 |                   0.489 |                    0.523 |                                     0.541 | 0.569      |
| XGBoost                      |   0.526 |                       0.6   |                   0.615 |                    0.616 |                                     0.637 | 0.648      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| Dummy Classifier             |   0.482 |                       0.533 |                   0.499 | 0.416                    |                                     0.491 |      0.491 |
| KNN                          |   0.792 |                       0.719 |                   0.68  | **0.829**                |                                     0.8   |      0.523 |
| LinearSVM                    |   0.592 |                       0.577 |                   0.587 | 0.597                    |                                     0.606 |      0.641 |
| LogisticRegression           |   0.555 |                       0.589 |                   0.582 | 0.606                    |                                     0.623 |      0.658 |
| LogisticRegressionElasticNet |   0.56  |                       0.572 |                   0.582 | 0.589                    |                                     0.636 |      0.631 |
| LogisticRegressionLasso      |   0.482 |                       0.577 |                   0.575 | 0.587                    |                                     0.623 |      0.601 |
| LogisticRegressionRidge      |   0.589 |                       0.592 |                   0.601 | 0.626                    |                                     0.616 |      0.66  |
| NaiveBayes                   |   0.667 |                       0.719 |                   0.753 | 0.751                    |                                     0.76  |      0.76  |
| RandomForest                 |   0.518 |                       0.567 |                   0.577 | 0.604                    |                                     0.604 |      0.609 |
| RidgeClassifier              |   0.57  |                       0.614 |                   0.587 | 0.621                    |                                     0.614 |      0.653 |
| SVM                          |   0.45  |                       0.428 |                   0.457 | 0.479                    |                                     0.516 |      0.535 |
| XGBoost                      |   0.479 |                       0.567 |                   0.584 | 0.570                    |                                     0.592 |      0.614 |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences | title and first sentence each paragraph   |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|:------------------------------------------|-----------:|
| Dummy Classifier             |   0.425 |                       0.443 |                   0.428 |                    0.46  | 0.418                                     |      0.426 |
| KNN                          |   0.519 |                       0.506 |                   0.552 |                    0.456 | 0.365                                     |      0.668 |
| LinearSVM                    |   0.619 |                       0.69  |                   0.734 |                    0.745 | 0.734                                     |      0.746 |
| LogisticRegression           |   0.647 |                       0.669 |                   0.719 |                    0.736 | 0.723                                     |      0.751 |
| LogisticRegressionElasticNet |   0.664 |                       0.684 |                   0.729 |                    0.721 | 0.731                                     |      0.77  |
| LogisticRegressionLasso      |   0.674 |                       0.679 |                   0.697 |                    0.685 | 0.702                                     |      0.714 |
| LogisticRegressionRidge      |   0.645 |                       0.708 |                   0.71  |                    0.737 | 0.737                                     |      0.761 |
| NaiveBayes                   |   0.626 |                       0.697 |                   0.709 |                    0.701 | 0.698                                     |      0.707 |
| RandomForest                 |   0.698 |                       0.714 |                   0.736 |                    0.788 | **0.808**                                 |      0.749 |
| RidgeClassifier              |   0.645 |                       0.69  |                   0.722 |                    0.72  | 0.721                                     |      0.748 |
| SVM                          |   0.631 |                       0.759 |                   0.712 |                    0.644 | 0.699                                     |      0.719 |
| XGBoost                      |   0.636 |                       0.647 |                   0.664 |                    0.717 | 0.707                                     |      0.696 |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|:-----------|
| Dummy Classifier             |   0     |                       0     |                   0     | 0.000                    |                                     0     | 0.000      |
| KNN                          |   0     |                       0     |                   0     | 0.000                    |                                     0     | 0.000      |
| LinearSVM                    |   0.039 |                       0.058 |                   0.087 | 0.107                    |                                     0.117 | 0.107      |
| LogisticRegression           |   0.078 |                       0.029 |                   0.078 | 0.078                    |                                     0.087 | 0.087      |
| LogisticRegressionElasticNet |   0.097 |                       0.068 |                   0.078 | 0.078                    |                                     0.078 | 0.107      |
| LogisticRegressionLasso      |   0.068 |                       0.068 |                   0.078 | 0.039                    |                                     0.068 | 0.058      |
| LogisticRegressionRidge      |   0.087 |                       0.068 |                   0.087 | 0.097                    |                                     0.097 | 0.087      |
| NaiveBayes                   |   0.029 |                       0.029 |                   0.039 | 0.039                    |                                     0.019 | 0.029      |
| RandomForest                 |   0.068 |                       0.078 |                   0.117 | **0.146**                |                                     0.136 | **0.146**  |
| RidgeClassifier              |   0.078 |                       0.039 |                   0.087 | 0.078                    |                                     0.087 | 0.078      |
| SVM                          |   0.049 |                       0.058 |                   0.058 | 0.058                    |                                     0.078 | 0.097      |
| XGBoost                      |   0.029 |                       0.039 |                   0.058 | 0.087                    |                                     0.068 | 0.107      |

  report_table.reset_index().to_latex(latex_file, index=False)


# French

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.389 |                       0.365 |                   0.341 |                    0.329 |                                     0.371 | 0.332      |
| KNN                          |   0.414 |                       0.435 |                   0.436 |                    0.433 |                                     0.348 | 0.436      |
| LinearSVM                    |   0.248 |                       0.319 |                   0.322 |                    0.251 |                                     0.327 | 0.311      |
| LogisticRegression           |   0.317 |                       0.349 |                   0.327 |                    0.256 |                                     0.336 | 0.337      |
| LogisticRegressionElasticNet |   0.241 |                       0.365 |                   0.339 |                    0.301 |                                     0.361 | 0.351      |
| LogisticRegressionLasso      |   0.281 |                       0.44  |                   0.365 |                    0.298 |                                     0.389 | 0.430      |
| LogisticRegressionRidge      |   0.332 |                       0.385 |                   0.327 |                    0.308 |                                     0.315 | 0.323      |
| NaiveBayes                   |   0.384 |                       0.434 |                   0.447 |                    0.482 |                                     0.472 | **0.556**  |
| RandomForest                 |   0.271 |                       0.308 |                   0.272 |                    0.284 |                                     0.314 | 0.343      |
| RidgeClassifier              |   0.355 |                       0.394 |                   0.324 |                    0.292 |                                     0.312 | 0.328      |
| SVM                          |   0.297 |                       0.049 |                   0.094 |                    0.044 |                                     0.063 | 0.015      |
| XGBoost                      |   0.237 |                       0.339 |                   0.313 |                    0.303 |                                     0.344 | 0.419      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.563 |                       0.532 |                   0.492 | 0.468                    |                                     0.532 | 0.484      |
| KNN                          |   0.865 |                       0.968 |                   0.992 | **1.000**                |                                     0.659 | **1.000**  |
| LinearSVM                    |   0.238 |                       0.286 |                   0.302 | 0.206                    |                                     0.286 | 0.270      |
| LogisticRegression           |   0.341 |                       0.317 |                   0.286 | 0.206                    |                                     0.294 | 0.286      |
| LogisticRegressionElasticNet |   0.214 |                       0.333 |                   0.302 | 0.230                    |                                     0.317 | 0.302      |
| LogisticRegressionLasso      |   0.246 |                       0.397 |                   0.325 | 0.270                    |                                     0.389 | 0.421      |
| LogisticRegressionRidge      |   0.357 |                       0.373 |                   0.317 | 0.254                    |                                     0.278 | 0.278      |
| NaiveBayes                   |   0.429 |                       0.532 |                   0.524 | 0.563                    |                                     0.571 | 0.690      |
| RandomForest                 |   0.246 |                       0.278 |                   0.238 | 0.246                    |                                     0.278 | 0.325      |
| RidgeClassifier              |   0.381 |                       0.381 |                   0.317 | 0.246                    |                                     0.278 | 0.294      |
| SVM                          |   0.317 |                       0.063 |                   0.079 | 0.032                    |                                     0.063 | 0.008      |
| XGBoost                      |   0.23  |                       0.381 |                   0.294 | 0.278                    |                                     0.365 | 0.405      |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| Dummy Classifier             |   0.317 |                       0.3   |                   0.281 | 0.300                    |                                     0.308 |      0.292 |
| KNN                          |   0.292 |                       0.3   |                   0.297 | 0.295                    |                                     0.317 |      0.297 |
| LinearSVM                    |   0.335 |                       0.4   |                   0.43  | 0.383                    |                                     0.425 |      0.396 |
| LogisticRegression           |   0.364 |                       0.442 |                   0.465 | 0.394                    |                                     0.425 |      0.448 |
| LogisticRegressionElasticNet |   0.358 |                       0.569 |                   0.443 | **0.588**                |                                     0.462 |      0.482 |
| LogisticRegressionLasso      |   0.458 |                       0.583 |                   0.523 | 0.461                    |                                     0.439 |      0.468 |
| LogisticRegressionRidge      |   0.342 |                       0.463 |                   0.41  | 0.499                    |                                     0.42  |      0.485 |
| NaiveBayes                   |   0.412 |                       0.4   |                   0.423 | 0.468                    |                                     0.45  |      0.519 |
| RandomForest                 |   0.446 |                       0.399 |                   0.467 | 0.504                    |                                     0.412 |      0.405 |
| RidgeClassifier              |   0.37  |                       0.494 |                   0.382 | 0.466                    |                                     0.403 |      0.482 |
| SVM                          |   0.341 |                       0.112 |                   0.319 | 0.070                    |                                     0.063 |      0.087 |
| XGBoost                      |   0.339 |                       0.355 |                   0.417 | 0.394                    |                                     0.379 |      0.502 |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0     |                       0     |                   0     |                    0     |                                     0     | 0.000      |
| KNN                          |   0     |                       0     |                   0     |                    0     |                                     0     | 0.000      |
| LinearSVM                    |   0     |                       0.048 |                   0.048 |                    0.071 |                                     0.024 | 0.071      |
| LogisticRegression           |   0.024 |                       0.048 |                   0.048 |                    0.071 |                                     0.048 | 0.071      |
| LogisticRegressionElasticNet |   0     |                       0.048 |                   0     |                    0.071 |                                     0.071 | **0.143**  |
| LogisticRegressionLasso      |   0     |                       0.048 |                   0.024 |                    0.024 |                                     0.024 | 0.119      |
| LogisticRegressionRidge      |   0     |                       0.048 |                   0.024 |                    0.071 |                                     0     | 0.095      |
| NaiveBayes                   |   0.024 |                       0     |                   0     |                    0     |                                     0.024 | 0.000      |
| RandomForest                 |   0     |                       0     |                   0     |                    0.048 |                                     0.024 | 0.071      |
| RidgeClassifier              |   0     |                       0.048 |                   0.024 |                    0.071 |                                     0     | 0.095      |
| SVM                          |   0     |                       0     |                   0     |                    0     |                                     0.048 | 0.000      |
| XGBoost                      |   0     |                       0     |                   0.024 |                    0     |                                     0.048 | 0.024      |

  report_table.reset_index().to_latex(latex_file, index=False)


# German

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.434 |                       0.453 |                   0.449 |                    0.423 |                                     0.474 | 0.443      |
| KNN                          |   0.44  |                       0.513 |                   0.509 |                    0.518 |                                     0.443 | 0.407      |
| LinearSVM                    |   0.338 |                       0.365 |                   0.395 |                    0.382 |                                     0.426 | 0.459      |
| LogisticRegression           |   0.34  |                       0.401 |                   0.409 |                    0.413 |                                     0.441 | 0.466      |
| LogisticRegressionElasticNet |   0.284 |                       0.392 |                   0.419 |                    0.397 |                                     0.431 | 0.491      |
| LogisticRegressionLasso      |   0.217 |                       0.402 |                   0.429 |                    0.389 |                                     0.467 | 0.521      |
| LogisticRegressionRidge      |   0.35  |                       0.481 |                   0.47  |                    0.437 |                                     0.461 | 0.449      |
| NaiveBayes                   |   0.376 |                       0.555 |                   0.556 |                    0.572 |                                     0.551 | **0.585**  |
| RandomForest                 |   0.324 |                       0.55  |                   0.409 |                    0.405 |                                     0.418 | 0.436      |
| RidgeClassifier              |   0.347 |                       0.504 |                   0.46  |                    0.436 |                                     0.451 | 0.457      |
| SVM                          |   0.417 |                       0.199 |                   0.274 |                    0.359 |                                     0.32  | 0.424      |
| XGBoost                      |   0.29  |                       0.543 |                   0.402 |                    0.395 |                                     0.442 | 0.501      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| Dummy Classifier             |   0.517 |                       0.5   |                   0.494 | 0.459                    |                                     0.529 |      0.523 |
| KNN                          |   0.686 |                       0.756 |                   0.767 | **0.860**                |                                     0.767 |      0.756 |
| LinearSVM                    |   0.302 |                       0.331 |                   0.355 | 0.343                    |                                     0.419 |      0.442 |
| LogisticRegression           |   0.308 |                       0.378 |                   0.378 | 0.372                    |                                     0.424 |      0.448 |
| LogisticRegressionElasticNet |   0.25  |                       0.36  |                   0.39  | 0.366                    |                                     0.424 |      0.494 |
| LogisticRegressionLasso      |   0.145 |                       0.366 |                   0.384 | 0.343                    |                                     0.436 |      0.506 |
| LogisticRegressionRidge      |   0.349 |                       0.477 |                   0.459 | 0.413                    |                                     0.453 |      0.436 |
| NaiveBayes                   |   0.314 |                       0.605 |                   0.581 | 0.599                    |                                     0.663 |      0.698 |
| RandomForest                 |   0.291 |                       0.576 |                   0.378 | 0.372                    |                                     0.378 |      0.395 |
| RidgeClassifier              |   0.343 |                       0.494 |                   0.448 | 0.419                    |                                     0.448 |      0.442 |
| SVM                          |   0.483 |                       0.25  |                   0.314 | 0.436                    |                                     0.372 |      0.547 |
| XGBoost                      |   0.279 |                       0.669 |                   0.343 | 0.343                    |                                     0.401 |      0.453 |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.403 |                       0.449 |                   0.444 |                    0.442 |                                     0.462 | 0.408      |
| KNN                          |   0.433 |                       0.452 |                   0.485 |                    0.507 |                                     0.439 | 0.420      |
| LinearSVM                    |   0.477 |                       0.451 |                   0.506 |                    0.458 |                                     0.464 | 0.525      |
| LogisticRegression           |   0.474 |                       0.466 |                   0.53  |                    0.518 |                                     0.485 | 0.521      |
| LogisticRegressionElasticNet |   0.416 |                       0.507 |                   0.489 |                    0.46  |                                     0.467 | 0.507      |
| LogisticRegressionLasso      |   0.451 |                       0.553 |                   0.524 |                    0.493 |                                     0.523 | 0.587      |
| LogisticRegressionRidge      |   0.45  |                       0.528 |                   0.501 |                    0.494 |                                     0.519 | 0.484      |
| NaiveBayes                   |   0.505 |                       0.539 |                   0.557 |                    0.572 |                                     0.498 | 0.531      |
| RandomForest                 |   0.478 |                       0.59  |                   0.503 |                    0.476 |                                     0.55  | 0.496      |
| RidgeClassifier              |   0.451 |                       0.556 |                   0.492 |                    0.486 |                                     0.498 | 0.493      |
| SVM                          |   0.396 |                       0.207 |                   0.288 |                    0.441 |                                     0.31  | 0.351      |
| XGBoost                      |   0.502 |                       0.502 |                   0.553 |                    0.516 |                                     0.608 | **0.612**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------------------------|--------:|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:-----------|
| Dummy Classifier             |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| KNN                          |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| LinearSVM                    |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| LogisticRegression           |       0 | 0.000                       | 0.000                   | **0.029**                | 0.000                                     | 0.000      |
| LogisticRegressionElasticNet |       0 | **0.029**                   | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| LogisticRegressionLasso      |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| LogisticRegressionRidge      |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| NaiveBayes                   |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| RandomForest                 |       0 | 0.000                       | 0.000                   | **0.029**                | 0.000                                     | **0.029**  |
| RidgeClassifier              |       0 | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| SVM                          |       0 | 0.000                       | **0.029**               | 0.000                    | **0.029**                                 | 0.000      |
| XGBoost                      |       0 | **0.029**                   | 0.000                   | 0.000                    | 0.000                                     | 0.000      |

  report_table.reset_index().to_latex(latex_file, index=False)


# Italian

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.388 |                       0.395 |                   0.376 |                    0.386 |                                     0.36  | 0.379      |
| KNN                          |   0.523 |                       0.491 |                   0.502 |                    0.502 |                                     0.459 | 0.503      |
| LinearSVM                    |   0.406 |                       0.401 |                   0.387 |                    0.432 |                                     0.42  | 0.476      |
| LogisticRegression           |   0.446 |                       0.391 |                   0.395 |                    0.433 |                                     0.442 | 0.486      |
| LogisticRegressionElasticNet |   0.365 |                       0.395 |                   0.407 |                    0.433 |                                     0.496 | 0.482      |
| LogisticRegressionLasso      |   0.343 |                       0.42  |                   0.399 |                    0.457 |                                     0.481 | 0.487      |
| LogisticRegressionRidge      |   0.431 |                       0.439 |                   0.417 |                    0.466 |                                     0.435 | 0.473      |
| NaiveBayes                   |   0.509 |                       0.543 |                   0.553 |                    0.589 |                                     0.594 | **0.629**  |
| RandomForest                 |   0.351 |                       0.457 |                   0.43  |                    0.476 |                                     0.446 | 0.502      |
| RidgeClassifier              |   0.45  |                       0.431 |                   0.412 |                    0.459 |                                     0.435 | 0.508      |
| SVM                          |   0.359 |                       0.033 |                   0.041 |                    0.218 |                                     0.166 | 0.338      |
| XGBoost                      |   0.308 |                       0.456 |                   0.401 |                    0.431 |                                     0.452 | 0.488      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph | title and 5 sentences   | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|:------------------------|:-------------------------|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.487 |                       0.5   | 0.461                   | 0.374                    |                                     0.47  | 0.487      |
| KNN                          |   0.891 |                       0.904 | **1.000**               | **1.000**                |                                     0.839 | **1.000**  |
| LinearSVM                    |   0.374 |                       0.374 | 0.330                   | 0.365                    |                                     0.374 | 0.417      |
| LogisticRegression           |   0.43  |                       0.357 | 0.343                   | 0.374                    |                                     0.391 | 0.439      |
| LogisticRegressionElasticNet |   0.309 |                       0.357 | 0.339                   | 0.361                    |                                     0.435 | 0.443      |
| LogisticRegressionLasso      |   0.283 |                       0.361 | 0.348                   | 0.400                    |                                     0.461 | 0.470      |
| LogisticRegressionRidge      |   0.426 |                       0.413 | 0.378                   | 0.409                    |                                     0.383 | 0.417      |
| NaiveBayes                   |   0.552 |                       0.622 | 0.622                   | 0.613                    |                                     0.691 | 0.717      |
| RandomForest                 |   0.304 |                       0.439 | 0.391                   | 0.435                    |                                     0.374 | 0.435      |
| RidgeClassifier              |   0.448 |                       0.409 | 0.378                   | 0.404                    |                                     0.387 | 0.452      |
| SVM                          |   0.3   |                       0.017 | 0.022                   | 0.174                    |                                     0.209 | 0.335      |
| XGBoost                      |   0.27  |                       0.435 | 0.400                   | 0.413                    |                                     0.413 | 0.443      |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.35  |                       0.362 |                   0.352 |                    0.413 |                                     0.324 | 0.352      |
| KNN                          |   0.387 |                       0.357 |                   0.352 |                    0.352 |                                     0.348 | 0.353      |
| LinearSVM                    |   0.471 |                       0.449 |                   0.505 |                    0.575 |                                     0.545 | 0.652      |
| LogisticRegression           |   0.487 |                       0.474 |                   0.501 |                    0.577 |                                     0.571 | 0.633      |
| LogisticRegressionElasticNet |   0.472 |                       0.485 |                   0.574 |                    0.563 |                                     0.656 | 0.601      |
| LogisticRegressionLasso      |   0.523 |                       0.526 |                   0.494 |                    0.583 |                                     0.517 | 0.564      |
| LogisticRegressionRidge      |   0.466 |                       0.496 |                   0.502 |                    0.595 |                                     0.584 | 0.585      |
| NaiveBayes                   |   0.488 |                       0.501 |                   0.519 |                    0.583 |                                     0.533 | 0.583      |
| RandomForest                 |   0.448 |                       0.541 |                   0.514 |                    0.599 |                                     0.581 | 0.629      |
| RidgeClassifier              |   0.485 |                       0.479 |                   0.487 |                    0.587 |                                     0.582 | **0.666**  |
| SVM                          |   0.474 |                       0.313 |                   0.343 |                    0.453 |                                     0.291 | 0.430      |
| XGBoost                      |   0.381 |                       0.527 |                   0.411 |                    0.46  |                                     0.512 | 0.575      |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0     |                       0     |                   0     |                    0     |                                     0     | 0.000      |
| KNN                          |   0     |                       0     |                   0     |                    0     |                                     0     | 0.000      |
| LinearSVM                    |   0.067 |                       0.05  |                   0.067 |                    0.067 |                                     0.067 | 0.150      |
| LogisticRegression           |   0.067 |                       0.067 |                   0.067 |                    0.067 |                                     0.033 | 0.150      |
| LogisticRegressionElasticNet |   0.033 |                       0.067 |                   0.05  |                    0.1   |                                     0.067 | **0.167**  |
| LogisticRegressionLasso      |   0.017 |                       0.067 |                   0.017 |                    0.083 |                                     0.033 | 0.083      |
| LogisticRegressionRidge      |   0.033 |                       0.083 |                   0.1   |                    0.1   |                                     0.05  | 0.133      |
| NaiveBayes                   |   0     |                       0.017 |                   0.05  |                    0.1   |                                     0.033 | 0.067      |
| RandomForest                 |   0.033 |                       0.033 |                   0.083 |                    0.1   |                                     0.083 | 0.133      |
| RidgeClassifier              |   0.017 |                       0.083 |                   0.1   |                    0.083 |                                     0.05  | 0.150      |
| SVM                          |   0.05  |                       0.033 |                   0.017 |                    0.067 |                                     0.017 | 0.083      |
| XGBoost                      |   0     |                       0.017 |                   0.017 |                    0.033 |                                     0.017 | 0.117      |

  report_table.reset_index().to_latex(latex_file, index=False)


# Polish

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.501 |                       0.454 |                   0.491 |                    0.427 |                                     0.461 | 0.485      |
| KNN                          |   0.534 |                       0.444 |                   0.452 |                    0.342 |                                     0.337 | 0.325      |
| LinearSVM                    |   0.531 |                       0.454 |                   0.503 |                    0.465 |                                     0.545 | 0.550      |
| LogisticRegression           |   0.538 |                       0.471 |                   0.519 |                    0.492 |                                     0.537 | 0.579      |
| LogisticRegressionElasticNet |   0.513 |                       0.47  |                   0.486 |                    0.482 |                                     0.559 | 0.581      |
| LogisticRegressionLasso      |   0.412 |                       0.484 |                   0.491 |                    0.509 |                                     0.57  | 0.566      |
| LogisticRegressionRidge      |   0.546 |                       0.48  |                   0.539 |                    0.504 |                                     0.581 | 0.577      |
| NaiveBayes                   |   0.526 |                       0.557 |                   0.589 |                    0.615 |                                     0.615 | **0.666**  |
| RandomForest                 |   0.516 |                       0.492 |                   0.48  |                    0.477 |                                     0.555 | 0.609      |
| RidgeClassifier              |   0.538 |                       0.472 |                   0.526 |                    0.489 |                                     0.572 | 0.594      |
| SVM                          |   0.489 |                       0.323 |                   0.351 |                    0.435 |                                     0.445 | 0.437      |
| XGBoost                      |   0.48  |                       0.49  |                   0.489 |                    0.505 |                                     0.548 | 0.624      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.578 |                       0.466 |                   0.529 |                    0.417 |                                     0.5   | 0.549      |
| KNN                          |   0.714 |                       0.549 |                   0.704 |                    0.641 |                                     0.33  | 0.636      |
| LinearSVM                    |   0.519 |                       0.427 |                   0.505 |                    0.437 |                                     0.524 | 0.529      |
| LogisticRegression           |   0.524 |                       0.451 |                   0.524 |                    0.466 |                                     0.515 | 0.553      |
| LogisticRegressionElasticNet |   0.5   |                       0.456 |                   0.485 |                    0.456 |                                     0.568 | 0.587      |
| LogisticRegressionLasso      |   0.345 |                       0.456 |                   0.51  |                    0.485 |                                     0.549 | 0.544      |
| LogisticRegressionRidge      |   0.529 |                       0.471 |                   0.558 |                    0.476 |                                     0.568 | 0.549      |
| NaiveBayes                   |   0.505 |                       0.592 |                   0.621 |                    0.655 |                                     0.684 | **0.830**  |
| RandomForest                 |   0.5   |                       0.495 |                   0.485 |                    0.461 |                                     0.471 | 0.544      |
| RidgeClassifier              |   0.524 |                       0.461 |                   0.534 |                    0.456 |                                     0.549 | 0.563      |
| SVM                          |   0.505 |                       0.379 |                   0.403 |                    0.539 |                                     0.563 | 0.529      |
| XGBoost                      |   0.481 |                       0.505 |                   0.476 |                    0.519 |                                     0.505 | 0.563      |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.472 |                       0.463 |                   0.506 |                    0.454 |                                     0.453 | 0.464      |
| KNN                          |   0.525 |                       0.418 |                   0.587 |                    0.349 |                                     0.571 | 0.225      |
| LinearSVM                    |   0.57  |                       0.526 |                   0.579 |                    0.572 |                                     0.675 | 0.650      |
| LogisticRegression           |   0.58  |                       0.534 |                   0.562 |                    0.608 |                                     0.632 | 0.750      |
| LogisticRegressionElasticNet |   0.594 |                       0.542 |                   0.581 |                    0.58  |                                     0.597 | 0.686      |
| LogisticRegressionLasso      |   0.593 |                       0.583 |                   0.524 |                    0.585 |                                     0.632 | 0.617      |
| LogisticRegressionRidge      |   0.595 |                       0.524 |                   0.566 |                    0.595 |                                     0.715 | 0.718      |
| NaiveBayes                   |   0.584 |                       0.57  |                   0.583 |                    0.601 |                                     0.575 | 0.580      |
| RandomForest                 |   0.639 |                       0.52  |                   0.517 |                    0.558 |                                     0.752 | **0.768**  |
| RidgeClassifier              |   0.583 |                       0.515 |                   0.562 |                    0.589 |                                     0.713 | 0.736      |
| SVM                          |   0.482 |                       0.43  |                   0.422 |                    0.375 |                                     0.376 | 0.417      |
| XGBoost                      |   0.523 |                       0.528 |                   0.553 |                    0.537 |                                     0.633 | 0.766      |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences | title and first sentence each paragraph   |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|:------------------------------------------|-----------:|
| Dummy Classifier             |       0 |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| KNN                          |       0 |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| LinearSVM                    |       0 |                       0.026 |                   0.026 |                    0.026 | 0.000                                     |      0.026 |
| LogisticRegression           |       0 |                       0     |                   0.026 |                    0.026 | 0.000                                     |      0.026 |
| LogisticRegressionElasticNet |       0 |                       0.026 |                   0     |                    0.026 | 0.000                                     |      0     |
| LogisticRegressionLasso      |       0 |                       0.026 |                   0     |                    0     | 0.000                                     |      0     |
| LogisticRegressionRidge      |       0 |                       0     |                   0.026 |                    0.026 | 0.026                                     |      0.026 |
| NaiveBayes                   |       0 |                       0     |                   0.026 |                    0.026 | 0.026                                     |      0     |
| RandomForest                 |       0 |                       0     |                   0.026 |                    0.051 | **0.077**                                 |      0.051 |
| RidgeClassifier              |       0 |                       0     |                   0.026 |                    0.026 | 0.026                                     |      0.026 |
| SVM                          |       0 |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| XGBoost                      |       0 |                       0     |                   0.026 |                    0     | 0.026                                     |      0.051 |

  report_table.reset_index().to_latex(latex_file, index=False)


# Russian

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| Dummy Classifier             |   0.296 |                       0.272 |                   0.284 |                    0.291 |                                     0.321 | 0.265      |
| KNN                          |   0.358 |                       0.368 |                   0.349 |                    0.349 |                                     0.349 | 0.349      |
| LinearSVM                    |   0.321 |                       0.192 |                   0.221 |                    0.253 |                                     0.242 | 0.312      |
| LogisticRegression           |   0.309 |                       0.264 |                   0.279 |                    0.261 |                                     0.295 | 0.327      |
| LogisticRegressionElasticNet |   0.308 |                       0.22  |                   0.221 |                    0.256 |                                     0.278 | 0.343      |
| LogisticRegressionLasso      |   0.299 |                       0.24  |                   0.221 |                    0.38  |                                     0.347 | 0.388      |
| LogisticRegressionRidge      |   0.335 |                       0.38  |                   0.332 |                    0.334 |                                     0.313 | 0.327      |
| NaiveBayes                   |   0.391 |                       0.435 |                   0.439 |                    0.463 |                                     0.487 | **0.504**  |
| RandomForest                 |   0.296 |                       0.23  |                   0.208 |                    0.249 |                                     0.337 | 0.318      |
| RidgeClassifier              |   0.328 |                       0.378 |                   0.301 |                    0.321 |                                     0.313 | 0.327      |
| SVM                          |   0.181 |                       0.022 |                   0.022 |                    0.027 |                                     0.048 | 0.031      |
| XGBoost                      |   0.245 |                       0.246 |                   0.249 |                    0.332 |                                     0.366 | 0.429      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|:------------------------------------------|:-----------|
| Dummy Classifier             |   0.523 |                       0.477 |                   0.477 | 0.512                    | 0.535                                     | 0.453      |
| KNN                          |   0.488 |                       0.907 |                   0.965 | **1.000**                | **1.000**                                 | **1.000**  |
| LinearSVM                    |   0.302 |                       0.163 |                   0.163 | 0.186                    | 0.186                                     | 0.233      |
| LogisticRegression           |   0.279 |                       0.244 |                   0.209 | 0.198                    | 0.256                                     | 0.244      |
| LogisticRegressionElasticNet |   0.244 |                       0.186 |                   0.174 | 0.186                    | 0.233                                     | 0.267      |
| LogisticRegressionLasso      |   0.233 |                       0.221 |                   0.186 | 0.326                    | 0.326                                     | 0.395      |
| LogisticRegressionRidge      |   0.349 |                       0.372 |                   0.267 | 0.279                    | 0.256                                     | 0.267      |
| NaiveBayes                   |   0.453 |                       0.558 |                   0.558 | 0.570                    | 0.605                                     | 0.663      |
| RandomForest                 |   0.267 |                       0.198 |                   0.151 | 0.209                    | 0.267                                     | 0.256      |
| RidgeClassifier              |   0.314 |                       0.384 |                   0.267 | 0.279                    | 0.267                                     | 0.267      |
| SVM                          |   0.163 |                       0.012 |                   0.012 | 0.035                    | 0.070                                     | 0.070      |
| XGBoost                      |   0.233 |                       0.233 |                   0.244 | 0.314                    | 0.337                                     | 0.442      |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| Dummy Classifier             |   0.212 |                       0.21  |                   0.216 | 0.219                    |                                     0.245 |      0.194 |
| KNN                          |   0.307 |                       0.236 |                   0.219 | 0.218                    |                                     0.219 |      0.219 |
| LinearSVM                    |   0.46  |                       0.298 |                   0.363 | 0.435                    |                                     0.432 |      0.503 |
| LogisticRegression           |   0.452 |                       0.344 |                   0.469 | 0.430                    |                                     0.387 |      0.527 |
| LogisticRegressionElasticNet |   0.5   |                       0.287 |                   0.383 | 0.417                    |                                     0.372 |      0.483 |
| LogisticRegressionLasso      |   0.505 |                       0.282 |                   0.291 | **0.565**                |                                     0.409 |      0.421 |
| LogisticRegressionRidge      |   0.368 |                       0.45  |                   0.494 | 0.466                    |                                     0.44  |      0.48  |
| NaiveBayes                   |   0.378 |                       0.377 |                   0.393 | 0.426                    |                                     0.434 |      0.433 |
| RandomForest                 |   0.447 |                       0.282 |                   0.378 | 0.355                    |                                     0.488 |      0.508 |
| RidgeClassifier              |   0.397 |                       0.422 |                   0.377 | 0.405                    |                                     0.412 |      0.48  |
| SVM                          |   0.211 |                       0.163 |                   0.163 | 0.021                    |                                     0.157 |      0.02  |
| XGBoost                      |   0.352 |                       0.289 |                   0.322 | 0.378                    |                                     0.427 |      0.55  |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences | title and first sentence each paragraph   |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|:------------------------------------------|-----------:|
| Dummy Classifier             |   0     |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| KNN                          |   0     |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| LinearSVM                    |   0.026 |                       0.053 |                   0.079 |                    0.105 | 0.053                                     |      0.053 |
| LogisticRegression           |   0.026 |                       0.053 |                   0.105 |                    0.105 | 0.053                                     |      0.079 |
| LogisticRegressionElasticNet |   0.026 |                       0.026 |                   0.026 |                    0.053 | 0.053                                     |      0.158 |
| LogisticRegressionLasso      |   0.026 |                       0.026 |                   0.026 |                    0.053 | 0.105                                     |      0.079 |
| LogisticRegressionRidge      |   0     |                       0.053 |                   0.105 |                    0.132 | 0.079                                     |      0.053 |
| NaiveBayes                   |   0     |                       0.026 |                   0.079 |                    0.079 | 0.026                                     |      0.053 |
| RandomForest                 |   0.026 |                       0.026 |                   0.053 |                    0.026 | 0.132                                     |      0.132 |
| RidgeClassifier              |   0.026 |                       0.053 |                   0.053 |                    0.132 | 0.053                                     |      0.053 |
| SVM                          |   0     |                       0     |                   0     |                    0     | 0.000                                     |      0     |
| XGBoost                      |   0.053 |                       0     |                   0     |                    0.053 | **0.184**                                 |      0.079 |

  report_table.reset_index().to_latex(latex_file, index=False)


# All 6 Languages

## f1_micro

| language   | model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| en         | Dummy Classifier             |   0.427 |                       0.459 |                   0.437 |                    0.433 |                                     0.43  | 0.435      |
| en         | KNN                          |   0.49  |                       0.54  |                   0.527 |                    0.478 |                                     0.466 | 0.543      |
| en         | LinearSVM                    |   0.593 |                       0.612 |                   0.639 |                    0.651 |                                     0.649 | 0.679      |
| en         | LogisticRegression           |   0.594 |                       0.618 |                   0.628 |                    0.653 |                                     0.659 | 0.694      |
| en         | LogisticRegressionElasticNet |   0.604 |                       0.612 |                   0.638 |                    0.64  |                                     0.669 | 0.685      |
| en         | LogisticRegressionLasso      |   0.555 |                       0.615 |                   0.624 |                    0.624 |                                     0.655 | 0.645      |
| en         | LogisticRegressionRidge      |   0.61  |                       0.629 |                   0.643 |                    0.669 |                                     0.659 | 0.700      |
| en         | NaiveBayes                   |   0.631 |                       0.695 |                   0.718 |                    0.712 |                                     0.713 | **0.719**  |
| en         | RandomForest                 |   0.581 |                       0.616 |                   0.632 |                    0.654 |                                     0.663 | 0.656      |
| en         | RidgeClassifier              |   0.602 |                       0.64  |                   0.636 |                    0.66  |                                     0.656 | 0.689      |
| en         | SVM                          |   0.467 |                       0.472 |                   0.489 |                    0.523 |                                     0.541 | 0.569      |
| en         | XGBoost                      |   0.526 |                       0.6   |                   0.615 |                    0.616 |                                     0.637 | 0.648      |
| fr         | Dummy Classifier             |   0.389 |                       0.365 |                   0.341 |                    0.329 |                                     0.371 | 0.332      |
| fr         | KNN                          |   0.414 |                       0.435 |                   0.436 |                    0.433 |                                     0.348 | 0.436      |
| fr         | LinearSVM                    |   0.248 |                       0.319 |                   0.322 |                    0.251 |                                     0.327 | 0.311      |
| fr         | LogisticRegression           |   0.317 |                       0.349 |                   0.327 |                    0.256 |                                     0.336 | 0.337      |
| fr         | LogisticRegressionElasticNet |   0.241 |                       0.365 |                   0.339 |                    0.301 |                                     0.361 | 0.351      |
| fr         | LogisticRegressionLasso      |   0.281 |                       0.44  |                   0.365 |                    0.298 |                                     0.389 | 0.430      |
| fr         | LogisticRegressionRidge      |   0.332 |                       0.385 |                   0.327 |                    0.308 |                                     0.315 | 0.323      |
| fr         | NaiveBayes                   |   0.384 |                       0.434 |                   0.447 |                    0.482 |                                     0.472 | **0.556**  |
| fr         | RandomForest                 |   0.271 |                       0.308 |                   0.272 |                    0.284 |                                     0.314 | 0.343      |
| fr         | RidgeClassifier              |   0.355 |                       0.394 |                   0.324 |                    0.292 |                                     0.312 | 0.328      |
| fr         | SVM                          |   0.297 |                       0.049 |                   0.094 |                    0.044 |                                     0.063 | 0.015      |
| fr         | XGBoost                      |   0.237 |                       0.339 |                   0.313 |                    0.303 |                                     0.344 | 0.419      |
| ge         | Dummy Classifier             |   0.434 |                       0.453 |                   0.449 |                    0.423 |                                     0.474 | 0.443      |
| ge         | KNN                          |   0.44  |                       0.513 |                   0.509 |                    0.518 |                                     0.443 | 0.407      |
| ge         | LinearSVM                    |   0.338 |                       0.365 |                   0.395 |                    0.382 |                                     0.426 | 0.459      |
| ge         | LogisticRegression           |   0.34  |                       0.401 |                   0.409 |                    0.413 |                                     0.441 | 0.466      |
| ge         | LogisticRegressionElasticNet |   0.284 |                       0.392 |                   0.419 |                    0.397 |                                     0.431 | 0.491      |
| ge         | LogisticRegressionLasso      |   0.217 |                       0.402 |                   0.429 |                    0.389 |                                     0.467 | 0.521      |
| ge         | LogisticRegressionRidge      |   0.35  |                       0.481 |                   0.47  |                    0.437 |                                     0.461 | 0.449      |
| ge         | NaiveBayes                   |   0.376 |                       0.555 |                   0.556 |                    0.572 |                                     0.551 | **0.585**  |
| ge         | RandomForest                 |   0.324 |                       0.55  |                   0.409 |                    0.405 |                                     0.418 | 0.436      |
| ge         | RidgeClassifier              |   0.347 |                       0.504 |                   0.46  |                    0.436 |                                     0.451 | 0.457      |
| ge         | SVM                          |   0.417 |                       0.199 |                   0.274 |                    0.359 |                                     0.32  | 0.424      |
| ge         | XGBoost                      |   0.29  |                       0.543 |                   0.402 |                    0.395 |                                     0.442 | 0.501      |
| it         | Dummy Classifier             |   0.388 |                       0.395 |                   0.376 |                    0.386 |                                     0.36  | 0.379      |
| it         | KNN                          |   0.523 |                       0.491 |                   0.502 |                    0.502 |                                     0.459 | 0.503      |
| it         | LinearSVM                    |   0.406 |                       0.401 |                   0.387 |                    0.432 |                                     0.42  | 0.476      |
| it         | LogisticRegression           |   0.446 |                       0.391 |                   0.395 |                    0.433 |                                     0.442 | 0.486      |
| it         | LogisticRegressionElasticNet |   0.365 |                       0.395 |                   0.407 |                    0.433 |                                     0.496 | 0.482      |
| it         | LogisticRegressionLasso      |   0.343 |                       0.42  |                   0.399 |                    0.457 |                                     0.481 | 0.487      |
| it         | LogisticRegressionRidge      |   0.431 |                       0.439 |                   0.417 |                    0.466 |                                     0.435 | 0.473      |
| it         | NaiveBayes                   |   0.509 |                       0.543 |                   0.553 |                    0.589 |                                     0.594 | **0.629**  |
| it         | RandomForest                 |   0.351 |                       0.457 |                   0.43  |                    0.476 |                                     0.446 | 0.502      |
| it         | RidgeClassifier              |   0.45  |                       0.431 |                   0.412 |                    0.459 |                                     0.435 | 0.508      |
| it         | SVM                          |   0.359 |                       0.033 |                   0.041 |                    0.218 |                                     0.166 | 0.338      |
| it         | XGBoost                      |   0.308 |                       0.456 |                   0.401 |                    0.431 |                                     0.452 | 0.488      |
| po         | Dummy Classifier             |   0.501 |                       0.454 |                   0.491 |                    0.427 |                                     0.461 | 0.485      |
| po         | KNN                          |   0.534 |                       0.444 |                   0.452 |                    0.342 |                                     0.337 | 0.325      |
| po         | LinearSVM                    |   0.531 |                       0.454 |                   0.503 |                    0.465 |                                     0.545 | 0.550      |
| po         | LogisticRegression           |   0.538 |                       0.471 |                   0.519 |                    0.492 |                                     0.537 | 0.579      |
| po         | LogisticRegressionElasticNet |   0.513 |                       0.47  |                   0.486 |                    0.482 |                                     0.559 | 0.581      |
| po         | LogisticRegressionLasso      |   0.412 |                       0.484 |                   0.491 |                    0.509 |                                     0.57  | 0.566      |
| po         | LogisticRegressionRidge      |   0.546 |                       0.48  |                   0.539 |                    0.504 |                                     0.581 | 0.577      |
| po         | NaiveBayes                   |   0.526 |                       0.557 |                   0.589 |                    0.615 |                                     0.615 | **0.666**  |
| po         | RandomForest                 |   0.516 |                       0.492 |                   0.48  |                    0.477 |                                     0.555 | 0.609      |
| po         | RidgeClassifier              |   0.538 |                       0.472 |                   0.526 |                    0.489 |                                     0.572 | 0.594      |
| po         | SVM                          |   0.489 |                       0.323 |                   0.351 |                    0.435 |                                     0.445 | 0.437      |
| po         | XGBoost                      |   0.48  |                       0.49  |                   0.489 |                    0.505 |                                     0.548 | 0.624      |
| ru         | Dummy Classifier             |   0.296 |                       0.272 |                   0.284 |                    0.291 |                                     0.321 | 0.265      |
| ru         | KNN                          |   0.358 |                       0.368 |                   0.349 |                    0.349 |                                     0.349 | 0.349      |
| ru         | LinearSVM                    |   0.321 |                       0.192 |                   0.221 |                    0.253 |                                     0.242 | 0.312      |
| ru         | LogisticRegression           |   0.309 |                       0.264 |                   0.279 |                    0.261 |                                     0.295 | 0.327      |
| ru         | LogisticRegressionElasticNet |   0.308 |                       0.22  |                   0.221 |                    0.256 |                                     0.278 | 0.343      |
| ru         | LogisticRegressionLasso      |   0.299 |                       0.24  |                   0.221 |                    0.38  |                                     0.347 | 0.388      |
| ru         | LogisticRegressionRidge      |   0.335 |                       0.38  |                   0.332 |                    0.334 |                                     0.313 | 0.327      |
| ru         | NaiveBayes                   |   0.391 |                       0.435 |                   0.439 |                    0.463 |                                     0.487 | **0.504**  |
| ru         | RandomForest                 |   0.296 |                       0.23  |                   0.208 |                    0.249 |                                     0.337 | 0.318      |
| ru         | RidgeClassifier              |   0.328 |                       0.378 |                   0.301 |                    0.321 |                                     0.313 | 0.327      |
| ru         | SVM                          |   0.181 |                       0.022 |                   0.022 |                    0.027 |                                     0.048 | 0.031      |
| ru         | XGBoost                      |   0.245 |                       0.246 |                   0.249 |                    0.332 |                                     0.366 | 0.429      |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## recall_micro

| language   | model_type                   |   title |   title and first paragraph | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------|:-----------------------------|--------:|----------------------------:|:------------------------|:-------------------------|:------------------------------------------|:-----------|
| en         | Dummy Classifier             |   0.482 |                       0.533 | 0.499                   | 0.416                    | 0.491                                     | 0.491      |
| en         | KNN                          |   0.792 |                       0.719 | 0.680                   | **0.829**                | 0.800                                     | 0.523      |
| en         | LinearSVM                    |   0.592 |                       0.577 | 0.587                   | 0.597                    | 0.606                                     | 0.641      |
| en         | LogisticRegression           |   0.555 |                       0.589 | 0.582                   | 0.606                    | 0.623                                     | 0.658      |
| en         | LogisticRegressionElasticNet |   0.56  |                       0.572 | 0.582                   | 0.589                    | 0.636                                     | 0.631      |
| en         | LogisticRegressionLasso      |   0.482 |                       0.577 | 0.575                   | 0.587                    | 0.623                                     | 0.601      |
| en         | LogisticRegressionRidge      |   0.589 |                       0.592 | 0.601                   | 0.626                    | 0.616                                     | 0.660      |
| en         | NaiveBayes                   |   0.667 |                       0.719 | 0.753                   | 0.751                    | 0.760                                     | 0.760      |
| en         | RandomForest                 |   0.518 |                       0.567 | 0.577                   | 0.604                    | 0.604                                     | 0.609      |
| en         | RidgeClassifier              |   0.57  |                       0.614 | 0.587                   | 0.621                    | 0.614                                     | 0.653      |
| en         | SVM                          |   0.45  |                       0.428 | 0.457                   | 0.479                    | 0.516                                     | 0.535      |
| en         | XGBoost                      |   0.479 |                       0.567 | 0.584                   | 0.570                    | 0.592                                     | 0.614      |
| fr         | Dummy Classifier             |   0.563 |                       0.532 | 0.492                   | 0.468                    | 0.532                                     | 0.484      |
| fr         | KNN                          |   0.865 |                       0.968 | 0.992                   | **1.000**                | 0.659                                     | **1.000**  |
| fr         | LinearSVM                    |   0.238 |                       0.286 | 0.302                   | 0.206                    | 0.286                                     | 0.270      |
| fr         | LogisticRegression           |   0.341 |                       0.317 | 0.286                   | 0.206                    | 0.294                                     | 0.286      |
| fr         | LogisticRegressionElasticNet |   0.214 |                       0.333 | 0.302                   | 0.230                    | 0.317                                     | 0.302      |
| fr         | LogisticRegressionLasso      |   0.246 |                       0.397 | 0.325                   | 0.270                    | 0.389                                     | 0.421      |
| fr         | LogisticRegressionRidge      |   0.357 |                       0.373 | 0.317                   | 0.254                    | 0.278                                     | 0.278      |
| fr         | NaiveBayes                   |   0.429 |                       0.532 | 0.524                   | 0.563                    | 0.571                                     | 0.690      |
| fr         | RandomForest                 |   0.246 |                       0.278 | 0.238                   | 0.246                    | 0.278                                     | 0.325      |
| fr         | RidgeClassifier              |   0.381 |                       0.381 | 0.317                   | 0.246                    | 0.278                                     | 0.294      |
| fr         | SVM                          |   0.317 |                       0.063 | 0.079                   | 0.032                    | 0.063                                     | 0.008      |
| fr         | XGBoost                      |   0.23  |                       0.381 | 0.294                   | 0.278                    | 0.365                                     | 0.405      |
| ge         | Dummy Classifier             |   0.517 |                       0.5   | 0.494                   | 0.459                    | 0.529                                     | 0.523      |
| ge         | KNN                          |   0.686 |                       0.756 | 0.767                   | **0.860**                | 0.767                                     | 0.756      |
| ge         | LinearSVM                    |   0.302 |                       0.331 | 0.355                   | 0.343                    | 0.419                                     | 0.442      |
| ge         | LogisticRegression           |   0.308 |                       0.378 | 0.378                   | 0.372                    | 0.424                                     | 0.448      |
| ge         | LogisticRegressionElasticNet |   0.25  |                       0.36  | 0.390                   | 0.366                    | 0.424                                     | 0.494      |
| ge         | LogisticRegressionLasso      |   0.145 |                       0.366 | 0.384                   | 0.343                    | 0.436                                     | 0.506      |
| ge         | LogisticRegressionRidge      |   0.349 |                       0.477 | 0.459                   | 0.413                    | 0.453                                     | 0.436      |
| ge         | NaiveBayes                   |   0.314 |                       0.605 | 0.581                   | 0.599                    | 0.663                                     | 0.698      |
| ge         | RandomForest                 |   0.291 |                       0.576 | 0.378                   | 0.372                    | 0.378                                     | 0.395      |
| ge         | RidgeClassifier              |   0.343 |                       0.494 | 0.448                   | 0.419                    | 0.448                                     | 0.442      |
| ge         | SVM                          |   0.483 |                       0.25  | 0.314                   | 0.436                    | 0.372                                     | 0.547      |
| ge         | XGBoost                      |   0.279 |                       0.669 | 0.343                   | 0.343                    | 0.401                                     | 0.453      |
| it         | Dummy Classifier             |   0.487 |                       0.5   | 0.461                   | 0.374                    | 0.470                                     | 0.487      |
| it         | KNN                          |   0.891 |                       0.904 | **1.000**               | **1.000**                | 0.839                                     | **1.000**  |
| it         | LinearSVM                    |   0.374 |                       0.374 | 0.330                   | 0.365                    | 0.374                                     | 0.417      |
| it         | LogisticRegression           |   0.43  |                       0.357 | 0.343                   | 0.374                    | 0.391                                     | 0.439      |
| it         | LogisticRegressionElasticNet |   0.309 |                       0.357 | 0.339                   | 0.361                    | 0.435                                     | 0.443      |
| it         | LogisticRegressionLasso      |   0.283 |                       0.361 | 0.348                   | 0.400                    | 0.461                                     | 0.470      |
| it         | LogisticRegressionRidge      |   0.426 |                       0.413 | 0.378                   | 0.409                    | 0.383                                     | 0.417      |
| it         | NaiveBayes                   |   0.552 |                       0.622 | 0.622                   | 0.613                    | 0.691                                     | 0.717      |
| it         | RandomForest                 |   0.304 |                       0.439 | 0.391                   | 0.435                    | 0.374                                     | 0.435      |
| it         | RidgeClassifier              |   0.448 |                       0.409 | 0.378                   | 0.404                    | 0.387                                     | 0.452      |
| it         | SVM                          |   0.3   |                       0.017 | 0.022                   | 0.174                    | 0.209                                     | 0.335      |
| it         | XGBoost                      |   0.27  |                       0.435 | 0.400                   | 0.413                    | 0.413                                     | 0.443      |
| po         | Dummy Classifier             |   0.578 |                       0.466 | 0.529                   | 0.417                    | 0.500                                     | 0.549      |
| po         | KNN                          |   0.714 |                       0.549 | 0.704                   | 0.641                    | 0.330                                     | 0.636      |
| po         | LinearSVM                    |   0.519 |                       0.427 | 0.505                   | 0.437                    | 0.524                                     | 0.529      |
| po         | LogisticRegression           |   0.524 |                       0.451 | 0.524                   | 0.466                    | 0.515                                     | 0.553      |
| po         | LogisticRegressionElasticNet |   0.5   |                       0.456 | 0.485                   | 0.456                    | 0.568                                     | 0.587      |
| po         | LogisticRegressionLasso      |   0.345 |                       0.456 | 0.510                   | 0.485                    | 0.549                                     | 0.544      |
| po         | LogisticRegressionRidge      |   0.529 |                       0.471 | 0.558                   | 0.476                    | 0.568                                     | 0.549      |
| po         | NaiveBayes                   |   0.505 |                       0.592 | 0.621                   | 0.655                    | 0.684                                     | **0.830**  |
| po         | RandomForest                 |   0.5   |                       0.495 | 0.485                   | 0.461                    | 0.471                                     | 0.544      |
| po         | RidgeClassifier              |   0.524 |                       0.461 | 0.534                   | 0.456                    | 0.549                                     | 0.563      |
| po         | SVM                          |   0.505 |                       0.379 | 0.403                   | 0.539                    | 0.563                                     | 0.529      |
| po         | XGBoost                      |   0.481 |                       0.505 | 0.476                   | 0.519                    | 0.505                                     | 0.563      |
| ru         | Dummy Classifier             |   0.523 |                       0.477 | 0.477                   | 0.512                    | 0.535                                     | 0.453      |
| ru         | KNN                          |   0.488 |                       0.907 | 0.965                   | **1.000**                | **1.000**                                 | **1.000**  |
| ru         | LinearSVM                    |   0.302 |                       0.163 | 0.163                   | 0.186                    | 0.186                                     | 0.233      |
| ru         | LogisticRegression           |   0.279 |                       0.244 | 0.209                   | 0.198                    | 0.256                                     | 0.244      |
| ru         | LogisticRegressionElasticNet |   0.244 |                       0.186 | 0.174                   | 0.186                    | 0.233                                     | 0.267      |
| ru         | LogisticRegressionLasso      |   0.233 |                       0.221 | 0.186                   | 0.326                    | 0.326                                     | 0.395      |
| ru         | LogisticRegressionRidge      |   0.349 |                       0.372 | 0.267                   | 0.279                    | 0.256                                     | 0.267      |
| ru         | NaiveBayes                   |   0.453 |                       0.558 | 0.558                   | 0.570                    | 0.605                                     | 0.663      |
| ru         | RandomForest                 |   0.267 |                       0.198 | 0.151                   | 0.209                    | 0.267                                     | 0.256      |
| ru         | RidgeClassifier              |   0.314 |                       0.384 | 0.267                   | 0.279                    | 0.267                                     | 0.267      |
| ru         | SVM                          |   0.163 |                       0.012 | 0.012                   | 0.035                    | 0.070                                     | 0.070      |
| ru         | XGBoost                      |   0.233 |                       0.233 | 0.244                   | 0.314                    | 0.337                                     | 0.442      |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## precision_micro

| language   | model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|:------------------------------------------|:-----------|
| en         | Dummy Classifier             |   0.425 |                       0.443 |                   0.428 | 0.460                    | 0.418                                     | 0.426      |
| en         | KNN                          |   0.519 |                       0.506 |                   0.552 | 0.456                    | 0.365                                     | 0.668      |
| en         | LinearSVM                    |   0.619 |                       0.69  |                   0.734 | 0.745                    | 0.734                                     | 0.746      |
| en         | LogisticRegression           |   0.647 |                       0.669 |                   0.719 | 0.736                    | 0.723                                     | 0.751      |
| en         | LogisticRegressionElasticNet |   0.664 |                       0.684 |                   0.729 | 0.721                    | 0.731                                     | 0.770      |
| en         | LogisticRegressionLasso      |   0.674 |                       0.679 |                   0.697 | 0.685                    | 0.702                                     | 0.714      |
| en         | LogisticRegressionRidge      |   0.645 |                       0.708 |                   0.71  | 0.737                    | 0.737                                     | 0.761      |
| en         | NaiveBayes                   |   0.626 |                       0.697 |                   0.709 | 0.701                    | 0.698                                     | 0.707      |
| en         | RandomForest                 |   0.698 |                       0.714 |                   0.736 | 0.788                    | **0.808**                                 | 0.749      |
| en         | RidgeClassifier              |   0.645 |                       0.69  |                   0.722 | 0.720                    | 0.721                                     | 0.748      |
| en         | SVM                          |   0.631 |                       0.759 |                   0.712 | 0.644                    | 0.699                                     | 0.719      |
| en         | XGBoost                      |   0.636 |                       0.647 |                   0.664 | 0.717                    | 0.707                                     | 0.696      |
| fr         | Dummy Classifier             |   0.317 |                       0.3   |                   0.281 | 0.300                    | 0.308                                     | 0.292      |
| fr         | KNN                          |   0.292 |                       0.3   |                   0.297 | 0.295                    | 0.317                                     | 0.297      |
| fr         | LinearSVM                    |   0.335 |                       0.4   |                   0.43  | 0.383                    | 0.425                                     | 0.396      |
| fr         | LogisticRegression           |   0.364 |                       0.442 |                   0.465 | 0.394                    | 0.425                                     | 0.448      |
| fr         | LogisticRegressionElasticNet |   0.358 |                       0.569 |                   0.443 | **0.588**                | 0.462                                     | 0.482      |
| fr         | LogisticRegressionLasso      |   0.458 |                       0.583 |                   0.523 | 0.461                    | 0.439                                     | 0.468      |
| fr         | LogisticRegressionRidge      |   0.342 |                       0.463 |                   0.41  | 0.499                    | 0.420                                     | 0.485      |
| fr         | NaiveBayes                   |   0.412 |                       0.4   |                   0.423 | 0.468                    | 0.450                                     | 0.519      |
| fr         | RandomForest                 |   0.446 |                       0.399 |                   0.467 | 0.504                    | 0.412                                     | 0.405      |
| fr         | RidgeClassifier              |   0.37  |                       0.494 |                   0.382 | 0.466                    | 0.403                                     | 0.482      |
| fr         | SVM                          |   0.341 |                       0.112 |                   0.319 | 0.070                    | 0.063                                     | 0.087      |
| fr         | XGBoost                      |   0.339 |                       0.355 |                   0.417 | 0.394                    | 0.379                                     | 0.502      |
| ge         | Dummy Classifier             |   0.403 |                       0.449 |                   0.444 | 0.442                    | 0.462                                     | 0.408      |
| ge         | KNN                          |   0.433 |                       0.452 |                   0.485 | 0.507                    | 0.439                                     | 0.420      |
| ge         | LinearSVM                    |   0.477 |                       0.451 |                   0.506 | 0.458                    | 0.464                                     | 0.525      |
| ge         | LogisticRegression           |   0.474 |                       0.466 |                   0.53  | 0.518                    | 0.485                                     | 0.521      |
| ge         | LogisticRegressionElasticNet |   0.416 |                       0.507 |                   0.489 | 0.460                    | 0.467                                     | 0.507      |
| ge         | LogisticRegressionLasso      |   0.451 |                       0.553 |                   0.524 | 0.493                    | 0.523                                     | 0.587      |
| ge         | LogisticRegressionRidge      |   0.45  |                       0.528 |                   0.501 | 0.494                    | 0.519                                     | 0.484      |
| ge         | NaiveBayes                   |   0.505 |                       0.539 |                   0.557 | 0.572                    | 0.498                                     | 0.531      |
| ge         | RandomForest                 |   0.478 |                       0.59  |                   0.503 | 0.476                    | 0.550                                     | 0.496      |
| ge         | RidgeClassifier              |   0.451 |                       0.556 |                   0.492 | 0.486                    | 0.498                                     | 0.493      |
| ge         | SVM                          |   0.396 |                       0.207 |                   0.288 | 0.441                    | 0.310                                     | 0.351      |
| ge         | XGBoost                      |   0.502 |                       0.502 |                   0.553 | 0.516                    | 0.608                                     | **0.612**  |
| it         | Dummy Classifier             |   0.35  |                       0.362 |                   0.352 | 0.413                    | 0.324                                     | 0.352      |
| it         | KNN                          |   0.387 |                       0.357 |                   0.352 | 0.352                    | 0.348                                     | 0.353      |
| it         | LinearSVM                    |   0.471 |                       0.449 |                   0.505 | 0.575                    | 0.545                                     | 0.652      |
| it         | LogisticRegression           |   0.487 |                       0.474 |                   0.501 | 0.577                    | 0.571                                     | 0.633      |
| it         | LogisticRegressionElasticNet |   0.472 |                       0.485 |                   0.574 | 0.563                    | 0.656                                     | 0.601      |
| it         | LogisticRegressionLasso      |   0.523 |                       0.526 |                   0.494 | 0.583                    | 0.517                                     | 0.564      |
| it         | LogisticRegressionRidge      |   0.466 |                       0.496 |                   0.502 | 0.595                    | 0.584                                     | 0.585      |
| it         | NaiveBayes                   |   0.488 |                       0.501 |                   0.519 | 0.583                    | 0.533                                     | 0.583      |
| it         | RandomForest                 |   0.448 |                       0.541 |                   0.514 | 0.599                    | 0.581                                     | 0.629      |
| it         | RidgeClassifier              |   0.485 |                       0.479 |                   0.487 | 0.587                    | 0.582                                     | **0.666**  |
| it         | SVM                          |   0.474 |                       0.313 |                   0.343 | 0.453                    | 0.291                                     | 0.430      |
| it         | XGBoost                      |   0.381 |                       0.527 |                   0.411 | 0.460                    | 0.512                                     | 0.575      |
| po         | Dummy Classifier             |   0.472 |                       0.463 |                   0.506 | 0.454                    | 0.453                                     | 0.464      |
| po         | KNN                          |   0.525 |                       0.418 |                   0.587 | 0.349                    | 0.571                                     | 0.225      |
| po         | LinearSVM                    |   0.57  |                       0.526 |                   0.579 | 0.572                    | 0.675                                     | 0.650      |
| po         | LogisticRegression           |   0.58  |                       0.534 |                   0.562 | 0.608                    | 0.632                                     | 0.750      |
| po         | LogisticRegressionElasticNet |   0.594 |                       0.542 |                   0.581 | 0.580                    | 0.597                                     | 0.686      |
| po         | LogisticRegressionLasso      |   0.593 |                       0.583 |                   0.524 | 0.585                    | 0.632                                     | 0.617      |
| po         | LogisticRegressionRidge      |   0.595 |                       0.524 |                   0.566 | 0.595                    | 0.715                                     | 0.718      |
| po         | NaiveBayes                   |   0.584 |                       0.57  |                   0.583 | 0.601                    | 0.575                                     | 0.580      |
| po         | RandomForest                 |   0.639 |                       0.52  |                   0.517 | 0.558                    | 0.752                                     | **0.768**  |
| po         | RidgeClassifier              |   0.583 |                       0.515 |                   0.562 | 0.589                    | 0.713                                     | 0.736      |
| po         | SVM                          |   0.482 |                       0.43  |                   0.422 | 0.375                    | 0.376                                     | 0.417      |
| po         | XGBoost                      |   0.523 |                       0.528 |                   0.553 | 0.537                    | 0.633                                     | 0.766      |
| ru         | Dummy Classifier             |   0.212 |                       0.21  |                   0.216 | 0.219                    | 0.245                                     | 0.194      |
| ru         | KNN                          |   0.307 |                       0.236 |                   0.219 | 0.218                    | 0.219                                     | 0.219      |
| ru         | LinearSVM                    |   0.46  |                       0.298 |                   0.363 | 0.435                    | 0.432                                     | 0.503      |
| ru         | LogisticRegression           |   0.452 |                       0.344 |                   0.469 | 0.430                    | 0.387                                     | 0.527      |
| ru         | LogisticRegressionElasticNet |   0.5   |                       0.287 |                   0.383 | 0.417                    | 0.372                                     | 0.483      |
| ru         | LogisticRegressionLasso      |   0.505 |                       0.282 |                   0.291 | **0.565**                | 0.409                                     | 0.421      |
| ru         | LogisticRegressionRidge      |   0.368 |                       0.45  |                   0.494 | 0.466                    | 0.440                                     | 0.480      |
| ru         | NaiveBayes                   |   0.378 |                       0.377 |                   0.393 | 0.426                    | 0.434                                     | 0.433      |
| ru         | RandomForest                 |   0.447 |                       0.282 |                   0.378 | 0.355                    | 0.488                                     | 0.508      |
| ru         | RidgeClassifier              |   0.397 |                       0.422 |                   0.377 | 0.405                    | 0.412                                     | 0.480      |
| ru         | SVM                          |   0.211 |                       0.163 |                   0.163 | 0.021                    | 0.157                                     | 0.020      |
| ru         | XGBoost                      |   0.352 |                       0.289 |                   0.322 | 0.378                    | 0.427                                     | 0.550      |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## accuracy

| language   | model_type                   |   title | title and first paragraph   | title and 5 sentences   | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------|:-----------------------------|--------:|:----------------------------|:------------------------|:-------------------------|:------------------------------------------|:-----------|
| en         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| en         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| en         | LinearSVM                    |   0.039 | 0.058                       | 0.087                   | 0.107                    | 0.117                                     | 0.107      |
| en         | LogisticRegression           |   0.078 | 0.029                       | 0.078                   | 0.078                    | 0.087                                     | 0.087      |
| en         | LogisticRegressionElasticNet |   0.097 | 0.068                       | 0.078                   | 0.078                    | 0.078                                     | 0.107      |
| en         | LogisticRegressionLasso      |   0.068 | 0.068                       | 0.078                   | 0.039                    | 0.068                                     | 0.058      |
| en         | LogisticRegressionRidge      |   0.087 | 0.068                       | 0.087                   | 0.097                    | 0.097                                     | 0.087      |
| en         | NaiveBayes                   |   0.029 | 0.029                       | 0.039                   | 0.039                    | 0.019                                     | 0.029      |
| en         | RandomForest                 |   0.068 | 0.078                       | 0.117                   | **0.146**                | 0.136                                     | **0.146**  |
| en         | RidgeClassifier              |   0.078 | 0.039                       | 0.087                   | 0.078                    | 0.087                                     | 0.078      |
| en         | SVM                          |   0.049 | 0.058                       | 0.058                   | 0.058                    | 0.078                                     | 0.097      |
| en         | XGBoost                      |   0.029 | 0.039                       | 0.058                   | 0.087                    | 0.068                                     | 0.107      |
| fr         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| fr         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| fr         | LinearSVM                    |   0     | 0.048                       | 0.048                   | 0.071                    | 0.024                                     | 0.071      |
| fr         | LogisticRegression           |   0.024 | 0.048                       | 0.048                   | 0.071                    | 0.048                                     | 0.071      |
| fr         | LogisticRegressionElasticNet |   0     | 0.048                       | 0.000                   | 0.071                    | 0.071                                     | **0.143**  |
| fr         | LogisticRegressionLasso      |   0     | 0.048                       | 0.024                   | 0.024                    | 0.024                                     | 0.119      |
| fr         | LogisticRegressionRidge      |   0     | 0.048                       | 0.024                   | 0.071                    | 0.000                                     | 0.095      |
| fr         | NaiveBayes                   |   0.024 | 0.000                       | 0.000                   | 0.000                    | 0.024                                     | 0.000      |
| fr         | RandomForest                 |   0     | 0.000                       | 0.000                   | 0.048                    | 0.024                                     | 0.071      |
| fr         | RidgeClassifier              |   0     | 0.048                       | 0.024                   | 0.071                    | 0.000                                     | 0.095      |
| fr         | SVM                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.048                                     | 0.000      |
| fr         | XGBoost                      |   0     | 0.000                       | 0.024                   | 0.000                    | 0.048                                     | 0.024      |
| ge         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ge         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ge         | LinearSVM                    |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| ge         | LogisticRegression           |   0     | 0.000                       | 0.000                   | **0.029**                | 0.000                                     | 0.000      |
| ge         | LogisticRegressionElasticNet |   0     | **0.029**                   | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| ge         | LogisticRegressionLasso      |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ge         | LogisticRegressionRidge      |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ge         | NaiveBayes                   |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ge         | RandomForest                 |   0     | 0.000                       | 0.000                   | **0.029**                | 0.000                                     | **0.029**  |
| ge         | RidgeClassifier              |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | **0.029**  |
| ge         | SVM                          |   0     | 0.000                       | **0.029**               | 0.000                    | **0.029**                                 | 0.000      |
| ge         | XGBoost                      |   0     | **0.029**                   | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| it         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| it         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| it         | LinearSVM                    |   0.067 | 0.050                       | 0.067                   | 0.067                    | 0.067                                     | 0.150      |
| it         | LogisticRegression           |   0.067 | 0.067                       | 0.067                   | 0.067                    | 0.033                                     | 0.150      |
| it         | LogisticRegressionElasticNet |   0.033 | 0.067                       | 0.050                   | 0.100                    | 0.067                                     | **0.167**  |
| it         | LogisticRegressionLasso      |   0.017 | 0.067                       | 0.017                   | 0.083                    | 0.033                                     | 0.083      |
| it         | LogisticRegressionRidge      |   0.033 | 0.083                       | 0.100                   | 0.100                    | 0.050                                     | 0.133      |
| it         | NaiveBayes                   |   0     | 0.017                       | 0.050                   | 0.100                    | 0.033                                     | 0.067      |
| it         | RandomForest                 |   0.033 | 0.033                       | 0.083                   | 0.100                    | 0.083                                     | 0.133      |
| it         | RidgeClassifier              |   0.017 | 0.083                       | 0.100                   | 0.083                    | 0.050                                     | 0.150      |
| it         | SVM                          |   0.05  | 0.033                       | 0.017                   | 0.067                    | 0.017                                     | 0.083      |
| it         | XGBoost                      |   0     | 0.017                       | 0.017                   | 0.033                    | 0.017                                     | 0.117      |
| po         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| po         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| po         | LinearSVM                    |   0     | 0.026                       | 0.026                   | 0.026                    | 0.000                                     | 0.026      |
| po         | LogisticRegression           |   0     | 0.000                       | 0.026                   | 0.026                    | 0.000                                     | 0.026      |
| po         | LogisticRegressionElasticNet |   0     | 0.026                       | 0.000                   | 0.026                    | 0.000                                     | 0.000      |
| po         | LogisticRegressionLasso      |   0     | 0.026                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| po         | LogisticRegressionRidge      |   0     | 0.000                       | 0.026                   | 0.026                    | 0.026                                     | 0.026      |
| po         | NaiveBayes                   |   0     | 0.000                       | 0.026                   | 0.026                    | 0.026                                     | 0.000      |
| po         | RandomForest                 |   0     | 0.000                       | 0.026                   | 0.051                    | **0.077**                                 | 0.051      |
| po         | RidgeClassifier              |   0     | 0.000                       | 0.026                   | 0.026                    | 0.026                                     | 0.026      |
| po         | SVM                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| po         | XGBoost                      |   0     | 0.000                       | 0.026                   | 0.000                    | 0.026                                     | 0.051      |
| ru         | Dummy Classifier             |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ru         | KNN                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ru         | LinearSVM                    |   0.026 | 0.053                       | 0.079                   | 0.105                    | 0.053                                     | 0.053      |
| ru         | LogisticRegression           |   0.026 | 0.053                       | 0.105                   | 0.105                    | 0.053                                     | 0.079      |
| ru         | LogisticRegressionElasticNet |   0.026 | 0.026                       | 0.026                   | 0.053                    | 0.053                                     | 0.158      |
| ru         | LogisticRegressionLasso      |   0.026 | 0.026                       | 0.026                   | 0.053                    | 0.105                                     | 0.079      |
| ru         | LogisticRegressionRidge      |   0     | 0.053                       | 0.105                   | 0.132                    | 0.079                                     | 0.053      |
| ru         | NaiveBayes                   |   0     | 0.026                       | 0.079                   | 0.079                    | 0.026                                     | 0.053      |
| ru         | RandomForest                 |   0.026 | 0.026                       | 0.053                   | 0.026                    | 0.132                                     | 0.132      |
| ru         | RidgeClassifier              |   0.026 | 0.053                       | 0.053                   | 0.132                    | 0.053                                     | 0.053      |
| ru         | SVM                          |   0     | 0.000                       | 0.000                   | 0.000                    | 0.000                                     | 0.000      |
| ru         | XGBoost                      |   0.053 | 0.000                       | 0.000                   | 0.053                    | **0.184**                                 | 0.079      |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)



# Per model sub-type

In [13]:
%%capture
display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype'], output_dir='per_model_subtype_tables')

# Per model sub-type and experiment name

In [14]:
%%capture
display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype', 'model_name'], output_dir='per_exp_name_tables')