In [1]:
import os
import glob

from IPython.display import display, Markdown
import pandas as pd

# Group the experiment results by model grouping (e.g. experiment type and model type), pick the best-performing run per unit of analysis in terms of micro-averaged F1 score, and report the selected runs in tables.


In [3]:
# Collect the per-experiment result files. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them: this keeps the row order of the
# concatenated results (and any "first occurrence wins" tie-breaking that is
# applied to it later) reproducible across machines and re-runs.
df_filepaths = sorted(glob.glob('./experiment_results/*.csv'))

In [4]:
# Read every results CSV and stack the frames row-wise into one table.
# The per-file row labels are kept as-is, so index values may repeat.
exps_df = pd.concat(list(map(pd.read_csv, df_filepaths)))

In [5]:
# Remove leading/trailing pilcrow characters from the model type labels.
# Bracket indexing is used for the assignment because attribute-style
# assignment only targets a column when that column already exists.
exps_df['model_type'] = exps_df['model_type'].str.strip('¶')

In [6]:
# Show the (rows, columns) shape of the combined results frame.
exps_df.shape

(2427, 19)

In [8]:
def get_best_models(df, grouping_criterion):
    """Select, for every group, the row with the highest ``test_f1_micro``.

    The frame's index is first moved into regular columns so that every row
    gets a unique positional label, even when the incoming index has repeats.
    Groups for which ``idxmax`` yields a missing label are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Results containing ``test_f1_micro`` and the grouping columns.
    grouping_criterion : list of str
        Column names that define a group.

    Returns
    -------
    pandas.DataFrame
        The winning rows, indexed by ``grouping_criterion``. The former index
        is kept as a regular column.
    """
    flat_df = df.reset_index()
    winner_labels = flat_df.groupby(grouping_criterion)['test_f1_micro'].idxmax().dropna()
    winners = flat_df.loc[winner_labels]
    return winners.set_index(grouping_criterion)

### Generate the tables to report

In [27]:
def display_performance_table(df, metric, index_cols=['model_type'], display_=True):
    """Build (and optionally render) a score table for one metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Results holding a ``test_<metric>`` column, a ``unit_of_analysis``
        column and the ``index_cols`` columns. Index levels are moved into
        regular columns first, so those names may also live in the index.
    metric : str
        Metric suffix; scores are read from the ``f'test_{metric}'`` column.
    index_cols : list of str
        Columns identifying one table row. The default list is never mutated.
    display_ : bool
        When true, render the table as Markdown in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per ``index_cols`` combination and one column per unit of
        analysis (underscores replaced by spaces, in a fixed order). Cells hold
        the score formatted to three decimals; combinations without a result
        hold ``0``. Every cell equal to the table-wide maximum is wrapped in
        ``**`` so it renders bold in Markdown.
    """
    index_cols = list(index_cols)

    report_table = df.reset_index()
    # Keep the formatted scores as plain Python objects so the integer
    # ``fill_value`` used by the pivot below can share a column with them.
    report_table['result'] = report_table[f'test_{metric}'].map(lambda x: f'{x:.3f}').astype(object)
    report_table['col_title'] = report_table.unit_of_analysis.str.replace('_', ' ', regex=False)
    # An ordered categorical fixes the left-to-right column order of the table;
    # units of analysis outside this list become missing and drop out of it.
    report_table['col_title'] = pd.Categorical(
        report_table.col_title,
        categories=['title', 'title and first paragraph', 'title and 5 sentences', 'title and 10 sentences',
                    'title and first sentence each paragraph', 'raw text'],
        ordered=True)
    report_table = report_table[index_cols + ['col_title', 'result']]\
        .pivot_table(index=index_cols, columns=['col_title'], values=['result'], aggfunc='first', fill_value=0)\
        .droplevel(0, axis=1)

    report_table.columns.names = [None]

    def _leading_float(cell):
        # Cells may carry extra text after the score; only the first token counts.
        return float(str(cell).split(' ')[0])

    # Highlight every cell that equals the best score in the whole table.
    scores = report_table.apply(lambda column: column.map(_leading_float))
    best_score = scores.max().max()  # pandas' max skips NaN scores
    best_mask = (scores == best_score).to_numpy()
    if best_mask.any():
        # Work on an explicit copy and rebuild the frame: ``to_numpy()`` is not
        # guaranteed to return a writable view of the frame's data, so writing
        # through it either fails or silently leaves the table unchanged.
        cells = report_table.to_numpy(dtype=object, copy=True)
        cells[best_mask] = ['**' + str(cell) + '**' for cell in cells[best_mask]]
        report_table = pd.DataFrame(cells, index=report_table.index, columns=report_table.columns)

    if display_:
        display(Markdown(report_table.to_markdown()))

    return report_table

### Generate tables for all languages

In [28]:
# Metric suffixes to report; each one is looked up as a `test_<metric>` column.
metrics_to_report = [
    'f1_micro',
    'recall_micro',
    'precision_micro',
    'accuracy',
]

In [29]:
# Language code (as stored in the results' `language` column) -> display name
# used for section headings and output file names.
language_dict = {
    'en': 'English',
    'it': 'Italian',
    'fr': 'French',
    'po': 'Polish',
    'ru': 'Russian',
    'ge': 'German',
}

In [30]:
def _export_report_table(report_table, output_dir, metric, file_stem):
    """Write ``report_table`` as markdown, LaTeX and CSV under ``<output_dir>/<metric>/``.

    Files go to the ``markdown``, ``latex`` and ``csv`` sub-directories (created
    on demand) and are named ``<file_stem>.md`` / ``.tex`` / ``.csv``. The
    markdown and LaTeX files contain the table with its index flattened into
    regular columns; the CSV is written from the table as-is.
    """
    format_dirs = {fmt: os.path.join(output_dir, metric, fmt) for fmt in ('markdown', 'latex', 'csv')}
    for format_dir in format_dirs.values():
        os.makedirs(format_dir, exist_ok=True)

    flat_table = report_table.reset_index()

    # Context managers make sure the handles are closed even if an export raises.
    with open(os.path.join(format_dirs['markdown'], f"{file_stem}.md"), "w") as markdown_file:
        flat_table.to_markdown(markdown_file, index=False)

    with open(os.path.join(format_dirs['latex'], f"{file_stem}.tex"), "w") as latex_file:
        flat_table.to_latex(latex_file, index=False)

    report_table.to_csv(os.path.join(format_dirs['csv'], f"{file_stem}.csv"))


def display_metrics_and_write_to_file(df, grouping_criterion, output_dir):
    """Display and export the best-model tables per language and for all languages.

    For every language in ``df`` the best run per ``grouping_criterion`` and unit
    of analysis is selected with ``get_best_models``; then, for every metric in
    the module-level ``metrics_to_report``, a table is built and rendered with
    ``display_performance_table`` and written as markdown, LaTeX and CSV under
    ``<output_dir>/<metric>/``. Finally the per-language tables are stacked into
    one table per metric (displayed and exported the same way) and the full rows
    of all selected runs are saved to
    ``<output_dir>/best_exp_params_metrics_per_language.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Experiment results with a ``language`` column plus whatever
        ``get_best_models`` and ``display_performance_table`` read.
    grouping_criterion : list of str
        Columns identifying a model group (one table row per combination).
    output_dir : str
        Root directory for all exported files; created if missing.

    Returns
    -------
    None
    """
    grouping_criterion = list(grouping_criterion)
    os.makedirs(output_dir, exist_ok=True)
    best_model_dfs_list = []
    report_tables_dfs_dict = {metric: [] for metric in metrics_to_report}

    for language, results_df in df.groupby('language'):
        # Fall back to the raw code so an unmapped language does not abort the run.
        language_name = language_dict.get(language, language)

        best_models_df = get_best_models(results_df, grouping_criterion + ['unit_of_analysis'])
        best_model_dfs_list.append(best_models_df.copy())

        display(Markdown(f'# {language_name}'))

        for metric in metrics_to_report:
            display(Markdown(f'## {metric}'))

            report_table = display_performance_table(df=best_models_df, index_cols=grouping_criterion, metric=metric, display_=True)

            _export_report_table(report_table, output_dir, metric, f"{language_name}_{metric}")

            # Tag the table with its language (after exporting, so the
            # per-language files do not carry the extra column) and keep it
            # for the stacked all-languages table.
            report_table['language'] = language
            report_table = report_table.reset_index().set_index(['language'] + grouping_criterion)

            report_tables_dfs_dict[metric].append(report_table)

    # Store a csv with all the metrics and parameters of the best runs per language per grouping_criterion
    pd.concat(best_model_dfs_list)\
        .reset_index().set_index(['language'] + grouping_criterion).sort_index()\
        .to_csv(os.path.join(output_dir, 'best_exp_params_metrics_per_language.csv'))

    # Report and store the unified table
    display(Markdown('# All 6 Languages'))
    for metric in metrics_to_report:
        display(Markdown(f'## {metric}'))
        multi_language_report_table_metric = pd.concat(report_tables_dfs_dict[metric])
        display(Markdown(multi_language_report_table_metric.reset_index().to_markdown(index=False)))

        _export_report_table(multi_language_report_table_metric, output_dir, metric, f"all_6_languages_{metric}")

# Per model type

In [31]:
# Build, display and export the tables with one row per model type.
display_metrics_and_write_to_file(
    df=exps_df,
    grouping_criterion=['model_type'],
    output_dir='per_model_type_tables',
)
#display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype'], output_dir='per_model_subtype_tables')
#display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype', 'model_name'], output_dir='per_exp_name_tables')

# English

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0     |                       0.608 |                   0.664 |                    0.683 |                                     0.631 | 0.677      |
| LinearSVM                    |   0     |                       0.674 |                   0.689 |                    0.703 |                                     0.664 | 0.697      |
| LogisticRegression           |   0.565 |                       0.628 |                   0.621 |                    0.642 |                                     0.683 | 0.684      |
| LogisticRegressionElasticNet |   0     |                       0.641 |                   0.642 |                    0.657 |                                     0.683 | 0.693      |
| LogisticRegressionLasso      |   0.542 |                       0.631 |                   0.653 |                    0.634 |                                     0.664 | 0.664      |
| LogisticRegressionRidge      |   0.621 |                       0.657 |                   0.673 |                    0.698 |                                     0.669 | 0.696      |
| NaiveBayes                   |   0     |                       0     |                   0.692 |                    0.693 |                                     0.687 | **0.705**  |
| RandomForest                 |   0     |                       0     |                   0.64  |                    0.65  |                                     0.646 | 0.695      |
| RidgeClassifier              |   0     |                       0.638 |                   0.672 |                    0.69  |                                     0.653 | 0.675      |
| SVM                          |   0     |                       0.68  |                   0.689 |                    0.689 |                                     0.679 | 0.696      |
| XGBoost                      |   0     |                       0.65  |                   0.657 |                    0.661 |                                     0.66  | 0.684      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0     |                       0.555 |                   0.66  |                    0.699 |                                     0.555 | 0.638      |
| LinearSVM                    |   0     |                       0.665 |                   0.665 |                    0.697 |                                     0.621 | 0.667      |
| LogisticRegression           |   0.548 |                       0.611 |                   0.614 |                    0.638 |                                     0.677 | 0.648      |
| LogisticRegressionElasticNet |   0     |                       0.619 |                   0.645 |                    0.641 |                                     0.65  | 0.653      |
| LogisticRegressionLasso      |   0.491 |                       0.601 |                   0.619 |                    0.606 |                                     0.628 | 0.636      |
| LogisticRegressionRidge      |   0.611 |                       0.611 |                   0.648 |                    0.692 |                                     0.631 | 0.660      |
| NaiveBayes                   |   0     |                       0     |                   0.655 |                    0.667 |                                     0.653 | 0.694      |
| RandomForest                 |   0     |                       0     |                   0.621 |                    0.623 |                                     0.592 | 0.675      |
| RidgeClassifier              |   0     |                       0.579 |                   0.623 |                    0.648 |                                     0.614 | 0.643      |
| SVM                          |   0     |                       0.628 |                   0.66  |                    0.66  |                                     0.631 | 0.680      |
| XGBoost                      |   0     |                       0.68  |                   0.689 |                    0.694 |                                     0.655 | **0.711**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences | title and first sentence each paragraph   |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|:------------------------------------------|-----------:|
| KNN                          |   0     |                       0.732 |                   0.676 |                    0.679 | **0.790**                                 |      0.77  |
| LinearSVM                    |   0     |                       0.693 |                   0.739 |                    0.726 | 0.733                                     |      0.746 |
| LogisticRegression           |   0.604 |                       0.66  |                   0.636 |                    0.656 | 0.701                                     |      0.742 |
| LogisticRegressionElasticNet |   0     |                       0.677 |                   0.649 |                    0.685 | 0.737                                     |      0.755 |
| LogisticRegressionLasso      |   0.623 |                       0.678 |                   0.701 |                    0.673 | 0.723                                     |      0.714 |
| LogisticRegressionRidge      |   0.644 |                       0.734 |                   0.747 |                    0.713 | 0.728                                     |      0.752 |
| NaiveBayes                   |   0     |                       0     |                   0.749 |                    0.729 | 0.747                                     |      0.728 |
| RandomForest                 |   0     |                       0     |                   0.684 |                    0.692 | 0.752                                     |      0.729 |
| RidgeClassifier              |   0     |                       0.744 |                   0.752 |                    0.755 | 0.718                                     |      0.726 |
| SVM                          |   0     |                       0.756 |                   0.737 |                    0.732 | 0.757                                     |      0.737 |
| XGBoost                      |   0     |                       0.634 |                   0.649 |                    0.664 | 0.682                                     |      0.672 |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title | title and first paragraph   |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|:----------------------------|------------------------:|:-------------------------|------------------------------------------:|:-----------|
| KNN                          |   0     | **0.126**                   |                   0.019 | 0.058                    |                                     0.097 | 0.107      |
| LinearSVM                    |   0     | 0.058                       |                   0.087 | 0.049                    |                                     0.117 | 0.068      |
| LogisticRegression           |   0.029 | 0.039                       |                   0.049 | 0.019                    |                                     0.068 | 0.068      |
| LogisticRegressionElasticNet |   0     | 0.058                       |                   0.058 | 0.049                    |                                     0.117 | 0.087      |
| LogisticRegressionLasso      |   0.049 | 0.058                       |                   0.068 | 0.068                    |                                     0.058 | 0.068      |
| LogisticRegressionRidge      |   0.058 | 0.117                       |                   0.107 | 0.078                    |                                     0.087 | 0.058      |
| NaiveBayes                   |   0     | 0                           |                   0.087 | 0.107                    |                                     0.097 | 0.068      |
| RandomForest                 |   0     | 0                           |                   0.117 | 0.049                    |                                     0.117 | 0.117      |
| RidgeClassifier              |   0     | 0.117                       |                   0.117 | **0.126**                |                                     0.087 | 0.078      |
| SVM                          |   0     | 0.087                       |                   0.097 | 0.097                    |                                     0.078 | 0.058      |
| XGBoost                      |   0     | 0.049                       |                   0.039 | 0.107                    |                                     0.078 | **0.126**  |

  report_table.reset_index().to_latex(latex_file, index=False)


# French

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.278 |                       0.344 |                   0.217 |                    0.415 |                                     0.344 | 0.437      |
| LinearSVM                    |   0.27  |                       0.408 |                   0.36  |                    0.325 |                                     0.366 | 0.389      |
| LogisticRegression           |   0.295 |                       0.428 |                   0.362 |                    0.356 |                                     0.354 | 0.402      |
| LogisticRegressionElasticNet |   0.28  |                       0.436 |                   0.364 |                    0.314 |                                     0.401 | 0.398      |
| LogisticRegressionLasso      |   0.279 |                       0.423 |                   0.374 |                    0.369 |                                     0.389 | 0.382      |
| LogisticRegressionRidge      |   0.316 |                       0.424 |                   0.396 |                    0.324 |                                     0.365 | **0.452**  |
| NaiveBayes                   |   0.358 |                       0.388 |                   0.391 |                    0.365 |                                     0.431 | 0.400      |
| RandomForest                 |   0.306 |                       0.395 |                   0.361 |                    0.365 |                                     0.394 | 0.437      |
| RidgeClassifier              |   0.332 |                       0.412 |                   0.35  |                    0.288 |                                     0.319 | 0.317      |
| SVM                          |   0.371 |                       0.398 |                   0.374 |                    0.353 |                                     0.365 | 0.381      |
| XGBoost                      |   0.346 |                       0.371 |                   0.374 |                    0.375 |                                     0.373 | 0.448      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   | title     |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph |   raw text |
|:-----------------------------|:----------|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|-----------:|
| KNN                          | 0.254     |                       0.31  |                   0.198 |                    0.397 |                                     0.341 |      0.492 |
| LinearSVM                    | 0.270     |                       0.381 |                   0.357 |                    0.294 |                                     0.357 |      0.365 |
| LogisticRegression           | 0.294     |                       0.413 |                   0.365 |                    0.325 |                                     0.357 |      0.381 |
| LogisticRegressionElasticNet | 0.246     |                       0.437 |                   0.357 |                    0.27  |                                     0.397 |      0.365 |
| LogisticRegressionLasso      | 0.246     |                       0.405 |                   0.333 |                    0.357 |                                     0.357 |      0.373 |
| LogisticRegressionRidge      | 0.333     |                       0.389 |                   0.405 |                    0.286 |                                     0.349 |      0.5   |
| NaiveBayes                   | 0.381     |                       0.397 |                   0.389 |                    0.341 |                                     0.444 |      0.413 |
| RandomForest                 | **0.556** |                       0.532 |                   0.452 |                    0.397 |                                     0.421 |      0.421 |
| RidgeClassifier              | 0.333     |                       0.397 |                   0.325 |                    0.246 |                                     0.294 |      0.278 |
| SVM                          | 0.381     |                       0.413 |                   0.381 |                    0.317 |                                     0.357 |      0.349 |
| XGBoost                      | 0.437     |                       0.444 |                   0.373 |                    0.397 |                                     0.397 |      0.476 |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| KNN                          |   0.363 |                       0.454 |                   0.281 | 0.482                    |                                     0.469 |      0.434 |
| LinearSVM                    |   0.356 |                       0.495 |                   0.39  | 0.472                    |                                     0.42  |      0.505 |
| LogisticRegression           |   0.36  |                       0.478 |                   0.376 | 0.450                    |                                     0.369 |      0.525 |
| LogisticRegressionElasticNet |   0.42  |                       0.467 |                   0.425 | 0.478                    |                                     0.463 |      0.505 |
| LogisticRegressionLasso      |   0.454 |                       0.542 |                   0.529 | 0.486                    |                                     0.481 |      0.447 |
| LogisticRegressionRidge      |   0.365 |                       0.526 |                   0.438 | 0.481                    |                                     0.422 |      0.56  |
| NaiveBayes                   |   0.394 |                       0.426 |                   0.444 | 0.441                    |                                     0.535 |      0.485 |
| RandomForest                 |   0.42  |                       0.459 |                   0.357 | 0.386                    |                                     0.437 |      0.529 |
| RidgeClassifier              |   0.401 |                       0.502 |                   0.473 | 0.434                    |                                     0.382 |      0.407 |
| SVM                          |   0.424 |                       0.419 |                   0.412 | **0.573**                |                                     0.431 |      0.476 |
| XGBoost                      |   0.338 |                       0.427 |                   0.459 | 0.413                    |                                     0.422 |      0.481 |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.048 |                       0.048 |                   0.024 |                    0.071 |                                     0     | 0.048      |
| LinearSVM                    |   0     |                       0.071 |                   0.024 |                    0.048 |                                     0     | **0.143**  |
| LogisticRegression           |   0     |                       0.048 |                   0.048 |                    0.095 |                                     0     | 0.095      |
| LogisticRegressionElasticNet |   0.024 |                       0.024 |                   0.024 |                    0.071 |                                     0.024 | 0.119      |
| LogisticRegressionLasso      |   0.024 |                       0.024 |                   0.024 |                    0.024 |                                     0.071 | 0.095      |
| LogisticRegressionRidge      |   0     |                       0.071 |                   0     |                    0.024 |                                     0     | 0.024      |
| NaiveBayes                   |   0.024 |                       0     |                   0.024 |                    0.048 |                                     0.048 | 0.071      |
| RandomForest                 |   0     |                       0     |                   0     |                    0     |                                     0     | 0.024      |
| RidgeClassifier              |   0     |                       0.048 |                   0.048 |                    0.048 |                                     0     | 0.071      |
| SVM                          |   0.024 |                       0     |                   0.024 |                    0.024 |                                     0.024 | 0.119      |
| XGBoost                      |   0.024 |                       0.024 |                   0     |                    0.024 |                                     0.024 | 0.024      |

  report_table.reset_index().to_latex(latex_file, index=False)


# German

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.156 |                       0.361 |                   0.388 |                    0.379 |                                     0.417 | 0.449      |
| LinearSVM                    |   0.364 |                       0.458 |                   0.48  |                    0.462 |                                     0.433 | 0.452      |
| LogisticRegression           |   0.347 |                       0.465 |                   0.443 |                    0.489 |                                     0.439 | 0.518      |
| LogisticRegressionElasticNet |   0.339 |                       0.469 |                   0.455 |                    0.474 |                                     0.447 | 0.534      |
| LogisticRegressionLasso      |   0.348 |                       0.467 |                   0.449 |                    0.463 |                                     0.458 | 0.542      |
| LogisticRegressionRidge      |   0.363 |                       0.514 |                   0.445 |                    0.483 |                                     0.457 | 0.523      |
| NaiveBayes                   |   0.337 |                       0.494 |                   0.485 |                    0.484 |                                     0.529 | 0.488      |
| RandomForest                 |   0.389 |                       0.522 |                   0.527 |                    0.511 |                                     0.499 | 0.541      |
| RidgeClassifier              |   0.353 |                       0.46  |                   0.427 |                    0.449 |                                     0.485 | 0.477      |
| SVM                          |   0.397 |                       0.5   |                   0.465 |                    0.501 |                                     0.532 | 0.526      |
| XGBoost                      |   0.475 |                       0.55  |                   0.501 |                    0.509 |                                     0.501 | **0.567**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   | title     |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|:----------|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          | 0.140     |                       0.36  |                   0.366 |                    0.36  |                                     0.436 | 0.471      |
| LinearSVM                    | 0.331     |                       0.453 |                   0.465 |                    0.442 |                                     0.43  | 0.453      |
| LogisticRegression           | 0.314     |                       0.448 |                   0.43  |                    0.488 |                                     0.442 | 0.535      |
| LogisticRegressionElasticNet | 0.297     |                       0.436 |                   0.43  |                    0.453 |                                     0.453 | 0.552      |
| LogisticRegressionLasso      | 0.314     |                       0.448 |                   0.407 |                    0.436 |                                     0.436 | 0.552      |
| LogisticRegressionRidge      | 0.331     |                       0.517 |                   0.43  |                    0.471 |                                     0.477 | 0.558      |
| NaiveBayes                   | 0.267     |                       0.488 |                   0.483 |                    0.483 |                                     0.576 | 0.500      |
| RandomForest                 | 0.477     |                       0.587 |                   0.558 |                    0.506 |                                     0.483 | 0.512      |
| RidgeClassifier              | 0.320     |                       0.442 |                   0.395 |                    0.424 |                                     0.471 | 0.448      |
| SVM                          | 0.366     |                       0.506 |                   0.442 |                    0.506 |                                     0.535 | 0.517      |
| XGBoost                      | **0.622** |                       0.616 |                   0.529 |                    0.517 |                                     0.535 | **0.622**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title | title and first paragraph   |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|:----------------------------|------------------------:|-------------------------:|------------------------------------------:|-----------:|
| KNN                          |   0.309 | 0.371                       |                   0.426 |                    0.417 |                                     0.402 |      0.492 |
| LinearSVM                    |   0.462 | 0.518                       |                   0.528 |                    0.514 |                                     0.465 |      0.503 |
| LogisticRegression           |   0.459 | 0.535                       |                   0.492 |                    0.527 |                                     0.45  |      0.516 |
| LogisticRegressionElasticNet |   0.499 | **0.617**                   |                   0.499 |                    0.532 |                                     0.468 |      0.535 |
| LogisticRegressionLasso      |   0.454 | 0.599                       |                   0.512 |                    0.504 |                                     0.508 |      0.552 |
| LogisticRegressionRidge      |   0.458 | 0.550                       |                   0.493 |                    0.521 |                                     0.451 |      0.52  |
| NaiveBayes                   |   0.493 | 0.566                       |                   0.522 |                    0.511 |                                     0.5   |      0.536 |
| RandomForest                 |   0.523 | 0.562                       |                   0.534 |                    0.542 |                                     0.549 |      0.593 |
| RidgeClassifier              |   0.48  | 0.549                       |                   0.512 |                    0.5   |                                     0.575 |      0.552 |
| SVM                          |   0.49  | 0.528                       |                   0.576 |                    0.557 |                                     0.545 |      0.55  |
| XGBoost                      |   0.436 | 0.555                       |                   0.507 |                    0.547 |                                     0.55  |      0.549 |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.029 |                           0 |                       0 |                    0     |                                     0     | 0.029      |
| LinearSVM                    |   0     |                           0 |                       0 |                    0     |                                     0     | 0.000      |
| LogisticRegression           |   0     |                           0 |                       0 |                    0     |                                     0     | 0.000      |
| LogisticRegressionElasticNet |   0     |                           0 |                       0 |                    0     |                                     0.029 | 0.000      |
| LogisticRegressionLasso      |   0     |                           0 |                       0 |                    0     |                                     0     | 0.000      |
| LogisticRegressionRidge      |   0     |                           0 |                       0 |                    0     |                                     0     | 0.029      |
| NaiveBayes                   |   0     |                           0 |                       0 |                    0     |                                     0     | **0.057**  |
| RandomForest                 |   0     |                           0 |                       0 |                    0.029 |                                     0     | 0.029      |
| RidgeClassifier              |   0     |                           0 |                       0 |                    0     |                                     0     | 0.029      |
| SVM                          |   0     |                           0 |                       0 |                    0     |                                     0.029 | 0.029      |
| XGBoost                      |   0.029 |                           0 |                       0 |                    0     |                                     0     | 0.000      |

  report_table.reset_index().to_latex(latex_file, index=False)


# Italian

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.391 |                       0.364 |                   0.322 |                    0.549 |                                     0.379 | 0.470      |
| LinearSVM                    |   0.441 |                       0.428 |                   0.472 |                    0.486 |                                     0.414 | 0.552      |
| LogisticRegression           |   0.444 |                       0.444 |                   0.442 |                    0.498 |                                     0.488 | 0.558      |
| LogisticRegressionElasticNet |   0.416 |                       0.431 |                   0.464 |                    0.483 |                                     0.479 | 0.552      |
| LogisticRegressionLasso      |   0.388 |                       0.467 |                   0.468 |                    0.474 |                                     0.488 | 0.507      |
| LogisticRegressionRidge      |   0.429 |                       0.432 |                   0.472 |                    0.498 |                                     0.444 | 0.538      |
| NaiveBayes                   |   0.512 |                       0.481 |                   0.474 |                    0.475 |                                     0.484 | 0.537      |
| RandomForest                 |   0.441 |                       0.497 |                   0.501 |                    0.519 |                                     0.516 | **0.561**  |
| RidgeClassifier              |   0.439 |                       0.439 |                   0.452 |                    0.467 |                                     0.462 | 0.534      |
| SVM                          |   0.485 |                       0.485 |                   0.5   |                    0.524 |                                     0.514 | 0.533      |
| XGBoost                      |   0.411 |                       0.468 |                   0.48  |                    0.501 |                                     0.508 | 0.535      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph |   raw text |
|:-----------------------------|--------:|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|-----------:|
| KNN                          |   0.357 |                       0.343 |                   0.257 | **0.661**                |                                     0.348 |      0.465 |
| LinearSVM                    |   0.443 |                       0.413 |                   0.457 | 0.448                    |                                     0.365 |      0.53  |
| LogisticRegression           |   0.43  |                       0.443 |                   0.43  | 0.517                    |                                     0.474 |      0.565 |
| LogisticRegressionElasticNet |   0.4   |                       0.422 |                   0.439 | 0.461                    |                                     0.422 |      0.522 |
| LogisticRegressionLasso      |   0.348 |                       0.439 |                   0.474 | 0.435                    |                                     0.461 |      0.474 |
| LogisticRegressionRidge      |   0.422 |                       0.417 |                   0.452 | 0.478                    |                                     0.413 |      0.509 |
| NaiveBayes                   |   0.561 |                       0.491 |                   0.439 | 0.435                    |                                     0.47  |      0.526 |
| RandomForest                 |   0.478 |                       0.526 |                   0.565 | 0.509                    |                                     0.522 |      0.509 |
| RidgeClassifier              |   0.443 |                       0.413 |                   0.404 | 0.413                    |                                     0.404 |      0.465 |
| SVM                          |   0.513 |                       0.491 |                   0.452 | 0.487                    |                                     0.474 |      0.483 |
| XGBoost                      |   0.522 |                       0.548 |                   0.526 | 0.574                    |                                     0.517 |      0.57  |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.451 |                       0.42  |                   0.496 |                    0.49  |                                     0.471 | 0.560      |
| LinearSVM                    |   0.46  |                       0.47  |                   0.508 |                    0.556 |                                     0.537 | 0.614      |
| LogisticRegression           |   0.479 |                       0.463 |                   0.471 |                    0.492 |                                     0.549 | 0.578      |
| LogisticRegressionElasticNet |   0.465 |                       0.463 |                   0.509 |                    0.517 |                                     0.668 | 0.643      |
| LogisticRegressionLasso      |   0.49  |                       0.515 |                   0.479 |                    0.55  |                                     0.57  | 0.616      |
| LogisticRegressionRidge      |   0.467 |                       0.476 |                   0.52  |                    0.532 |                                     0.55  | 0.626      |
| NaiveBayes                   |   0.487 |                       0.496 |                   0.546 |                    0.568 |                                     0.602 | 0.595      |
| RandomForest                 |   0.473 |                       0.493 |                   0.477 |                    0.54  |                                     0.532 | 0.667      |
| RidgeClassifier              |   0.462 |                       0.493 |                   0.551 |                    0.581 |                                     0.602 | **0.722**  |
| SVM                          |   0.486 |                       0.508 |                   0.575 |                    0.637 |                                     0.612 | 0.631      |
| XGBoost                      |   0.353 |                       0.417 |                   0.448 |                    0.453 |                                     0.512 | 0.527      |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.017 |                       0.067 |                   0.05  |                    0.017 |                                     0.067 | 0.117      |
| LinearSVM                    |   0.017 |                       0.083 |                   0.033 |                    0.067 |                                     0.05  | 0.083      |
| LogisticRegression           |   0.067 |                       0.05  |                   0.05  |                    0.033 |                                     0.033 | 0.050      |
| LogisticRegressionElasticNet |   0.033 |                       0.083 |                   0.05  |                    0.067 |                                     0.083 | 0.100      |
| LogisticRegressionLasso      |   0.083 |                       0.033 |                   0.033 |                    0.05  |                                     0.067 | 0.050      |
| LogisticRegressionRidge      |   0.033 |                       0.083 |                   0.033 |                    0.017 |                                     0.067 | 0.100      |
| NaiveBayes                   |   0.033 |                       0.067 |                   0.05  |                    0.067 |                                     0.067 | 0.083      |
| RandomForest                 |   0     |                       0     |                   0.033 |                    0.067 |                                     0.017 | 0.067      |
| RidgeClassifier              |   0.05  |                       0.083 |                   0.067 |                    0.067 |                                     0.083 | **0.150**  |
| SVM                          |   0.033 |                       0.05  |                   0.067 |                    0.067 |                                     0.083 | 0.083      |
| XGBoost                      |   0     |                       0     |                   0.033 |                    0.05  |                                     0     | 0.050      |

  report_table.reset_index().to_latex(latex_file, index=False)


# Polish

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.482 |                       0.429 |                   0.428 |                    0.39  |                                     0.066 | 0.462      |
| LinearSVM                    |   0.545 |                       0.48  |                   0.537 |                    0.557 |                                     0.572 | 0.601      |
| LogisticRegression           |   0.54  |                       0.486 |                   0.535 |                    0.545 |                                     0.553 | 0.629      |
| LogisticRegressionElasticNet |   0.543 |                       0.502 |                   0.554 |                    0.533 |                                     0.566 | 0.623      |
| LogisticRegressionLasso      |   0.422 |                       0.494 |                   0.5   |                    0.549 |                                     0.547 | **0.652**  |
| LogisticRegressionRidge      |   0.553 |                       0.506 |                   0.554 |                    0.556 |                                     0.596 | 0.606      |
| NaiveBayes                   |   0.537 |                       0.505 |                   0.526 |                    0.552 |                                     0.61  | 0.624      |
| RandomForest                 |   0.379 |                       0.528 |                   0.544 |                    0.519 |                                     0.58  | 0.640      |
| RidgeClassifier              |   0.561 |                       0.503 |                   0.509 |                    0.531 |                                     0.592 | 0.590      |
| SVM                          |   0.55  |                       0.541 |                   0.527 |                    0.539 |                                     0.599 | 0.614      |
| XGBoost                      |   0.57  |                       0.563 |                   0.564 |                    0.563 |                                     0.59  | 0.649      |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.451 |                       0.427 |                   0.388 |                    0.335 |                                     0.097 | 0.403      |
| LinearSVM                    |   0.558 |                       0.471 |                   0.539 |                    0.558 |                                     0.568 | 0.597      |
| LogisticRegression           |   0.524 |                       0.485 |                   0.563 |                    0.534 |                                     0.544 | 0.655      |
| LogisticRegressionElasticNet |   0.519 |                       0.495 |                   0.553 |                    0.51  |                                     0.573 | 0.641      |
| LogisticRegressionLasso      |   0.35  |                       0.427 |                   0.466 |                    0.519 |                                     0.529 | **0.733**  |
| LogisticRegressionRidge      |   0.578 |                       0.549 |                   0.549 |                    0.544 |                                     0.621 | 0.612      |
| NaiveBayes                   |   0.49  |                       0.481 |                   0.524 |                    0.549 |                                     0.626 | 0.646      |
| RandomForest                 |   0.398 |                       0.539 |                   0.553 |                    0.524 |                                     0.505 | 0.563      |
| RidgeClassifier              |   0.534 |                       0.505 |                   0.51  |                    0.49  |                                     0.578 | 0.558      |
| SVM                          |   0.549 |                       0.539 |                   0.524 |                    0.524 |                                     0.597 | 0.558      |
| XGBoost                      |   0.728 |                       0.67  |                   0.646 |                    0.626 |                                     0.617 | 0.709      |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.583 |                       0.453 |                   0.575 |                    0.51  |                                     0.05  | 0.577      |
| LinearSVM                    |   0.549 |                       0.52  |                   0.586 |                    0.593 |                                     0.67  | 0.685      |
| LogisticRegression           |   0.584 |                       0.516 |                   0.536 |                    0.571 |                                     0.621 | 0.645      |
| LogisticRegressionElasticNet |   0.619 |                       0.542 |                   0.602 |                    0.586 |                                     0.597 | 0.667      |
| LogisticRegressionLasso      |   0.614 |                       0.614 |                   0.585 |                    0.614 |                                     0.601 | 0.624      |
| LogisticRegressionRidge      |   0.549 |                       0.515 |                   0.62  |                    0.608 |                                     0.61  | 0.669      |
| NaiveBayes                   |   0.625 |                       0.564 |                   0.567 |                    0.584 |                                     0.624 | 0.658      |
| RandomForest                 |   0.636 |                       0.591 |                   0.57  |                    0.54  |                                     0.709 | **0.784**  |
| RidgeClassifier              |   0.625 |                       0.526 |                   0.554 |                    0.657 |                                     0.724 | 0.713      |
| SVM                          |   0.569 |                       0.581 |                   0.625 |                    0.579 |                                     0.733 | 0.737      |
| XGBoost                      |   0.501 |                       0.524 |                   0.535 |                    0.55  |                                     0.601 | 0.640      |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph | title and 5 sentences   |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|:------------------------|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |       0 |                       0     | 0.000                   |                    0.026 |                                     0.026 | 0.026      |
| LinearSVM                    |       0 |                       0.026 | 0.026                   |                    0.026 |                                     0.026 | **0.051**  |
| LogisticRegression           |       0 |                       0     | 0.000                   |                    0.026 |                                     0     | **0.051**  |
| LogisticRegressionElasticNet |       0 |                       0     | 0.026                   |                    0.026 |                                     0     | 0.026      |
| LogisticRegressionLasso      |       0 |                       0     | 0.000                   |                    0.026 |                                     0     | 0.026      |
| LogisticRegressionRidge      |       0 |                       0     | 0.026                   |                    0.026 |                                     0     | 0.026      |
| NaiveBayes                   |       0 |                       0     | 0.026                   |                    0.026 |                                     0     | 0.026      |
| RandomForest                 |       0 |                       0     | 0.026                   |                    0.026 |                                     0.026 | 0.026      |
| RidgeClassifier              |       0 |                       0     | 0.026                   |                    0.026 |                                     0.026 | 0.026      |
| SVM                          |       0 |                       0     | **0.051**               |                    0.026 |                                     0.026 | **0.051**  |
| XGBoost                      |       0 |                       0     | 0.000                   |                    0     |                                     0     | 0.026      |

  report_table.reset_index().to_latex(latex_file, index=False)


# Russian

## f1_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.275 |                       0.203 |                   0.27  |                    0.3   |                                     0.323 | 0.258      |
| LinearSVM                    |   0.324 |                       0.347 |                   0.273 |                    0.369 |                                     0.324 | 0.353      |
| LogisticRegression           |   0.354 |                       0.314 |                   0.299 |                    0.329 |                                     0.3   | 0.381      |
| LogisticRegressionElasticNet |   0.331 |                       0.366 |                   0.339 |                    0.352 |                                     0.3   | 0.402      |
| LogisticRegressionLasso      |   0.309 |                       0.307 |                   0.283 |                    0.39  |                                     0.316 | 0.411      |
| LogisticRegressionRidge      |   0.345 |                       0.359 |                   0.343 |                    0.346 |                                     0.311 | 0.371      |
| NaiveBayes                   |   0.363 |                       0.317 |                   0.33  |                    0.372 |                                     0.351 | 0.398      |
| RandomForest                 |   0.3   |                       0.311 |                   0.313 |                    0.277 |                                     0.341 | 0.324      |
| RidgeClassifier              |   0.333 |                       0.365 |                   0.299 |                    0.333 |                                     0.318 | 0.338      |
| SVM                          |   0.388 |                       0.435 |                   0.365 |                    0.354 |                                     0.364 | 0.379      |
| XGBoost                      |   0.275 |                       0.259 |                   0.32  |                    0.347 |                                     0.367 | **0.454**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## recall_micro

| model_type                   | title     |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|:----------|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          | 0.256     |                       0.174 |                   0.267 |                    0.279 |                                     0.326 | 0.233      |
| LinearSVM                    | 0.314     |                       0.349 |                   0.221 |                    0.326 |                                     0.267 | 0.326      |
| LogisticRegression           | 0.360     |                       0.302 |                   0.256 |                    0.326 |                                     0.267 | 0.395      |
| LogisticRegressionElasticNet | 0.314     |                       0.337 |                   0.314 |                    0.326 |                                     0.267 | 0.372      |
| LogisticRegressionLasso      | 0.233     |                       0.326 |                   0.256 |                    0.419 |                                     0.302 | 0.395      |
| LogisticRegressionRidge      | 0.349     |                       0.337 |                   0.326 |                    0.302 |                                     0.256 | 0.337      |
| NaiveBayes                   | 0.419     |                       0.302 |                   0.256 |                    0.326 |                                     0.337 | 0.360      |
| RandomForest                 | **0.535** |                       0.477 |                   0.326 |                    0.233 |                                     0.326 | 0.302      |
| RidgeClassifier              | 0.326     |                       0.36  |                   0.244 |                    0.279 |                                     0.267 | 0.291      |
| SVM                          | 0.419     |                       0.465 |                   0.326 |                    0.302 |                                     0.326 | 0.326      |
| XGBoost                      | 0.407     |                       0.326 |                   0.337 |                    0.36  |                                     0.372 | **0.535**  |

  report_table.reset_index().to_latex(latex_file, index=False)


## precision_micro

| model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0.413 |                       0.342 |                   0.298 |                    0.362 |                                     0.375 | 0.317      |
| LinearSVM                    |   0.392 |                       0.41  |                   0.376 |                    0.444 |                                     0.452 | 0.459      |
| LogisticRegression           |   0.403 |                       0.376 |                   0.436 |                    0.355 |                                     0.38  | 0.401      |
| LogisticRegressionElasticNet |   0.457 |                       0.459 |                   0.447 |                    0.411 |                                     0.368 | 0.461      |
| LogisticRegressionLasso      |   0.539 |                       0.331 |                   0.362 |                    0.388 |                                     0.358 | 0.464      |
| LogisticRegressionRidge      |   0.39  |                       0.455 |                   0.447 |                    0.422 |                                     0.446 | 0.470      |
| NaiveBayes                   |   0.358 |                       0.421 |                   0.476 |                    0.453 |                                     0.405 | **0.567**  |
| RandomForest                 |   0.245 |                       0.246 |                   0.313 |                    0.366 |                                     0.402 | 0.409      |
| RidgeClassifier              |   0.398 |                       0.43  |                   0.42  |                    0.441 |                                     0.448 | 0.422      |
| SVM                          |   0.385 |                       0.474 |                   0.498 |                    0.497 |                                     0.526 | 0.555      |
| XGBoost                      |   0.335 |                       0.264 |                   0.354 |                    0.385 |                                     0.4   | 0.452      |

  report_table.reset_index().to_latex(latex_file, index=False)


## accuracy

| model_type                   |   title |   title and first paragraph | title and 5 sentences   |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------------------------|--------:|----------------------------:|:------------------------|-------------------------:|------------------------------------------:|:-----------|
| KNN                          |   0     |                       0.026 | 0.079                   |                    0.105 |                                     0.053 | 0.079      |
| LinearSVM                    |   0     |                       0.053 | 0.053                   |                    0.105 |                                     0.053 | 0.105      |
| LogisticRegression           |   0     |                       0.053 | 0.053                   |                    0     |                                     0.053 | 0.105      |
| LogisticRegressionElasticNet |   0     |                       0.079 | 0.053                   |                    0.079 |                                     0.079 | **0.158**  |
| LogisticRegressionLasso      |   0.026 |                       0.026 | 0.053                   |                    0.053 |                                     0.053 | 0.132      |
| LogisticRegressionRidge      |   0     |                       0.079 | 0.026                   |                    0.079 |                                     0.079 | 0.105      |
| NaiveBayes                   |   0     |                       0.053 | **0.158**               |                    0.132 |                                     0.053 | 0.079      |
| RandomForest                 |   0     |                       0     | 0.000                   |                    0.026 |                                     0.053 | 0.105      |
| RidgeClassifier              |   0.026 |                       0.079 | 0.132                   |                    0.053 |                                     0.053 | 0.079      |
| SVM                          |   0     |                       0.053 | **0.158**               |                    0.053 |                                     0.105 | 0.053      |
| XGBoost                      |   0.026 |                       0     | 0.053                   |                    0.105 |                                     0.132 | **0.158**  |

  report_table.reset_index().to_latex(latex_file, index=False)


# All 6 Languages

## f1_micro

| language   | model_type                   |   title |   title and first paragraph |   title and 5 sentences |   title and 10 sentences |   title and first sentence each paragraph | raw text   |
|:-----------|:-----------------------------|--------:|----------------------------:|------------------------:|-------------------------:|------------------------------------------:|:-----------|
| en         | KNN                          |   0     |                       0.608 |                   0.664 |                    0.683 |                                     0.631 | 0.677      |
| en         | LinearSVM                    |   0     |                       0.674 |                   0.689 |                    0.703 |                                     0.664 | 0.697      |
| en         | LogisticRegression           |   0.565 |                       0.628 |                   0.621 |                    0.642 |                                     0.683 | 0.684      |
| en         | LogisticRegressionElasticNet |   0     |                       0.641 |                   0.642 |                    0.657 |                                     0.683 | 0.693      |
| en         | LogisticRegressionLasso      |   0.542 |                       0.631 |                   0.653 |                    0.634 |                                     0.664 | 0.664      |
| en         | LogisticRegressionRidge      |   0.621 |                       0.657 |                   0.673 |                    0.698 |                                     0.669 | 0.696      |
| en         | NaiveBayes                   |   0     |                       0     |                   0.692 |                    0.693 |                                     0.687 | **0.705**  |
| en         | RandomForest                 |   0     |                       0     |                   0.64  |                    0.65  |                                     0.646 | 0.695      |
| en         | RidgeClassifier              |   0     |                       0.638 |                   0.672 |                    0.69  |                                     0.653 | 0.675      |
| en         | SVM                          |   0     |                       0.68  |                   0.689 |                    0.689 |                                     0.679 | 0.696      |
| en         | XGBoost                      |   0     |                       0.65  |                   0.657 |                    0.661 |                                     0.66  | 0.684      |
| fr         | KNN                          |   0.278 |                       0.344 |                   0.217 |                    0.415 |                                     0.344 | 0.437      |
| fr         | LinearSVM                    |   0.27  |                       0.408 |                   0.36  |                    0.325 |                                     0.366 | 0.389      |
| fr         | LogisticRegression           |   0.295 |                       0.428 |                   0.362 |                    0.356 |                                     0.354 | 0.402      |
| fr         | LogisticRegressionElasticNet |   0.28  |                       0.436 |                   0.364 |                    0.314 |                                     0.401 | 0.398      |
| fr         | LogisticRegressionLasso      |   0.279 |                       0.423 |                   0.374 |                    0.369 |                                     0.389 | 0.382      |
| fr         | LogisticRegressionRidge      |   0.316 |                       0.424 |                   0.396 |                    0.324 |                                     0.365 | **0.452**  |
| fr         | NaiveBayes                   |   0.358 |                       0.388 |                   0.391 |                    0.365 |                                     0.431 | 0.400      |
| fr         | RandomForest                 |   0.306 |                       0.395 |                   0.361 |                    0.365 |                                     0.394 | 0.437      |
| fr         | RidgeClassifier              |   0.332 |                       0.412 |                   0.35  |                    0.288 |                                     0.319 | 0.317      |
| fr         | SVM                          |   0.371 |                       0.398 |                   0.374 |                    0.353 |                                     0.365 | 0.381      |
| fr         | XGBoost                      |   0.346 |                       0.371 |                   0.374 |                    0.375 |                                     0.373 | 0.448      |
| ge         | KNN                          |   0.156 |                       0.361 |                   0.388 |                    0.379 |                                     0.417 | 0.449      |
| ge         | LinearSVM                    |   0.364 |                       0.458 |                   0.48  |                    0.462 |                                     0.433 | 0.452      |
| ge         | LogisticRegression           |   0.347 |                       0.465 |                   0.443 |                    0.489 |                                     0.439 | 0.518      |
| ge         | LogisticRegressionElasticNet |   0.339 |                       0.469 |                   0.455 |                    0.474 |                                     0.447 | 0.534      |
| ge         | LogisticRegressionLasso      |   0.348 |                       0.467 |                   0.449 |                    0.463 |                                     0.458 | 0.542      |
| ge         | LogisticRegressionRidge      |   0.363 |                       0.514 |                   0.445 |                    0.483 |                                     0.457 | 0.523      |
| ge         | NaiveBayes                   |   0.337 |                       0.494 |                   0.485 |                    0.484 |                                     0.529 | 0.488      |
| ge         | RandomForest                 |   0.389 |                       0.522 |                   0.527 |                    0.511 |                                     0.499 | 0.541      |
| ge         | RidgeClassifier              |   0.353 |                       0.46  |                   0.427 |                    0.449 |                                     0.485 | 0.477      |
| ge         | SVM                          |   0.397 |                       0.5   |                   0.465 |                    0.501 |                                     0.532 | 0.526      |
| ge         | XGBoost                      |   0.475 |                       0.55  |                   0.501 |                    0.509 |                                     0.501 | **0.567**  |
| it         | KNN                          |   0.391 |                       0.364 |                   0.322 |                    0.549 |                                     0.379 | 0.470      |
| it         | LinearSVM                    |   0.441 |                       0.428 |                   0.472 |                    0.486 |                                     0.414 | 0.552      |
| it         | LogisticRegression           |   0.444 |                       0.444 |                   0.442 |                    0.498 |                                     0.488 | 0.558      |
| it         | LogisticRegressionElasticNet |   0.416 |                       0.431 |                   0.464 |                    0.483 |                                     0.479 | 0.552      |
| it         | LogisticRegressionLasso      |   0.388 |                       0.467 |                   0.468 |                    0.474 |                                     0.488 | 0.507      |
| it         | LogisticRegressionRidge      |   0.429 |                       0.432 |                   0.472 |                    0.498 |                                     0.444 | 0.538      |
| it         | NaiveBayes                   |   0.512 |                       0.481 |                   0.474 |                    0.475 |                                     0.484 | 0.537      |
| it         | RandomForest                 |   0.441 |                       0.497 |                   0.501 |                    0.519 |                                     0.516 | **0.561**  |
| it         | RidgeClassifier              |   0.439 |                       0.439 |                   0.452 |                    0.467 |                                     0.462 | 0.534      |
| it         | SVM                          |   0.485 |                       0.485 |                   0.5   |                    0.524 |                                     0.514 | 0.533      |
| it         | XGBoost                      |   0.411 |                       0.468 |                   0.48  |                    0.501 |                                     0.508 | 0.535      |
| po         | KNN                          |   0.482 |                       0.429 |                   0.428 |                    0.39  |                                     0.066 | 0.462      |
| po         | LinearSVM                    |   0.545 |                       0.48  |                   0.537 |                    0.557 |                                     0.572 | 0.601      |
| po         | LogisticRegression           |   0.54  |                       0.486 |                   0.535 |                    0.545 |                                     0.553 | 0.629      |
| po         | LogisticRegressionElasticNet |   0.543 |                       0.502 |                   0.554 |                    0.533 |                                     0.566 | 0.623      |
| po         | LogisticRegressionLasso      |   0.422 |                       0.494 |                   0.5   |                    0.549 |                                     0.547 | **0.652**  |
| po         | LogisticRegressionRidge      |   0.553 |                       0.506 |                   0.554 |                    0.556 |                                     0.596 | 0.606      |
| po         | NaiveBayes                   |   0.537 |                       0.505 |                   0.526 |                    0.552 |                                     0.61  | 0.624      |
| po         | RandomForest                 |   0.379 |                       0.528 |                   0.544 |                    0.519 |                                     0.58  | 0.640      |
| po         | RidgeClassifier              |   0.561 |                       0.503 |                   0.509 |                    0.531 |                                     0.592 | 0.590      |
| po         | SVM                          |   0.55  |                       0.541 |                   0.527 |                    0.539 |                                     0.599 | 0.614      |
| po         | XGBoost                      |   0.57  |                       0.563 |                   0.564 |                    0.563 |                                     0.59  | 0.649      |
| ru         | KNN                          |   0.275 |                       0.203 |                   0.27  |                    0.3   |                                     0.323 | 0.258      |
| ru         | LinearSVM                    |   0.324 |                       0.347 |                   0.273 |                    0.369 |                                     0.324 | 0.353      |
| ru         | LogisticRegression           |   0.354 |                       0.314 |                   0.299 |                    0.329 |                                     0.3   | 0.381      |
| ru         | LogisticRegressionElasticNet |   0.331 |                       0.366 |                   0.339 |                    0.352 |                                     0.3   | 0.402      |
| ru         | LogisticRegressionLasso      |   0.309 |                       0.307 |                   0.283 |                    0.39  |                                     0.316 | 0.411      |
| ru         | LogisticRegressionRidge      |   0.345 |                       0.359 |                   0.343 |                    0.346 |                                     0.311 | 0.371      |
| ru         | NaiveBayes                   |   0.363 |                       0.317 |                   0.33  |                    0.372 |                                     0.351 | 0.398      |
| ru         | RandomForest                 |   0.3   |                       0.311 |                   0.313 |                    0.277 |                                     0.341 | 0.324      |
| ru         | RidgeClassifier              |   0.333 |                       0.365 |                   0.299 |                    0.333 |                                     0.318 | 0.338      |
| ru         | SVM                          |   0.388 |                       0.435 |                   0.365 |                    0.354 |                                     0.364 | 0.379      |
| ru         | XGBoost                      |   0.275 |                       0.259 |                   0.32  |                    0.347 |                                     0.367 | **0.454**  |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## recall_micro

| language   | model_type                   | title     |   title and first paragraph |   title and 5 sentences | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------|:-----------------------------|:----------|----------------------------:|------------------------:|:-------------------------|------------------------------------------:|:-----------|
| en         | KNN                          | 0         |                       0.555 |                   0.66  | 0.699                    |                                     0.555 | 0.638      |
| en         | LinearSVM                    | 0         |                       0.665 |                   0.665 | 0.697                    |                                     0.621 | 0.667      |
| en         | LogisticRegression           | 0.548     |                       0.611 |                   0.614 | 0.638                    |                                     0.677 | 0.648      |
| en         | LogisticRegressionElasticNet | 0         |                       0.619 |                   0.645 | 0.641                    |                                     0.65  | 0.653      |
| en         | LogisticRegressionLasso      | 0.491     |                       0.601 |                   0.619 | 0.606                    |                                     0.628 | 0.636      |
| en         | LogisticRegressionRidge      | 0.611     |                       0.611 |                   0.648 | 0.692                    |                                     0.631 | 0.660      |
| en         | NaiveBayes                   | 0         |                       0     |                   0.655 | 0.667                    |                                     0.653 | 0.694      |
| en         | RandomForest                 | 0         |                       0     |                   0.621 | 0.623                    |                                     0.592 | 0.675      |
| en         | RidgeClassifier              | 0         |                       0.579 |                   0.623 | 0.648                    |                                     0.614 | 0.643      |
| en         | SVM                          | 0         |                       0.628 |                   0.66  | 0.660                    |                                     0.631 | 0.680      |
| en         | XGBoost                      | 0         |                       0.68  |                   0.689 | 0.694                    |                                     0.655 | **0.711**  |
| fr         | KNN                          | 0.254     |                       0.31  |                   0.198 | 0.397                    |                                     0.341 | 0.492      |
| fr         | LinearSVM                    | 0.270     |                       0.381 |                   0.357 | 0.294                    |                                     0.357 | 0.365      |
| fr         | LogisticRegression           | 0.294     |                       0.413 |                   0.365 | 0.325                    |                                     0.357 | 0.381      |
| fr         | LogisticRegressionElasticNet | 0.246     |                       0.437 |                   0.357 | 0.270                    |                                     0.397 | 0.365      |
| fr         | LogisticRegressionLasso      | 0.246     |                       0.405 |                   0.333 | 0.357                    |                                     0.357 | 0.373      |
| fr         | LogisticRegressionRidge      | 0.333     |                       0.389 |                   0.405 | 0.286                    |                                     0.349 | 0.500      |
| fr         | NaiveBayes                   | 0.381     |                       0.397 |                   0.389 | 0.341                    |                                     0.444 | 0.413      |
| fr         | RandomForest                 | **0.556** |                       0.532 |                   0.452 | 0.397                    |                                     0.421 | 0.421      |
| fr         | RidgeClassifier              | 0.333     |                       0.397 |                   0.325 | 0.246                    |                                     0.294 | 0.278      |
| fr         | SVM                          | 0.381     |                       0.413 |                   0.381 | 0.317                    |                                     0.357 | 0.349      |
| fr         | XGBoost                      | 0.437     |                       0.444 |                   0.373 | 0.397                    |                                     0.397 | 0.476      |
| ge         | KNN                          | 0.140     |                       0.36  |                   0.366 | 0.360                    |                                     0.436 | 0.471      |
| ge         | LinearSVM                    | 0.331     |                       0.453 |                   0.465 | 0.442                    |                                     0.43  | 0.453      |
| ge         | LogisticRegression           | 0.314     |                       0.448 |                   0.43  | 0.488                    |                                     0.442 | 0.535      |
| ge         | LogisticRegressionElasticNet | 0.297     |                       0.436 |                   0.43  | 0.453                    |                                     0.453 | 0.552      |
| ge         | LogisticRegressionLasso      | 0.314     |                       0.448 |                   0.407 | 0.436                    |                                     0.436 | 0.552      |
| ge         | LogisticRegressionRidge      | 0.331     |                       0.517 |                   0.43  | 0.471                    |                                     0.477 | 0.558      |
| ge         | NaiveBayes                   | 0.267     |                       0.488 |                   0.483 | 0.483                    |                                     0.576 | 0.500      |
| ge         | RandomForest                 | 0.477     |                       0.587 |                   0.558 | 0.506                    |                                     0.483 | 0.512      |
| ge         | RidgeClassifier              | 0.320     |                       0.442 |                   0.395 | 0.424                    |                                     0.471 | 0.448      |
| ge         | SVM                          | 0.366     |                       0.506 |                   0.442 | 0.506                    |                                     0.535 | 0.517      |
| ge         | XGBoost                      | **0.622** |                       0.616 |                   0.529 | 0.517                    |                                     0.535 | **0.622**  |
| it         | KNN                          | 0.357     |                       0.343 |                   0.257 | **0.661**                |                                     0.348 | 0.465      |
| it         | LinearSVM                    | 0.443     |                       0.413 |                   0.457 | 0.448                    |                                     0.365 | 0.530      |
| it         | LogisticRegression           | 0.430     |                       0.443 |                   0.43  | 0.517                    |                                     0.474 | 0.565      |
| it         | LogisticRegressionElasticNet | 0.400     |                       0.422 |                   0.439 | 0.461                    |                                     0.422 | 0.522      |
| it         | LogisticRegressionLasso      | 0.348     |                       0.439 |                   0.474 | 0.435                    |                                     0.461 | 0.474      |
| it         | LogisticRegressionRidge      | 0.422     |                       0.417 |                   0.452 | 0.478                    |                                     0.413 | 0.509      |
| it         | NaiveBayes                   | 0.561     |                       0.491 |                   0.439 | 0.435                    |                                     0.47  | 0.526      |
| it         | RandomForest                 | 0.478     |                       0.526 |                   0.565 | 0.509                    |                                     0.522 | 0.509      |
| it         | RidgeClassifier              | 0.443     |                       0.413 |                   0.404 | 0.413                    |                                     0.404 | 0.465      |
| it         | SVM                          | 0.513     |                       0.491 |                   0.452 | 0.487                    |                                     0.474 | 0.483      |
| it         | XGBoost                      | 0.522     |                       0.548 |                   0.526 | 0.574                    |                                     0.517 | 0.570      |
| po         | KNN                          | 0.451     |                       0.427 |                   0.388 | 0.335                    |                                     0.097 | 0.403      |
| po         | LinearSVM                    | 0.558     |                       0.471 |                   0.539 | 0.558                    |                                     0.568 | 0.597      |
| po         | LogisticRegression           | 0.524     |                       0.485 |                   0.563 | 0.534                    |                                     0.544 | 0.655      |
| po         | LogisticRegressionElasticNet | 0.519     |                       0.495 |                   0.553 | 0.510                    |                                     0.573 | 0.641      |
| po         | LogisticRegressionLasso      | 0.350     |                       0.427 |                   0.466 | 0.519                    |                                     0.529 | **0.733**  |
| po         | LogisticRegressionRidge      | 0.578     |                       0.549 |                   0.549 | 0.544                    |                                     0.621 | 0.612      |
| po         | NaiveBayes                   | 0.490     |                       0.481 |                   0.524 | 0.549                    |                                     0.626 | 0.646      |
| po         | RandomForest                 | 0.398     |                       0.539 |                   0.553 | 0.524                    |                                     0.505 | 0.563      |
| po         | RidgeClassifier              | 0.534     |                       0.505 |                   0.51  | 0.490                    |                                     0.578 | 0.558      |
| po         | SVM                          | 0.549     |                       0.539 |                   0.524 | 0.524                    |                                     0.597 | 0.558      |
| po         | XGBoost                      | 0.728     |                       0.67  |                   0.646 | 0.626                    |                                     0.617 | 0.709      |
| ru         | KNN                          | 0.256     |                       0.174 |                   0.267 | 0.279                    |                                     0.326 | 0.233      |
| ru         | LinearSVM                    | 0.314     |                       0.349 |                   0.221 | 0.326                    |                                     0.267 | 0.326      |
| ru         | LogisticRegression           | 0.360     |                       0.302 |                   0.256 | 0.326                    |                                     0.267 | 0.395      |
| ru         | LogisticRegressionElasticNet | 0.314     |                       0.337 |                   0.314 | 0.326                    |                                     0.267 | 0.372      |
| ru         | LogisticRegressionLasso      | 0.233     |                       0.326 |                   0.256 | 0.419                    |                                     0.302 | 0.395      |
| ru         | LogisticRegressionRidge      | 0.349     |                       0.337 |                   0.326 | 0.302                    |                                     0.256 | 0.337      |
| ru         | NaiveBayes                   | 0.419     |                       0.302 |                   0.256 | 0.326                    |                                     0.337 | 0.360      |
| ru         | RandomForest                 | **0.535** |                       0.477 |                   0.326 | 0.233                    |                                     0.326 | 0.302      |
| ru         | RidgeClassifier              | 0.326     |                       0.36  |                   0.244 | 0.279                    |                                     0.267 | 0.291      |
| ru         | SVM                          | 0.419     |                       0.465 |                   0.326 | 0.302                    |                                     0.326 | 0.326      |
| ru         | XGBoost                      | 0.407     |                       0.326 |                   0.337 | 0.360                    |                                     0.372 | **0.535**  |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## precision_micro

| language   | model_type                   |   title | title and first paragraph   |   title and 5 sentences | title and 10 sentences   | title and first sentence each paragraph   | raw text   |
|:-----------|:-----------------------------|--------:|:----------------------------|------------------------:|:-------------------------|:------------------------------------------|:-----------|
| en         | KNN                          |   0     | 0.732                       |                   0.676 | 0.679                    | **0.790**                                 | 0.770      |
| en         | LinearSVM                    |   0     | 0.693                       |                   0.739 | 0.726                    | 0.733                                     | 0.746      |
| en         | LogisticRegression           |   0.604 | 0.660                       |                   0.636 | 0.656                    | 0.701                                     | 0.742      |
| en         | LogisticRegressionElasticNet |   0     | 0.677                       |                   0.649 | 0.685                    | 0.737                                     | 0.755      |
| en         | LogisticRegressionLasso      |   0.623 | 0.678                       |                   0.701 | 0.673                    | 0.723                                     | 0.714      |
| en         | LogisticRegressionRidge      |   0.644 | 0.734                       |                   0.747 | 0.713                    | 0.728                                     | 0.752      |
| en         | NaiveBayes                   |   0     | 0                           |                   0.749 | 0.729                    | 0.747                                     | 0.728      |
| en         | RandomForest                 |   0     | 0                           |                   0.684 | 0.692                    | 0.752                                     | 0.729      |
| en         | RidgeClassifier              |   0     | 0.744                       |                   0.752 | 0.755                    | 0.718                                     | 0.726      |
| en         | SVM                          |   0     | 0.756                       |                   0.737 | 0.732                    | 0.757                                     | 0.737      |
| en         | XGBoost                      |   0     | 0.634                       |                   0.649 | 0.664                    | 0.682                                     | 0.672      |
| fr         | KNN                          |   0.363 | 0.454                       |                   0.281 | 0.482                    | 0.469                                     | 0.434      |
| fr         | LinearSVM                    |   0.356 | 0.495                       |                   0.39  | 0.472                    | 0.420                                     | 0.505      |
| fr         | LogisticRegression           |   0.36  | 0.478                       |                   0.376 | 0.450                    | 0.369                                     | 0.525      |
| fr         | LogisticRegressionElasticNet |   0.42  | 0.467                       |                   0.425 | 0.478                    | 0.463                                     | 0.505      |
| fr         | LogisticRegressionLasso      |   0.454 | 0.542                       |                   0.529 | 0.486                    | 0.481                                     | 0.447      |
| fr         | LogisticRegressionRidge      |   0.365 | 0.526                       |                   0.438 | 0.481                    | 0.422                                     | 0.560      |
| fr         | NaiveBayes                   |   0.394 | 0.426                       |                   0.444 | 0.441                    | 0.535                                     | 0.485      |
| fr         | RandomForest                 |   0.42  | 0.459                       |                   0.357 | 0.386                    | 0.437                                     | 0.529      |
| fr         | RidgeClassifier              |   0.401 | 0.502                       |                   0.473 | 0.434                    | 0.382                                     | 0.407      |
| fr         | SVM                          |   0.424 | 0.419                       |                   0.412 | **0.573**                | 0.431                                     | 0.476      |
| fr         | XGBoost                      |   0.338 | 0.427                       |                   0.459 | 0.413                    | 0.422                                     | 0.481      |
| ge         | KNN                          |   0.309 | 0.371                       |                   0.426 | 0.417                    | 0.402                                     | 0.492      |
| ge         | LinearSVM                    |   0.462 | 0.518                       |                   0.528 | 0.514                    | 0.465                                     | 0.503      |
| ge         | LogisticRegression           |   0.459 | 0.535                       |                   0.492 | 0.527                    | 0.450                                     | 0.516      |
| ge         | LogisticRegressionElasticNet |   0.499 | **0.617**                   |                   0.499 | 0.532                    | 0.468                                     | 0.535      |
| ge         | LogisticRegressionLasso      |   0.454 | 0.599                       |                   0.512 | 0.504                    | 0.508                                     | 0.552      |
| ge         | LogisticRegressionRidge      |   0.458 | 0.550                       |                   0.493 | 0.521                    | 0.451                                     | 0.520      |
| ge         | NaiveBayes                   |   0.493 | 0.566                       |                   0.522 | 0.511                    | 0.500                                     | 0.536      |
| ge         | RandomForest                 |   0.523 | 0.562                       |                   0.534 | 0.542                    | 0.549                                     | 0.593      |
| ge         | RidgeClassifier              |   0.48  | 0.549                       |                   0.512 | 0.500                    | 0.575                                     | 0.552      |
| ge         | SVM                          |   0.49  | 0.528                       |                   0.576 | 0.557                    | 0.545                                     | 0.550      |
| ge         | XGBoost                      |   0.436 | 0.555                       |                   0.507 | 0.547                    | 0.550                                     | 0.549      |
| it         | KNN                          |   0.451 | 0.420                       |                   0.496 | 0.490                    | 0.471                                     | 0.560      |
| it         | LinearSVM                    |   0.46  | 0.470                       |                   0.508 | 0.556                    | 0.537                                     | 0.614      |
| it         | LogisticRegression           |   0.479 | 0.463                       |                   0.471 | 0.492                    | 0.549                                     | 0.578      |
| it         | LogisticRegressionElasticNet |   0.465 | 0.463                       |                   0.509 | 0.517                    | 0.668                                     | 0.643      |
| it         | LogisticRegressionLasso      |   0.49  | 0.515                       |                   0.479 | 0.550                    | 0.570                                     | 0.616      |
| it         | LogisticRegressionRidge      |   0.467 | 0.476                       |                   0.52  | 0.532                    | 0.550                                     | 0.626      |
| it         | NaiveBayes                   |   0.487 | 0.496                       |                   0.546 | 0.568                    | 0.602                                     | 0.595      |
| it         | RandomForest                 |   0.473 | 0.493                       |                   0.477 | 0.540                    | 0.532                                     | 0.667      |
| it         | RidgeClassifier              |   0.462 | 0.493                       |                   0.551 | 0.581                    | 0.602                                     | **0.722**  |
| it         | SVM                          |   0.486 | 0.508                       |                   0.575 | 0.637                    | 0.612                                     | 0.631      |
| it         | XGBoost                      |   0.353 | 0.417                       |                   0.448 | 0.453                    | 0.512                                     | 0.527      |
| po         | KNN                          |   0.583 | 0.453                       |                   0.575 | 0.510                    | 0.050                                     | 0.577      |
| po         | LinearSVM                    |   0.549 | 0.520                       |                   0.586 | 0.593                    | 0.670                                     | 0.685      |
| po         | LogisticRegression           |   0.584 | 0.516                       |                   0.536 | 0.571                    | 0.621                                     | 0.645      |
| po         | LogisticRegressionElasticNet |   0.619 | 0.542                       |                   0.602 | 0.586                    | 0.597                                     | 0.667      |
| po         | LogisticRegressionLasso      |   0.614 | 0.614                       |                   0.585 | 0.614                    | 0.601                                     | 0.624      |
| po         | LogisticRegressionRidge      |   0.549 | 0.515                       |                   0.62  | 0.608                    | 0.610                                     | 0.669      |
| po         | NaiveBayes                   |   0.625 | 0.564                       |                   0.567 | 0.584                    | 0.624                                     | 0.658      |
| po         | RandomForest                 |   0.636 | 0.591                       |                   0.57  | 0.540                    | 0.709                                     | **0.784**  |
| po         | RidgeClassifier              |   0.625 | 0.526                       |                   0.554 | 0.657                    | 0.724                                     | 0.713      |
| po         | SVM                          |   0.569 | 0.581                       |                   0.625 | 0.579                    | 0.733                                     | 0.737      |
| po         | XGBoost                      |   0.501 | 0.524                       |                   0.535 | 0.550                    | 0.601                                     | 0.640      |
| ru         | KNN                          |   0.413 | 0.342                       |                   0.298 | 0.362                    | 0.375                                     | 0.317      |
| ru         | LinearSVM                    |   0.392 | 0.410                       |                   0.376 | 0.444                    | 0.452                                     | 0.459      |
| ru         | LogisticRegression           |   0.403 | 0.376                       |                   0.436 | 0.355                    | 0.380                                     | 0.401      |
| ru         | LogisticRegressionElasticNet |   0.457 | 0.459                       |                   0.447 | 0.411                    | 0.368                                     | 0.461      |
| ru         | LogisticRegressionLasso      |   0.539 | 0.331                       |                   0.362 | 0.388                    | 0.358                                     | 0.464      |
| ru         | LogisticRegressionRidge      |   0.39  | 0.455                       |                   0.447 | 0.422                    | 0.446                                     | 0.470      |
| ru         | NaiveBayes                   |   0.358 | 0.421                       |                   0.476 | 0.453                    | 0.405                                     | **0.567**  |
| ru         | RandomForest                 |   0.245 | 0.246                       |                   0.313 | 0.366                    | 0.402                                     | 0.409      |
| ru         | RidgeClassifier              |   0.398 | 0.430                       |                   0.42  | 0.441                    | 0.448                                     | 0.422      |
| ru         | SVM                          |   0.385 | 0.474                       |                   0.498 | 0.497                    | 0.526                                     | 0.555      |
| ru         | XGBoost                      |   0.335 | 0.264                       |                   0.354 | 0.385                    | 0.400                                     | 0.452      |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)


## accuracy

| language   | model_type                   |   title | title and first paragraph   | title and 5 sentences   | title and 10 sentences   |   title and first sentence each paragraph | raw text   |
|:-----------|:-----------------------------|--------:|:----------------------------|:------------------------|:-------------------------|------------------------------------------:|:-----------|
| en         | KNN                          |   0     | **0.126**                   | 0.019                   | 0.058                    |                                     0.097 | 0.107      |
| en         | LinearSVM                    |   0     | 0.058                       | 0.087                   | 0.049                    |                                     0.117 | 0.068      |
| en         | LogisticRegression           |   0.029 | 0.039                       | 0.049                   | 0.019                    |                                     0.068 | 0.068      |
| en         | LogisticRegressionElasticNet |   0     | 0.058                       | 0.058                   | 0.049                    |                                     0.117 | 0.087      |
| en         | LogisticRegressionLasso      |   0.049 | 0.058                       | 0.068                   | 0.068                    |                                     0.058 | 0.068      |
| en         | LogisticRegressionRidge      |   0.058 | 0.117                       | 0.107                   | 0.078                    |                                     0.087 | 0.058      |
| en         | NaiveBayes                   |   0     | 0                           | 0.087                   | 0.107                    |                                     0.097 | 0.068      |
| en         | RandomForest                 |   0     | 0                           | 0.117                   | 0.049                    |                                     0.117 | 0.117      |
| en         | RidgeClassifier              |   0     | 0.117                       | 0.117                   | **0.126**                |                                     0.087 | 0.078      |
| en         | SVM                          |   0     | 0.087                       | 0.097                   | 0.097                    |                                     0.078 | 0.058      |
| en         | XGBoost                      |   0     | 0.049                       | 0.039                   | 0.107                    |                                     0.078 | **0.126**  |
| fr         | KNN                          |   0.048 | 0.048                       | 0.024                   | 0.071                    |                                     0     | 0.048      |
| fr         | LinearSVM                    |   0     | 0.071                       | 0.024                   | 0.048                    |                                     0     | **0.143**  |
| fr         | LogisticRegression           |   0     | 0.048                       | 0.048                   | 0.095                    |                                     0     | 0.095      |
| fr         | LogisticRegressionElasticNet |   0.024 | 0.024                       | 0.024                   | 0.071                    |                                     0.024 | 0.119      |
| fr         | LogisticRegressionLasso      |   0.024 | 0.024                       | 0.024                   | 0.024                    |                                     0.071 | 0.095      |
| fr         | LogisticRegressionRidge      |   0     | 0.071                       | 0.000                   | 0.024                    |                                     0     | 0.024      |
| fr         | NaiveBayes                   |   0.024 | 0.000                       | 0.024                   | 0.048                    |                                     0.048 | 0.071      |
| fr         | RandomForest                 |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.024      |
| fr         | RidgeClassifier              |   0     | 0.048                       | 0.048                   | 0.048                    |                                     0     | 0.071      |
| fr         | SVM                          |   0.024 | 0.000                       | 0.024                   | 0.024                    |                                     0.024 | 0.119      |
| fr         | XGBoost                      |   0.024 | 0.024                       | 0.000                   | 0.024                    |                                     0.024 | 0.024      |
| ge         | KNN                          |   0.029 | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.029      |
| ge         | LinearSVM                    |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.000      |
| ge         | LogisticRegression           |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.000      |
| ge         | LogisticRegressionElasticNet |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0.029 | 0.000      |
| ge         | LogisticRegressionLasso      |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.000      |
| ge         | LogisticRegressionRidge      |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.029      |
| ge         | NaiveBayes                   |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | **0.057**  |
| ge         | RandomForest                 |   0     | 0.000                       | 0.000                   | 0.029                    |                                     0     | 0.029      |
| ge         | RidgeClassifier              |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.029      |
| ge         | SVM                          |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0.029 | 0.029      |
| ge         | XGBoost                      |   0.029 | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.000      |
| it         | KNN                          |   0.017 | 0.067                       | 0.050                   | 0.017                    |                                     0.067 | 0.117      |
| it         | LinearSVM                    |   0.017 | 0.083                       | 0.033                   | 0.067                    |                                     0.05  | 0.083      |
| it         | LogisticRegression           |   0.067 | 0.050                       | 0.050                   | 0.033                    |                                     0.033 | 0.050      |
| it         | LogisticRegressionElasticNet |   0.033 | 0.083                       | 0.050                   | 0.067                    |                                     0.083 | 0.100      |
| it         | LogisticRegressionLasso      |   0.083 | 0.033                       | 0.033                   | 0.050                    |                                     0.067 | 0.050      |
| it         | LogisticRegressionRidge      |   0.033 | 0.083                       | 0.033                   | 0.017                    |                                     0.067 | 0.100      |
| it         | NaiveBayes                   |   0.033 | 0.067                       | 0.050                   | 0.067                    |                                     0.067 | 0.083      |
| it         | RandomForest                 |   0     | 0.000                       | 0.033                   | 0.067                    |                                     0.017 | 0.067      |
| it         | RidgeClassifier              |   0.05  | 0.083                       | 0.067                   | 0.067                    |                                     0.083 | **0.150**  |
| it         | SVM                          |   0.033 | 0.050                       | 0.067                   | 0.067                    |                                     0.083 | 0.083      |
| it         | XGBoost                      |   0     | 0.000                       | 0.033                   | 0.050                    |                                     0     | 0.050      |
| po         | KNN                          |   0     | 0.000                       | 0.000                   | 0.026                    |                                     0.026 | 0.026      |
| po         | LinearSVM                    |   0     | 0.026                       | 0.026                   | 0.026                    |                                     0.026 | **0.051**  |
| po         | LogisticRegression           |   0     | 0.000                       | 0.000                   | 0.026                    |                                     0     | **0.051**  |
| po         | LogisticRegressionElasticNet |   0     | 0.000                       | 0.026                   | 0.026                    |                                     0     | 0.026      |
| po         | LogisticRegressionLasso      |   0     | 0.000                       | 0.000                   | 0.026                    |                                     0     | 0.026      |
| po         | LogisticRegressionRidge      |   0     | 0.000                       | 0.026                   | 0.026                    |                                     0     | 0.026      |
| po         | NaiveBayes                   |   0     | 0.000                       | 0.026                   | 0.026                    |                                     0     | 0.026      |
| po         | RandomForest                 |   0     | 0.000                       | 0.026                   | 0.026                    |                                     0.026 | 0.026      |
| po         | RidgeClassifier              |   0     | 0.000                       | 0.026                   | 0.026                    |                                     0.026 | 0.026      |
| po         | SVM                          |   0     | 0.000                       | **0.051**               | 0.026                    |                                     0.026 | **0.051**  |
| po         | XGBoost                      |   0     | 0.000                       | 0.000                   | 0.000                    |                                     0     | 0.026      |
| ru         | KNN                          |   0     | 0.026                       | 0.079                   | 0.105                    |                                     0.053 | 0.079      |
| ru         | LinearSVM                    |   0     | 0.053                       | 0.053                   | 0.105                    |                                     0.053 | 0.105      |
| ru         | LogisticRegression           |   0     | 0.053                       | 0.053                   | 0.000                    |                                     0.053 | 0.105      |
| ru         | LogisticRegressionElasticNet |   0     | 0.079                       | 0.053                   | 0.079                    |                                     0.079 | **0.158**  |
| ru         | LogisticRegressionLasso      |   0.026 | 0.026                       | 0.053                   | 0.053                    |                                     0.053 | 0.132      |
| ru         | LogisticRegressionRidge      |   0     | 0.079                       | 0.026                   | 0.079                    |                                     0.079 | 0.105      |
| ru         | NaiveBayes                   |   0     | 0.053                       | **0.158**               | 0.132                    |                                     0.053 | 0.079      |
| ru         | RandomForest                 |   0     | 0.000                       | 0.000                   | 0.026                    |                                     0.053 | 0.105      |
| ru         | RidgeClassifier              |   0.026 | 0.079                       | 0.132                   | 0.053                    |                                     0.053 | 0.079      |
| ru         | SVM                          |   0     | 0.053                       | **0.158**               | 0.053                    |                                     0.105 | 0.053      |
| ru         | XGBoost                      |   0.026 | 0.000                       | 0.053                   | 0.105                    |                                     0.132 | **0.158**  |

  multi_language_report_table_metric.reset_index().to_latex(latex_file, index=False)



# Per model sub-type

In [32]:
%%capture
display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype'], output_dir='per_model_subtype_tables')

# Per model sub-type and experiment name

In [33]:
%%capture
display_metrics_and_write_to_file(df=exps_df, grouping_criterion=['model_type','model_subtype', 'model_name'], output_dir='per_exp_name_tables')