In [1]:
from sklearn.metrics import classification_report
import pandas as pd
import json as js
from ast import literal_eval
from pprint import pprint
from copy import deepcopy
from sklearn.metrics import f1_score

In [2]:
with open('../utils/label2id.json', 'r') as f:
    label2id = js.load(f)

## Zero-Shot-Results

In [8]:
def filter_labels(label_list, threshold):
    label_list = [x[0] for x in label_list if x[1]>threshold]
    return label_list

def replace_wrong_values(zero_shot_label_list):
    zero_shot_label_list = [x if x != 'Grundkompetenzen_Illetrismus' else 'Europapolitik' for x in zero_shot_label_list]
    return zero_shot_label_list

def get_one_hot_encoding(zero_shot_label_list):
    one_hot_affair_topic_codes = []
    for label in sorted(list(label2id.keys())):
        if label in zero_shot_label_list:
            one_hot_affair_topic_codes.append(1)
        else:
            one_hot_affair_topic_codes.append(0)
    return list(one_hot_affair_topic_codes)

def get_macro_f1_for_zeroshot_results(df, column_name, average=None):
    df_copy = deepcopy(df)
    df_copy['one_hot_representation'] = df[column_name].apply(get_one_hot_encoding)
    # multilabel classification
    y_true = df_copy['one_hot_affair_topic_codes'].tolist()
    y_pred = df_copy['one_hot_representation'].tolist()
    # print(y_pred)
    _score = f1_score(y_true, y_pred, average=average)
    return _score
    
    

In [10]:
path_to_results_of_multilingual_model = '/Users/vetonmatoshi/Documents/POLITmonitor/results_of_experiments/zero_shot_results/MoritzLaurer:mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/results_zero_shot.jsonl'
path_to_results_of_german_model = '/Users/vetonmatoshi/Documents/POLITmonitor/results_of_experiments/zero_shot_results/Sahajtomar_German_Zeroshot/results_zero_shot_results_only_German_Sahajtomar_German_Zeroshot.jsonl'
zero_shot_results = pd.read_json(path_to_results_of_german_model, lines=True)
print('Number of predictions for the German model: ', zero_shot_results.shape)
zero_shot_results = pd.read_json(path_to_results_of_multilingual_model, lines=True)
print('Number of predictions for the multilingual model: ', zero_shot_results.shape)


Number of predictions for the German model:  (19123, 111)
Number of predictions for the multilingual model:  (40830, 111)


In [23]:
zero_shot_results = pd.read_json(path_to_results_of_multilingual_model, lines=True)
for th in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    column_name = 'results_with_threshold_'+str(th)
    zero_shot_results[column_name]=zero_shot_results.zero_shot_results.apply(lambda x: filter_labels(x,th))
    zero_shot_results[column_name] = zero_shot_results[column_name].apply(replace_wrong_values)
    print(th,': ',get_macro_f1_for_zeroshot_results(zero_shot_results, column_name, 'macro'))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.1 :  0.05942670543689178


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.2 :  0.06282522303841867


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.3 :  0.06541019946248382


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.4 :  0.06743364624963855


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.5 :  0.06946874022821714


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.6 :  0.07111248264459161


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.7 :  0.07259090284674684


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.8 :  0.07352112514399867
0.9 :  0.0726438994310175


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [24]:
for item in zero_shot_results.to_dict(orient='records'):
    if item['language']=='de' and item['affair_attachment_category']=='Text 1' and (len(item['text'])>50 and len(item['text'])<300):
        y_pred = set(item['results_with_threshold_0.5'])
        y_true = set(item['affair_topic_codes_as_labels'])
        if len(y_pred.intersection(y_true))>0:
            if 'Gesundheit' in y_true and 'Gesundheit' not in y_pred:
                print('True values',':',y_true)
                print('Predicted values',':',y_pred)
                print('----------------------------')
                print(item['text'])
                print('\n#########################\n')
        

True values : {'Beschäftigung und Arbeit', 'Finanzwesen', 'Gesundheit'}
Predicted values : {'Europapolitik', 'Finanzwesen', 'Wissenschaft / Forschung', 'Menschenrechte', 'Bildung', 'Sozialer Schutz', 'Strafrecht', 'Steuer', 'Parlament', 'Politischer Rahmen', 'Staatspolitik', 'Soziale Fragen', 'Internationale Politik', 'Zivilrecht', 'Gerichtswesen'}
----------------------------
Der Bundesrat wird beauftragt, eine Senkung des Personalbestands auf 35 000 Vollzeitäquivalente sowie eine Senkung der Bundespersonalausgaben auf 5 Milliarden Franken bis spätestens in 4 Jahren umzusetzen.

#########################

True values : {'Beschäftigung und Arbeit', 'Gesundheit'}
Predicted values : {'Wirtschaft', 'Recht', 'Beschäftigung und Arbeit', 'Raumplanung / Wohnungswesen', 'Sozialer Schutz', 'Parlament', 'Politischer Rahmen', 'Staatspolitik'}
----------------------------
Der Bundesrat wird beauftragt, unnötige Administrativarbeiten der Unternehmen für die AHV abzuschaffen (Art. 136 AHVV).

######

In [26]:
# Evaluation per metric
target_names = sorted(list(label2id.keys()))
y_true_one_hot_encoding = zero_shot_results.one_hot_affair_topic_codes.tolist()
zero_shot_results["y_pred_one_hot_encoding"] = zero_shot_results['results_with_threshold_'+str(0.5)].apply(get_one_hot_encoding)
y_pred_one_hot_encoding = zero_shot_results.y_pred_one_hot_encoding.tolist()
# print(y_true_one_hot_encoding)
print(classification_report(y_true_one_hot_encoding, y_pred_one_hot_encoding, target_names=target_names, output_dict=False))

                                     precision    recall  f1-score   support

                              Alter       0.00      0.00      0.00       352
                              Armut       0.00      0.00      0.00        60
           Beschäftigung und Arbeit       0.11      0.21      0.14      1711
                            Bildung       0.12      0.29      0.17      1232
                    Digitalisierung       0.00      0.00      0.00       903
                           Diverses       0.00      0.00      0.00         0
                            Energie       0.28      0.27      0.27      1887
                      Europapolitik       0.04      0.30      0.07      1213
                           Familie        0.00      0.00      0.00       579
                        Finanzwesen       0.30      0.22      0.26      4764
                          Forschung       0.00      0.00      0.00       937
                         Gesundheit       0.32      0.23      0.27      619

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
df.head()

NameError: name 'df' is not defined

## Making a report about the performance per label

In [1]:
# Reading the data
prediction_results = pd.read_json('../data/xlm-roberta-base_learning_rate_0.0003batch_size_16weight_decay_0.0--fp16/test_predictions_detailed.json')

with open('../utils/label2id.json', 'r') as f:
    label2id = js.load(f)

del label2id['Diverses']

# One hot encoded predictions
y_pred = prediction_results.predictions.tolist()

# One hot encoded reference labels
y_true = prediction_results.reference.tolist()

# Names of the labels spelled out
target_names = sorted(list(label2id.keys()))

NameError: name 'pd' is not defined

In [4]:
prediction_results = pd.read_json('../data/xlm-roberta-base_learning_rate_0.0003batch_size_16weight_decay_0.0--fp16/test_predictions_detailed.json')

In [5]:
prediction_results.reference.tolist()

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
