In [13]:
import os
import glob

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Frame labels, kept in this exact declaration order (one label per line for readability).
LABELS = (
    'Economic',
    'Capacity_and_resources',
    'Morality',
    'Fairness_and_equality',
    'Legality_Constitutionality_and_jurisprudence',
    'Policy_prescription_and_evaluation',
    'Crime_and_punishment',
    'Security_and_defense',
    'Health_and_safety',
    'Quality_of_life',
    'Cultural_identity',
    'Public_opinion',
    'Political',
    'External_regulation_and_reputation',
)

In [2]:
# Fit a binarizer on a single sample containing every label, so it learns the full label set.
# fit() returns the estimator itself, which allows construction and fitting in one expression.
mlb = MultiLabelBinarizer().fit([LABELS])

# classes_ is the label order the fitted binarizer uses; it differs from the
# declaration order of LABELS (see the displayed array further down).
correct_column_names = mlb.classes_

In [3]:
# Display the labels in their declaration order, for comparison with the binarizer's class order.
LABELS

('Economic',
 'Capacity_and_resources',
 'Morality',
 'Fairness_and_equality',
 'Legality_Constitutionality_and_jurisprudence',
 'Policy_prescription_and_evaluation',
 'Crime_and_punishment',
 'Security_and_defense',
 'Health_and_safety',
 'Quality_of_life',
 'Cultural_identity',
 'Public_opinion',
 'Political',
 'External_regulation_and_reputation')

In [5]:
# Display the label order learned by the fitted MultiLabelBinarizer (mlb.classes_).
correct_column_names

array(['Capacity_and_resources', 'Crime_and_punishment',
       'Cultural_identity', 'Economic',
       'External_regulation_and_reputation', 'Fairness_and_equality',
       'Health_and_safety',
       'Legality_Constitutionality_and_jurisprudence', 'Morality',
       'Policy_prescription_and_evaluation', 'Political',
       'Public_opinion', 'Quality_of_life', 'Security_and_defense'],
      dtype=object)

# Load the predictions with incorrect column names and correct the column names

In [17]:
# Folder the CSVs are read from, and the folder their relabeled copies are written to.
input_dir, output_dir = (
    'evaluations_to_report_wrong_label_naming',
    'evaluations_to_report_correct_label_naming',
)

# Create the destination up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [18]:
# Rewrite the header of every CSV in input_dir and save the result under the same
# file name in output_dir. The renaming is purely positional: the n-th column
# receives the n-th entry of correct_column_names, the cell values are untouched.
wrong_csv_paths = glob.glob(os.path.join(input_dir, '*.csv'))
for wrong_csv_path in wrong_csv_paths:
    # 'id' becomes the index, so only the label columns are renamed.
    df_i = pd.read_csv(wrong_csv_path, index_col='id')
    df_i.columns = correct_column_names
    corrected_csv_path = os.path.join(output_dir, os.path.basename(wrong_csv_path))
    df_i.to_csv(corrected_csv_path)

In [15]:
# df_i is the variable assigned inside the renaming loop, so after the loop it
# holds only the frame of the last file processed; display it as a spot check.
df_i

Unnamed: 0_level_0,Capacity_and_resources,Crime_and_punishment,Cultural_identity,Economic,External_regulation_and_reputation,Fairness_and_equality,Health_and_safety,Legality_Constitutionality_and_jurisprudence,Morality,Policy_prescription_and_evaluation,Political,Public_opinion,Quality_of_life,Security_and_defense
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3327,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3345,0,0,0,0,1,0,0,0,1,0,0,0,0,1
3314,1,0,0,1,0,0,0,0,0,0,0,0,0,0
3337,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3348,1,0,0,1,1,0,0,0,0,0,1,0,0,0
3317,1,0,0,1,1,0,0,0,0,0,1,0,0,1
3350,1,0,0,1,0,0,1,0,0,1,0,0,1,0
3335,0,0,0,0,0,0,0,0,0,0,0,0,0,0
336,0,0,0,0,0,0,0,0,0,0,1,0,0,0
337,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Load one prediction file straight from 'evaluations_to_report' and display it,
# so its header can be compared with the relabeled frame shown above.
pd.read_csv(
    os.path.join('evaluations_to_report', 'en_ComplementNaiveBayes_ROS_y_test.csv'),
    index_col='id',
)

Unnamed: 0_level_0,Economic,Capacity_and_resources,Morality,Fairness_and_equality,Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation,Crime_and_punishment,Security_and_defense,Health_and_safety,Quality_of_life,Cultural_identity,Public_opinion,Political,External_regulation_and_reputation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
313,0,0,0,0,1,0,0,0,0,0,1,0,0,0
319,0,0,0,0,0,1,0,0,1,0,1,0,0,0
3118,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3124,0,0,0,0,0,0,1,0,0,0,0,0,1,1
3130,0,0,0,0,1,0,0,0,0,0,1,0,0,1
3110,0,0,0,0,1,0,0,0,0,0,1,0,0,1
318,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3112,0,0,0,0,1,0,0,0,0,0,1,0,0,1
316,0,1,0,0,0,0,0,0,0,0,0,0,0,1
315,0,0,0,0,1,0,0,0,0,0,1,0,0,1


# Shape into the desired submission format and store the correctly named predictions

In [19]:
# Folder with the relabeled prediction CSVs, and the folder for the submission files.
# NOTE: this rebinds output_dir, which the relabeling cells above also use; re-run
# the earlier directory cell before re-running the relabeling loop.
submission_dir, output_dir = (
    './evaluations_to_report_correct_label_naming',
    './correct_format_submissions_correct_label_naming',
)

# Create the destination up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [20]:
# Convert each relabeled prediction CSV into a tab-separated submission file:
# one line per id, followed by the comma-joined names of the columns whose value is 1.
#
# Only '*.csv' entries are processed (matching the relabeling loop above), so stray
# files or folders inside submission_dir (e.g. .DS_Store, .ipynb_checkpoints) no
# longer make pd.read_csv fail. The iteration order of os.listdir is preserved.
csv_filenames = [name for name in os.listdir(submission_dir) if name.endswith('.csv')]
for prediction_filepath in csv_filenames:
    # The part of the file name before the first underscore is used as the language code.
    language = prediction_filepath.split('_')[0]
    eval_df = pd.read_csv(os.path.join(submission_dir, prediction_filepath), index_col='id')
    # For every row, keep the names of the columns equal to 1 and join them with commas.
    submission_df = eval_df.apply(lambda row: ','.join(list(row[row==1].index)), axis=1)
    # NOTE: the output name depends only on the language code, so two prediction
    # files sharing a language prefix would overwrite each other's submission.
    submission_df.to_csv(os.path.join(output_dir, f'{language}_submission.txt'), sep='\t', header=False)
    print(language)
    print(prediction_filepath)

fr
fr_ComplementNaiveBayes_ROS_y_test.csv
en
en_ComplementNaiveBayes_ROS_y_test.csv
ru
ru_ComplementNaiveBayes_SMOTE_y_test.csv
it
it_ComplementNaiveBayes_ROS_y_test.csv
po
po_LinearSVMDual_y_test.csv
ge
ge_ComplementNaiveBayes_SMOTE_y_test.csv
