# Load Libraries

In [1]:
import pandas as pd
import os

# Change the working directory

In [2]:
import os

# Remember the directory the kernel was in the first time this cell ran.
# A bare relative `os.chdir("../../")` climbs two more levels on every re-run
# of the cell; anchoring on the saved start directory makes the cell
# idempotent while giving the same result on the first run.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))
# Google Colab alternative (kept for reference):
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Define constants

In [3]:
# File-name prefixes of the two kinds of prediction files.
HFAM_PREFIX, LSTM_PREFIX = 'hfam', 'lstm'

# Model variants; each name is combined with a prefix to form a file name.
MODEL_NAMES = [
    'baseline',
    'gpt_corr',
    'prep_bas',
    'prep_gpt',
]

# Labels used to name the columns of the header-less LSTM prediction files.
EMOTIONS = [
    'Joy', 'Trust', 'Anticipation', 'Surprise',
    'Fear', 'Sadness', 'Disgust', 'Anger',
    'Positive', 'Negative', 'Neutral',
]


In [4]:
# Directory the per-model prediction files are read from (relative to the
# current working directory).
PREDICTIONS_PATH = 'predictions/testB/'
# CSV file the final concatenated frame is written to.
OUTPUT_PATH = 'data/testB/concated_for_ensemble_final.csv'

# Load the data

In [5]:
# Read both prediction files of every model variant.
# `dataframes` maps a model name to the pair (HFAM frame, LSTM frame).
dataframes = {}
for model in MODEL_NAMES:
    # The LSTM file is read without a header row, so its columns are named
    # explicitly as "<model>_<emotion>".
    lstm_column_names = [f'{model}_{emotion}' for emotion in EMOTIONS]

    hfam_df = pd.read_csv(
        os.path.join(PREDICTIONS_PATH, f'{HFAM_PREFIX}_{model}.csv')
    )
    lstm_df = pd.read_csv(
        os.path.join(PREDICTIONS_PATH, f'{LSTM_PREFIX}_{model}.tsv'),
        sep='\t',
        header=None,
        names=lstm_column_names,
    )

    dataframes[model] = hfam_df, lstm_df

# Concatenate the data

In [6]:
# Initialise `final_df` as an empty DataFrame (no rows, no columns).
final_df = pd.DataFrame()

In [7]:
# Gather every model's frames first and concatenate them column-wise in a
# single call. This cell no longer depends on the previous contents of
# `final_df`, so re-running it cannot append duplicate columns, and a frame
# with columns but zero rows (for which `DataFrame.empty` is also True) can no
# longer cause previously collected columns to be discarded.
feature_frames = []
for model in MODEL_NAMES:
    hfam_df, lstm_df = dataframes[model]

    # Drop the 'text' column and prefix the remaining HFAM columns with the
    # model name; the LSTM columns already carry that prefix from loading.
    hfam_df = hfam_df.drop(columns=['text']).add_prefix(f'{model}_')

    feature_frames.extend([hfam_df, lstm_df])

# pd.concat raises on an empty list, so fall back to an empty frame
# when there is nothing to combine.
final_df = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()

# Pre-processing

### Perform encoding for "Herbert_label"

In [8]:
# Encode every column whose name contains 'Herbert_label':
# 'LABEL_0' -> 0 and 'LABEL_1' -> 1. Series.map turns any value that is not
# a key of the mapping into NaN.
herbert_label_codes = {'LABEL_0': 0, 'LABEL_1': 1}
herbert_label_cols = [name for name in final_df.columns if 'Herbert_label' in name]
for col in herbert_label_cols:
    final_df[col] = final_df[col].map(herbert_label_codes)

### Perform One-Hot encoding for "XLM-RoBERTa_label"

In [9]:
# Collect the columns whose name contains 'XLM-RoBERTa_label' ...
xlm_roberta_cols = []
for column_name in final_df.columns:
    if 'XLM-RoBERTa_label' in column_name:
        xlm_roberta_cols.append(column_name)

# ... and replace each of them with indicator columns, using the original
# column name as the prefix of the generated columns.
final_df = pd.get_dummies(
    final_df,
    columns=xlm_roberta_cols,
    prefix=xlm_roberta_cols,
)

### Perform encoding for "Multilingual BERT_label"

In [10]:
# Map the star-rating labels onto evenly spaced scores between 0.0 and 1.0.
# Series.map turns any value that is not a key of the mapping into NaN.
star_rating_scores = {
    '1 star': 0.0,
    '2 stars': 0.25,
    '3 stars': 0.5,
    '4 stars': 0.75,
    '5 stars': 1.0,
}
mbert_label_cols = [name for name in final_df.columns if 'Multilingual BERT_label' in name]
for col in mbert_label_cols:
    final_df[col] = final_df[col].map(star_rating_scores)

### Convert boolean values True / False to 1 / 0

In [11]:
# Cast every boolean column to int in one pass (True -> 1, False -> 0);
# columns of any other dtype are left as they are.
bool_columns = final_df.select_dtypes(include=['bool']).columns
final_df = final_df.astype({name: int for name in bool_columns})

### Min-max normalize each column to the range 0 (min) to 1 (max)

In [12]:
# Min-max scale every column: (x - min) / (max - min).
column_min = final_df.min()
column_range = final_df.max() - column_min
# A constant column has a range of 0, and 0 / 0 would turn the whole column
# into NaN. Dividing by 1 instead keeps such a column at 0.0 after scaling.
column_range = column_range.replace(0, 1)
final_df = (final_df - column_min) / column_range

# Save the final dataframe to a CSV file

In [13]:
# Create the destination directory if it is missing so the run does not fail
# at the very last step; `to_csv` itself does not create directories.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# Write the frame without the row index.
final_df.to_csv(OUTPUT_PATH, index=False)

# Print the output table

In [14]:
# Show the first five rows as plain text.
preview_rows = final_df.head(n=5)
print(preview_rows)

   baseline_Herbert_label  baseline_Herbert_score  baseline_XLM-RoBERTa_score  \
0                     0.0                0.980108                    0.585627   
1                     0.0                0.972357                    0.962053   
2                     0.0                0.939091                    0.708443   
3                     0.0                0.813600                    0.759773   
4                     0.0                0.594430                    0.251097   

   baseline_Multilingual BERT_label  baseline_Multilingual BERT_score  \
0                               1.0                          0.276559   
1                               1.0                          0.327586   
2                               1.0                          0.164487   
3                               0.0                          0.240150   
4                               0.0                          0.192480   

   baseline_Joy  baseline_Trust  baseline_Anticipation  baseline_Surprise 

In [15]:
# Show per-column summary statistics as plain text.
summary_stats = final_df.describe()
print(summary_stats)

       baseline_Herbert_label  baseline_Herbert_score  \
count             1431.000000             1431.000000   
mean                 0.004193                0.866725   
std                  0.064639                0.146855   
min                  0.000000                0.000000   
25%                  0.000000                0.853903   
50%                  0.000000                0.928155   
75%                  0.000000                0.958210   
max                  1.000000                1.000000   

       baseline_XLM-RoBERTa_score  baseline_Multilingual BERT_label  \
count                 1431.000000                       1431.000000   
mean                     0.717809                          0.459119   
std                      0.284760                          0.375712   
min                      0.000000                          0.000000   
25%                      0.485921                          0.000000   
50%                      0.853569                          0