# Load Libraries

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline



# Change the directory

In [2]:
import os
# Move the working directory three levels up so the relative paths used below
# ('data/train/', 'predictions/train/') resolve from there.
# The target is resolved to an absolute path only once per kernel session:
# a bare relative chdir would climb three MORE levels every time this cell is
# re-run, silently breaking every path that follows.
if "WORKING_DIR" not in globals():
    WORKING_DIR = os.path.abspath("../../../")
os.chdir(WORKING_DIR)
# Google Colab alternative (kept for reference, not executed):
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Define constants

In [3]:
# Input files, relative to the current working directory.
# The directory keeps its trailing slash because full paths are built by plain
# string concatenation (INPUT_DIR + <filename>).
INPUT_DIR = "data/train/"
# Tab-separated file holding the texts to classify (read through its 'text' column).
IN_FILENAME = "in_prep_gpt.tsv"
# Tab-separated file whose i-th row is attached to the i-th text as its expected label.
EXPECTED_FILENAME = "expected.tsv"

In [4]:
# Output location for the combined predictions table, relative to the current
# working directory. The trailing slash matters: the target path is built as
# PREDICTIONS_DIR + RESULTS_FILENAME.
PREDICTIONS_DIR = "predictions/train/"
# Name of the CSV file the results DataFrame is written to.
RESULTS_FILENAME = "hfam_prep_gpt.csv"

# Define the data-loading helper

In [5]:
def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Args:
        file_path: Location of the TSV file; passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame parsed with a tab as the field separator and pandas'
        defaults otherwise (so the first line is taken as the header row).
    """
    return pd.read_csv(file_path, sep='\t')

# Define Hugging Face models and tokenizers

In [6]:
# Display name -> Hugging Face checkpoint identifier.
# The display names are reused as keys for the tokenizers, pipelines and
# predictions, and become the column prefixes of the results table
# ("<name>_label", "<name>_score").
# NOTE: judging by the recorded sample output, the three checkpoints emit
# different label vocabularies ("LABEL_0", "positive"/"negative"/"neutral",
# "N stars"), so their labels are not directly comparable.
models = dict(
    [
        ("Herbert", "dkleczek/Polish-Hate-Speech-Detection-Herbert-Large"),
        ("XLM-RoBERTa", "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"),
        ("Multilingual BERT", "nlptown/bert-base-multilingual-uncased-sentiment"),
    ]
)

In [7]:
# One tokenizer per checkpoint, keyed by the same display names as `models`.
# `map` is consumed lazily by `zip`, so the tokenizers are still loaded one
# after another in the dictionary's insertion order.
tokenizers = dict(
    zip(models.keys(), map(AutoTokenizer.from_pretrained, models.values()))
)

# Create pipelines for text classification

In [8]:
# Wrap every checkpoint in a ready-to-call text-classification pipeline,
# pairing each freshly loaded sequence-classification model with the tokenizer
# already prepared under the same display name.
model_pipelines = {
    display_name: pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=tokenizers[display_name],
    )
    for display_name, checkpoint in models.items()
}

# Load & preprocess the data

In [9]:
# Texts to classify. The 'text' column is coerced to `str` so that every entry
# later handed to the classifiers is a plain string.
in_data = load_data(INPUT_DIR + IN_FILENAME).assign(
    text=lambda frame: frame['text'].astype(str)
)
# Expected labels; row i is attached to text i when the results are assembled.
expected_data = load_data(INPUT_DIR + EXPECTED_FILENAME)

# Inputs and labels are paired purely by row position further down, so fail
# fast here (before the expensive model inference) if the files do not line up,
# instead of hitting an IndexError or silently misaligned labels later.
if len(in_data) != len(expected_data):
    raise ValueError(
        f"Row count mismatch: {INPUT_DIR + IN_FILENAME} has {len(in_data)} rows, "
        f"but {INPUT_DIR + EXPECTED_FILENAME} has {len(expected_data)} rows."
    )

# Predict using each model

In [10]:
# Materialise the input texts once; the same list is fed to every model.
texts = in_data['text'].tolist()

# Display name -> list of prediction dicts, one per input text, in input order.
predictions = {}
for model_name, model_pipeline in model_pipelines.items():
    # truncation=True clips any text that exceeds the model's maximum sequence
    # length; without it a single over-long input aborts the whole run.
    predictions[model_name] = model_pipeline(texts, truncation=True)

# Process predictions

In [11]:
# Build one flat record per input text: the text itself, then each model's
# predicted label and score, and finally the expected-label row at the same
# position.
results = []
for position, text in enumerate(in_data['text']):
    record = {"text": text}
    for model_name, per_text_outputs in predictions.items():
        outcome = per_text_outputs[position]
        record[f"{model_name}_label"] = outcome['label']
        record[f"{model_name}_score"] = outcome['score']
    # The whole expected row is stored as a plain Python list.
    expected_row = expected_data.iloc[position]
    record["expected_label"] = expected_row.values.tolist()
    results.append(record)

# Convert results to a DataFrame and save them to CSV

In [12]:
# One row per input text, columns in the order the records were filled in.
results_df = pd.DataFrame(results)

# Make sure the output directory exists: to_csv does not create missing parent
# directories and would otherwise fail after all the inference work is done.
os.makedirs(PREDICTIONS_DIR, exist_ok=True)
results_df.to_csv(PREDICTIONS_DIR + RESULTS_FILENAME, index=False)

# Preview the results

In [13]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of the wrapped plain-text dump produced by print().
results_df.head()

                                                text Herbert_label  \
0                                      dwie gwiazdki       LABEL_0   
1            ten hotel zasługuje na maksymalnie tyle       LABEL_0   
2     to że hotel ma 4 gwiazdki w nazwie jest smutne       LABEL_0   
3  śmiesznie było już podczas rezerwacji pani z r...       LABEL_0   
4  udało mi się jednak dowiedzieć że w hotelu są ...       LABEL_0   

   Herbert_score XLM-RoBERTa_label  XLM-RoBERTa_score Multilingual BERT_label  \
0       0.787984          positive           0.584631                 2 stars   
1       0.853437          positive           0.977473                 5 stars   
2       0.811031          negative           0.982163                 4 stars   
3       0.913569          negative           0.955288                 2 stars   
4       0.978560           neutral           0.866278                 3 stars   

   Multilingual BERT_score                                     expected_label  
0           