# Load Libraries

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline



# Change the directory

In [2]:
import os

# Remember the directory the kernel started in so that this cell is idempotent:
# a bare relative os.chdir("../../../") climbs three more levels on every re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
# Move three levels up from the starting directory; the relative data/ and
# predictions/ paths used later are resolved against this working directory.
os.chdir(os.path.join(NOTEBOOK_DIR, "../../../"))
# Google Colab alternative (disabled, kept for reference):
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Define constants

In [3]:
# Directory (relative to the working directory) that holds the input file.
INPUT_DIR = "data/testB/"
# Name of the tab-separated input file inside INPUT_DIR.
IN_FILENAME = "in_gpt_corr.tsv"

In [4]:
# Directory (relative to the working directory) where predictions are written.
PREDICTIONS_DIR = "predictions/testB/"
# Name of the CSV file that receives the per-model labels and scores.
RESULTS_FILENAME = "hfam_gpt_corr.csv"

# Define the data-loading helper

In [5]:
def load_data(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with a tab separator.
    """
    return pd.read_csv(file_path, sep="\t")

# Define Hugging Face models and tokenizers

In [6]:
# Display name -> Hugging Face Hub checkpoint id. The display names are reused
# as prefixes of the result columns, and the insertion order fixes column order.
models = dict(
    [
        ("Herbert", "dkleczek/Polish-Hate-Speech-Detection-Herbert-Large"),
        ("XLM-RoBERTa", "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"),
        ("Multilingual BERT", "nlptown/bert-base-multilingual-uncased-sentiment"),
    ]
)

In [7]:
# One tokenizer per entry in `models`, keyed by the same display name.
tokenizers = {
    label: AutoTokenizer.from_pretrained(models[label])
    for label in models
}

# Create pipelines for text classification

In [8]:
# One "text-classification" pipeline per entry in `models`, pairing each
# sequence-classification model with the tokenizer loaded for the same name.
model_pipelines = {
    label: pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(models[label]),
        tokenizer=tokenizers[label],
    )
    for label in models
}

# Load the data

In [9]:
# Read the input TSV; the path is the directory constant followed by the file name.
in_data = load_data(f"{INPUT_DIR}{IN_FILENAME}")

# Predict using each model

In [10]:
# Run every pipeline over the whole "text" column and keep the raw outputs
# keyed by model display name.
# The list conversion is loop-invariant, so it is done once up front.
texts = in_data['text'].tolist()
predictions = {}
for model_name, model_pipeline in model_pipelines.items():
    # truncation=True is forwarded to the tokenizer so that a text longer than
    # the model's maximum sequence length is cut down instead of aborting the run.
    predictions[model_name] = model_pipeline(texts, truncation=True)

# Process predictions

In [11]:
# Build one flat record per input text: the text itself, followed by
# "<model>_label" and "<model>_score" for every model, in `predictions` order.
results = []
for row_idx, sentence in enumerate(in_data['text']):
    record = {"text": sentence}
    for name, outputs in predictions.items():
        prediction = outputs[row_idx]
        record.update(
            {
                f"{name}_label": prediction['label'],
                f"{name}_score": prediction['score'],
            }
        )
    results.append(record)

# Convert results to a DataFrame and save them

In [12]:
# Turn the list of per-text records into a table (one row per input text).
results_df = pd.DataFrame(results)
# to_csv does not create missing directories, so make sure the target folder
# exists before writing; exist_ok keeps re-runs from failing.
os.makedirs(PREDICTIONS_DIR, exist_ok=True)
# index=False leaves the DataFrame's row index out of the file.
results_df.to_csv(PREDICTIONS_DIR + RESULTS_FILENAME, index=False)

# Preview the results

In [13]:
# A bare last expression lets Jupyter render the preview with its rich table
# display, instead of the wrapped plain text that print() produces.
results_df.head()

                                                text Herbert_label  \
0       Przez 12 lat leczyła mnie na złe schorzenie.       LABEL_0   
1  Teraz jestem leczony za właściwą dolegliwość, ...       LABEL_0   
2  Poza tym każda wizyta, nawet po 10 latach lecz...       LABEL_0   
3  Dodatkowo pani doktor nie zechciała mnie równi...       LABEL_0   
4                 Proszę podać treść do poprawienia.       LABEL_0   

   Herbert_score XLM-RoBERTa_label  XLM-RoBERTa_score Multilingual BERT_label  \
0       0.966462          negative           0.949951                 5 stars   
1       0.963730          positive           0.962640                 5 stars   
2       0.961262          negative           0.843864                 5 stars   
3       0.922081          negative           0.934144                 4 stars   
4       0.959310           neutral           0.847416                 4 stars   

   Multilingual BERT_score  
0                 0.347590  
1                 0.501320  
2    