# Load Libraries

In [1]:
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline



# Change the directory

In [2]:
import os

# Move the working directory three levels up (presumably the project root —
# confirm) so the relative data/prediction paths used further down resolve.
# The directory the kernel started in is remembered on the first run, which
# makes this cell idempotent: a bare relative os.chdir("../../../") would climb
# three MORE levels every time the cell is re-executed.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "../../../"))

# Google Colab alternative: mount Drive and switch to the project folder.
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Define constants

In [3]:
# Where the training split lives and which TSV files are read from it.
# Paths are built by plain string concatenation, so the directory keeps its
# trailing slash.
INPUT_DIR = "data/train/"
IN_FILENAME = "in_baseline.tsv"      # input file; its 'text' column is classified
EXPECTED_FILENAME = "expected.tsv"   # expected labels for the same rows

In [4]:
# Destination of the prediction table. The directory keeps its trailing slash
# because the full path is formed by concatenating it with the file name.
PREDICTIONS_DIR = "predictions/train/"
RESULTS_FILENAME = "phsd_baseline.csv"

# Define the data-loading function

In [5]:
def load_data(file_path: str) -> pd.DataFrame:
    """Read a tab-separated file into a DataFrame.

    Args:
        file_path: Path of the TSV file to read. The first line is used as the
            header row (pandas default).

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, sep="\t")

# Define Hugging Face model and tokenizer

In [6]:
# Hugging Face Hub identifier shared by the tokenizer and the model. Keeping it
# in one place guarantees both are loaded from the same checkpoint.
MODEL_NAME = "dkleczek/Polish-Hate-Speech-Detection-Herbert-Large"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Create a pipeline for text classification

In [7]:
# Wrap the tokenizer and model in a single text-classification callable; its
# results are read below as dicts with 'label' and 'score' entries.
pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Load the data

In [8]:
# Read the input texts and the expected labels from the training directory.
in_data = load_data(INPUT_DIR + IN_FILENAME)
expected_data = load_data(INPUT_DIR + EXPECTED_FILENAME)

# Fail fast, before model inference is run, if the inputs are unusable:
# the prediction step reads the 'text' column, and the two files are later
# combined row by row, so their row counts must agree.
assert 'text' in in_data.columns, "input file is missing the 'text' column"
assert len(in_data) == len(expected_data), (
    f"Row count mismatch: {len(in_data)} input rows vs "
    f"{len(expected_data)} expected rows"
)

# Use the pipeline to predict on the input data

In [9]:
# Classify every input text. truncation=True is forwarded to the tokenizer so
# that a text longer than the tokenizer's configured maximum length is cut off
# rather than making the model's forward pass fail on an over-long sequence.
predictions = pipe(in_data['text'].tolist(), truncation=True)

# Convert predictions to desired format

In [10]:
# Split the list of prediction dicts into two parallel lists, one holding the
# predicted label of each text and one holding the matching score.
pred_labels, pred_scores = [], []
for prediction in predictions:
    pred_labels.append(prediction['label'])
    pred_scores.append(prediction['score'])

# Combine predictions and expected data for comparison or further analysis

In [11]:
# One row per input text: the text, the predicted label and score, and the
# complete row of expected values packed into a single list.
expected_rows = expected_data.values.tolist()
results = pd.DataFrame(
    dict(
        text=in_data['text'],
        pred_label=pred_labels,
        pred_score=pred_scores,
        expected_label=expected_rows,
    )
)

# Save the results

In [12]:
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (a no-op when it is already there).
os.makedirs(PREDICTIONS_DIR, exist_ok=True)
results.to_csv(PREDICTIONS_DIR + RESULTS_FILENAME, index=False)

In [13]:
# Preview the first rows. A bare last expression lets the notebook render the
# DataFrame with its rich display instead of wrapped plain text from print().
results.head()

                                                text pred_label  pred_score  \
0                                        2 gwiazdki.    LABEL_0    0.940217   
1           Na tyle maksymalnie zasługuje ten hotel.    LABEL_0    0.967163   
2  To, ze hotel ma 4 gwiazdki w nazwie jest żałosne.    LABEL_0    0.874301   
3  Śmiesznie było już podczas rezerwacji, pani z ...    LABEL_0    0.934140   
4  Udało mi się jednak dowiedzieć, ze w hotelu są...    LABEL_0    0.963341   

                                      expected_label  
0  [False, False, False, True, False, True, False...  
1  [False, False, False, False, False, True, True...  
2  [False, False, False, False, False, False, Tru...  
3  [False, False, False, False, False, True, True...  
4  [True, False, False, False, False, False, Fals...  
