In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.metrics import accuracy_score

# Merge results

In [None]:
# Base table for the evaluation: the processed test set. Every prediction file
# loaded in the following cells is merged onto this frame by `review_id`.
# NOTE(review): later cells read the `sentiment`, `rating` and `text` columns
# from `results`; presumably they come from this file - confirm.
results = pd.read_csv("data/processed/test.csv")

In [None]:
# Attach the RoBERTa base predictions, matching rows on `review_id`.
# The join is pandas' default inner join, so reviews without a prediction in
# this file are dropped from `results`.
# NOTE: this cell reassigns `results`; re-running it without restarting the
# kernel merges the same columns a second time (pandas then adds _x/_y suffixes).
RoBERTa_base = pd.read_csv("output/RoBERTa_base.csv")
results = results.merge(RoBERTa_base, on="review_id")

In [None]:
# Attach the predictions stored in output/RoBERTa_ft_cls.csv, matching rows on
# `review_id` (default inner join: unmatched reviews are dropped).
# NOTE: this cell reassigns `results`, so it is not safe to re-run in isolation.
RoBERTa_ft = pd.read_csv("output/RoBERTa_ft_cls.csv")
results = results.merge(RoBERTa_ft, on="review_id")

In [None]:
# Attach the SiEBERT predictions, matching rows on `review_id`
# (default inner join: unmatched reviews are dropped).
# NOTE: this cell reassigns `results`, so it is not safe to re-run in isolation.
SiEBERT = pd.read_csv("output/SiEBERT.csv")
results = results.merge(SiEBERT, on="review_id")

In [None]:
# Attach the GPT predictions, matching rows on `review_id`
# (default inner join: unmatched reviews are dropped).
# NOTE: this cell reassigns `results`, so it is not safe to re-run in isolation.
GPT = pd.read_csv("output/GPT.csv")
results = results.merge(GPT, on="review_id")

# Compare performances

In [None]:
# Names of the prediction columns in `results` that are scored below; each
# entry is used as a column key (`results[model]`) in the accuracy cells.
# NOTE(review): the GPT predictions are merged into `results` above but "GPT"
# is not listed here, so GPT is left out of every comparison - confirm this is
# intentional.
models = ["RoBERTa_base", "SiEBERT", "RoBERTa_ft"]

## Average

In [None]:
# Overall accuracy of each model's column compared with the `sentiment` column,
# collected as {model name: accuracy}.
accuracies = dict(
    (name, accuracy_score(results["sentiment"], results[name])) for name in models
)

# Two-column summary table (one row per model), displayed without the index.
accuracy_avg = pd.DataFrame(list(accuracies.items()), columns=["Model", "Accuracy"])
accuracy_avg.style.hide(axis="index")

## By ratings / sentiments

In [None]:
# Accuracy per rating value: every rating group yields a {model: accuracy}
# dict, which is then expanded into one column per model. The result has one
# row per rating, ordered by rating, with `rating` restored as a column.
accuracy_rtg = (
    results.groupby("rating")
    .apply(
        lambda grp: {name: accuracy_score(grp["sentiment"], grp[name]) for name in models},
        include_groups=False,
    )
    .apply(pd.Series)
    .sort_values("rating")
    .reset_index()
)
accuracy_rtg.style.hide(axis="index")

In [None]:
# Long format: one row per (rating, model) pair, which is what the per-model
# plotting loop below iterates over.
accuracy_long = accuracy_rtg.melt(id_vars="rating", var_name="Model", value_name="Accuracy")

# Each curve is drawn as two solid segments (ratings <= LOW_RATING_MAX and
# ratings >= HIGH_RATING_MIN) joined by a faint dashed line that marks the gap
# between them. Ratings strictly between the two thresholds are not plotted.
LOW_RATING_MAX = 4
HIGH_RATING_MIN = 7

fig, ax = plt.subplots(figsize=(10, 6))

# One fixed tab10 colour per model so both segments and the connector match.
model_names = accuracy_long["Model"].unique()
colors = {model: plt.cm.tab10(i) for i, model in enumerate(model_names)}

for model in model_names:
    # Sort explicitly so that .iloc[-1] / .iloc[0] below really are the two
    # points adjacent to the gap, whatever the row order of `accuracy_rtg`.
    model_data = accuracy_long[accuracy_long["Model"] == model].sort_values("rating")
    color = colors[model]

    lower_ratings = model_data[model_data["rating"] <= LOW_RATING_MAX]
    higher_ratings = model_data[model_data["rating"] >= HIGH_RATING_MIN]

    # Exactly one legend entry per model: label the low segment when it has
    # data, otherwise fall back to the high segment so that a model with only
    # high-rating points still appears in the legend.
    label_on_lower = len(lower_ratings) > 0
    label_on_higher = (not label_on_lower) and len(higher_ratings) > 0

    ax.plot(lower_ratings["rating"], lower_ratings["Accuracy"], marker="o", linestyle="-",
            color=color, alpha=0.7, label=model if label_on_lower else None)
    ax.plot(higher_ratings["rating"], higher_ratings["Accuracy"], marker="o", linestyle="-",
            color=color, alpha=0.7, label=model if label_on_higher else None)

    # Dashed connector across the gap, only when both segments exist.
    if len(lower_ratings) > 0 and len(higher_ratings) > 0:
        ax.plot([lower_ratings["rating"].iloc[-1], higher_ratings["rating"].iloc[0]],
                [lower_ratings["Accuracy"].iloc[-1], higher_ratings["Accuracy"].iloc[0]],
                linestyle="--", alpha=0.4, color=color)

# De-duplicate legend entries by label (keeps one handle per model).
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys())

ax.set_xlabel("Rating")
ax.set_ylabel("Accuracy")
ax.grid(True, alpha=0.3)

fig.savefig("output/accuracy_vs_ratings.png", dpi=300, bbox_inches='tight')
plt.show()

## By review length

In [None]:
# Review length measured as the number of whitespace-separated tokens.
results['nb_words'] = results['text'].map(lambda review: len(review.split()))

# Bucket reviews into (up to) 10 quantile bins of word count; duplicate bin
# edges are dropped, so fewer than 10 bins may result.
bins = pd.qcut(results["nb_words"], q=10, duplicates="drop")

# Label every review with the right (upper) edge of the bin it falls into.
upper_bounds = bins.cat.categories.right.to_numpy()
results["max_words"] = upper_bounds[bins.cat.codes.to_numpy()]

In [None]:
# Accuracy per review-length bin: every `max_words` group yields a
# {model: accuracy} dict, which is then expanded into one column per model.
# `max_words` is restored as a regular column at the end.
accuracy_lth = (
    results.groupby("max_words")
    .apply(
        lambda grp: {name: accuracy_score(grp["sentiment"], grp[name]) for name in models},
        include_groups=False,
    )
    .apply(pd.Series)
    .reset_index()
)
accuracy_lth.style.hide(axis="index")

In [None]:
# Long format (one row per bin/model pair), with the bin upper bound cast to
# float and the rows ordered by it so the lines are drawn left to right.
accuracy_long = (
    accuracy_lth.melt(id_vars="max_words", var_name="Model", value_name="Accuracy")
    .assign(max_words=lambda frame: frame["max_words"].astype(float))
    .sort_values("max_words")
)

# One line per model: accuracy against the upper word-count bound of each bin.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=accuracy_long,
    x="max_words",
    y="Accuracy",
    hue="Model",
    marker="o",
    alpha=0.7,
    ax=ax,
)
ax.set_xlabel("Review length")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid(True, alpha=0.3)

fig.savefig("output/accuracy_vs_wordcount.png", dpi=300, bbox_inches='tight')
plt.show()