In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Merge results

In [2]:
# Base table for the evaluation: every model output below is merged onto this frame via `review_id`.
results = pd.read_csv(filepath_or_buffer="data/processed/test.csv")

In [3]:
# Load the RoBERTa_base output file and attach it to `results`, matching rows on `review_id`.
# No `how=` is given, so pandas' default inner join applies: ids missing from either frame are dropped.
RoBERTa_base = pd.read_csv(filepath_or_buffer="output/RoBERTa_base.csv")
results = results.merge(RoBERTa_base, on="review_id")

In [None]:
# Load the RoBERTa_ft output file and attach it to `results`, matching rows on `review_id`.
# NOTE(review): this cell was saved without an execution count and "RoBERTa_ft" is not in the
# active `models` list — confirm output/RoBERTa_ft.csv exists before a Restart & Run All.
RoBERTa_ft = pd.read_csv(filepath_or_buffer="output/RoBERTa_ft.csv")
results = results.merge(RoBERTa_ft, on="review_id")

In [4]:
# Load the SiEBERT output file and attach it to `results`, matching rows on `review_id`.
# No `how=` is given, so pandas' default inner join applies: ids missing from either frame are dropped.
SiEBERT = pd.read_csv(filepath_or_buffer="output/SiEBERT.csv")
results = results.merge(SiEBERT, on="review_id")

In [None]:
# Load the GPT output file and attach it to `results`, matching rows on `review_id`.
# NOTE(review): this cell was saved without an execution count and "GPT" is not in the
# active `models` list — confirm output/GPT.csv exists before a Restart & Run All.
GPT = pd.read_csv(filepath_or_buffer="output/GPT.csv")
results = results.merge(GPT, on="review_id")

# Compare performances

In [5]:
# Prediction columns of `results` to evaluate; each name is used as a column key below.
# "RoBERTa_ft" and "GPT" are left out for now — their merge cells above were saved
# without an execution count. Add them back once those columns are present in `results`.
models = [
    "RoBERTa_base",
    "SiEBERT",
]

## Average

In [6]:
# Overall accuracy of every model: `results["sentiment"]` is passed as the reference labels
# and the model's own column as the predictions.
accuracies = dict(
    (name, accuracy_score(results["sentiment"], results[name]))
    for name in models
)

# Two-column summary table, one row per model, in the order given by `models`.
accuracy_avg = pd.DataFrame(
    {
        "Model": list(accuracies.keys()),
        "Accuracy": list(accuracies.values()),
    }
)

accuracy_avg

Unnamed: 0,Model,Accuracy
0,RoBERTa_base,0.79532
1,SiEBERT,0.90484


## By ratings / sentiments

In [17]:
# Per-rating accuracy: one row per distinct `rating`, one column per entry in `models`.
# Kept under its own name (previously `accuracy_lth`, copied from the review-length
# analysis) so the review-length table computed later no longer overwrites this result.
accuracy_rating = (
    results.groupby("rating")
    .apply(
        # Returning a Series instead of a dict lets groupby.apply assemble the
        # rating x model DataFrame directly, without a second `.apply(pd.Series)` pass.
        lambda group: pd.Series(
            {model: accuracy_score(group["sentiment"], group[model]) for model in models}
        ),
        # Exclude the grouping column from each group frame handed to the lambda.
        include_groups=False,
    )
    .reset_index()  # turn the `rating` index back into a regular column
)
accuracy_rating

Unnamed: 0,rating,RoBERTa_base,SiEBERT
0,1,0.955396,0.954799
1,2,0.929192,0.926151
2,3,0.896891,0.869736
3,4,0.866793,0.80038
4,7,0.504551,0.810143
5,8,0.618947,0.900351
6,9,0.696246,0.927048
7,10,0.764753,0.953591


## By review length

In [7]:
# Review length, measured as the number of whitespace-separated tokens in `text`.
results["nb_words"] = results["text"].map(lambda text: len(text.split()))

# Quantile-based binning of the lengths into 7 requested bins.
# labels=False stores the integer bin index rather than an interval label;
# duplicates="drop" discards repeated bin edges instead of raising.
results["nb_words_bin"] = pd.qcut(
    results["nb_words"],
    q=7,
    labels=False,
    duplicates="drop",
)

In [18]:
# Accuracy of every model within each review-length bin (`nb_words_bin`):
# one row per bin, one column per entry in `models`.
accuracy_lth = (
    results
    .groupby("nb_words_bin")
    .apply(
        lambda bin_rows: {
            name: accuracy_score(bin_rows["sentiment"], bin_rows[name])
            for name in models
        },
        include_groups=False,
    )
    .apply(pd.Series)  # expand the per-bin dicts into one column per model
    .reset_index()
)

accuracy_lth

Unnamed: 0,nb_words_bin,RoBERTa_base,SiEBERT
0,0,0.890789,0.952565
1,1,0.847671,0.942466
2,2,0.840832,0.93757
3,3,0.811825,0.920125
4,4,0.771372,0.897188
5,5,0.726381,0.865433
6,6,0.675463,0.816957
