# Comparando palavras estrangeiras

Este script compara as palavras estrangeiras encontradas nos textos revisados manualmente com as palavras previstas por um modelo de anotação sintática, calculando precisão, revocação, F1 e acurácia para cada documento.

In [13]:
import pandas as pd


In [14]:
# Manually reviewed foreign words (reference) and the model's predictions.
# Both files are read relative to the current working directory.
ground_truth = pd.read_csv("foreign-words.csv")
predicted = pd.read_csv("foreign_words_predicted.csv")

# Every document name that appears in either file.
all_names = set().union(
    ground_truth["name"].tolist(),
    predicted["name"].tolist(),
)

In [16]:
# Per-document evaluation of the predicted foreign words against the manually
# reviewed ones. Words are lower-cased and compared as sets of distinct words,
# so a word repeated inside one document is counted once.
#
# Produces:
#   df          one row per document with precision / recall / f1_score /
#               accuracy as fractions
#   df_rounded  the same metrics multiplied by 100 and rounded to 2 decimals
#
# Raises ValueError when a document from the reference file has no row in
# `predicted` from which to read `word_count`.
results = []
results_rounded = []

# Series.unique() keeps first-appearance order, so the row order of the result
# frames is identical on every run. Iterating a set of strings is not, because
# string hashing is randomised per interpreter process.
for document in ground_truth["name"].unique():
    # Filter the predictions once and reuse the slice for words and word_count.
    predicted_rows = predicted[predicted["name"] == document]

    # NOTE(review): missing (NaN) words are not dropped here, so they become
    # set members like any other value — confirm this is intended.
    original_words = set(
        ground_truth.loc[ground_truth["name"] == document, "word"].str.lower()
    )
    predicted_words = set(predicted_rows["word"].str.lower())

    word_count_row = predicted_rows["word_count"]
    if word_count_row.empty:
        raise ValueError(f"Documento {document} sem quantidade de tokens")

    # The document size is read from the first prediction row of the document.
    word_count = word_count_row.iloc[0]

    true_positive = len(original_words & predicted_words)
    false_positive = len(predicted_words - original_words)
    false_negative = len(original_words - predicted_words)

    # Everything that is neither annotated nor predicted counts as a true
    # negative.
    # NOTE(review): word_count is presumably a token total while the three
    # counts above are over distinct words — confirm the units match.
    true_negative = word_count - false_negative - false_positive - true_positive

    # Each ratio falls back to 0 when its denominator is 0.
    predicted_total = true_positive + false_positive
    precision = (true_positive / predicted_total) if predicted_total > 0 else 0

    reference_total = true_positive + false_negative
    recall = (true_positive / reference_total) if reference_total > 0 else 0

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    accuracy = (true_positive + true_negative) / word_count

    metrics = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "accuracy": accuracy,
    }
    results.append({"document": document, **metrics})
    # Percentage view of the same metrics, rounded to two decimals.
    results_rounded.append(
        {
            "document": document,
            **{metric: round(value * 100, 2) for metric, value in metrics.items()},
        }
    )

df = pd.DataFrame(results)
df_rounded = pd.DataFrame(results_rounded)

In [17]:
# Persist the percentage metrics; the DataFrame index is written as the first
# column (index=True is the pandas default, spelled out here).
df_rounded.to_csv("result_foreign_words.csv", index=True)

In [19]:
# Unweighted mean of each metric across documents: every document
# contributes equally, whatever its word count.
df.loc[:, ["precision", "recall", "accuracy", "f1_score"]].mean()

precision    0.609325
recall       0.280600
accuracy     0.994156
f1_score     0.358276
dtype: float64