In [None]:
import pandas as pd
import re

## Gera arquivo .csv com os BERTScores para análise em planilha

In [None]:
def process_txt_to_dataframe(file_path, encoding="utf-8"):
    """Parse a BERTScore text report into a DataFrame with one row per record.

    The report is read line by line. A line of the form
    ``BertScore <base> - <model> - Tópico <n>:`` sets the base, model and
    topic for the metric lines that follow. Every ``<label>: <value>`` line
    whose label is one of the known metrics stores that value, and the
    ``F1 Top-k Avg`` line closes the record and emits a row.

    Args:
        file_path: Path of the text report to parse.
        encoding: Text encoding used to read the report. It is passed
            explicitly because the header pattern contains the non-ASCII
            word "Tópico", which only matches when the file is decoded
            correctly.

    Returns:
        pandas.DataFrame with the columns "Base", "Tópico", "Modelo",
        "Weighted" (the value of the ``Weight:`` line) and one column per
        remaining metric label. A metric that was not seen since the last
        header is NaN in the emitted row.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the text after the first colon of a metric line is
            not a valid float.
    """
    header_pattern = re.compile(r"BertScore\s+(.+?)\s+-\s+(.+?)\s+-\s+Tópico\s+(\d+):")

    # Metric labels exactly as they appear in the report, in output-column order.
    metric_labels = [
        "Weight", "Weighted F1 Avg", "Weighted F1 Max", "Weighted F1 Min", "Weighted F1 Top-k Avg",
        "Precision Avg", "Recall Avg", "F1 Avg",
        "Precision Max", "Recall Max", "F1 Max",
        "Precision Min", "Recall Min", "F1 Min",
        "Precision Top-k Avg", "Recall Top-k Avg", "F1 Top-k Avg",
    ]
    closing_label = metric_labels[-1]
    missing = float("nan")

    rows = []
    current_base = None
    current_model = None
    current_topic = None
    metrics = dict.fromkeys(metric_labels, missing)

    with open(file_path, "r", encoding=encoding) as report:
        for raw_line in report:
            line = raw_line.strip()

            # Header line carrying base, model and topic.
            if line.startswith("BertScore"):
                match = header_pattern.match(line)
                if match:
                    current_base = match.group(1)
                    current_model = match.group(2)
                    current_topic = int(match.group(3))
                    # Start from a clean slate so values parsed under a
                    # previous header can never leak into this one.
                    metrics = dict.fromkeys(metric_labels, missing)
                continue

            # Metric line: the text before the first colon is the label.
            fields = line.split(":")
            if len(fields) < 2 or fields[0] not in metrics:
                continue
            label = fields[0]
            metrics[label] = float(fields[1].strip())

            # The last metric of a record triggers the row.
            if label == closing_label:
                rows.append(
                    [current_base, current_topic, current_model]
                    + [metrics[name] for name in metric_labels]
                )

    # Build the DataFrame; "Weighted" is the historical column name for "Weight".
    columns = ["Base", "Tópico", "Modelo", "Weighted"] + metric_labels[1:]
    return pd.DataFrame(rows, columns=columns)


In [None]:
# Parse the v6 BERTScore report named for "atos golpistas" into its own frame.
df_atos_golpistas = process_txt_to_dataframe(
    file_path="result_bertscore_atos_golpistas_v6.txt",
)

In [None]:
# Parse the v6 BERTScore report named for "lula" into its own frame.
df_lula = process_txt_to_dataframe(
    file_path="result_bertscore_lula_v6.txt",
)

In [None]:
# Parse the v6 BERTScore report named for "bolsonaro" into its own frame.
df_bolsonaro = process_txt_to_dataframe(
    file_path="result_bertscore_bolsonaro_v6.txt",
)

In [None]:
# Stack the three parsed reports row-wise and renumber the index from zero.
df_final = pd.concat(
    objs=[df_atos_golpistas, df_lula, df_bolsonaro],
    axis=0,
    ignore_index=True,
)

In [None]:
# Write the combined table to CSV without the positional index column.
df_final.to_csv(
    path_or_buf="resultados_tratados_weighted.csv",
    index=False,
)

## Código para o cálculo da correlação de Pearson e p-valores

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input for the correlation analysis - fill in the BertScore and Likert means here.
data = {
    "Modelo": ["Bode", "Bode", "Llama 2", "Llama 2", "Llama 3", "Llama 3", "Mistral", "Mistral"],
    "Base": ["Bolsonaro", "Lula", "Bolsonaro", "Lula", "Bolsonaro", "Lula", "Bolsonaro", "Lula"],
    "Média F1 BertScore - Com peso": [0.508, 0.518, 0.576, 0.576, 0.589, 0.581, 0.495, 0.522],
    "Média F1 BertScore": [0.621, 0.598, 0.602, 0.595, 0.612, 0.602, 0.572, 0.572],
    "Média Likert": [4.836, 5.346, 5.478, 5.203, 5.668, 5.415, 5.096, 4.745],
}

df = pd.DataFrame(data)

# Pearson correlation of each BertScore column against the Likert means
# (off-diagonal entry of the 2x2 correlation matrix).
correlation_weights = np.corrcoef(df["Média F1 BertScore - Com peso"], df["Média Likert"])[0, 1]
correlation_no_weights = np.corrcoef(df["Média F1 BertScore"], df["Média Likert"])[0, 1]


def _draw_likert_panel(ax, frame, score_column, pearson_r):
    """Draw one scatter panel of Likert means against `score_column` with a linear trend line."""
    scores = frame[score_column]
    likert = frame["Média Likert"]

    ax.scatter(scores, likert, alpha=0.7, label="Dados")
    ax.set_title(f"Correlação: {pearson_r:.2f}", fontsize=12)
    ax.set_xlabel(score_column, fontsize=10)
    ax.set_ylabel("Média Likert", fontsize=10)
    ax.grid(True)

    # Degree-1 least-squares fit, evaluated at the sorted distinct x values.
    trend_x = np.unique(scores)
    trend_y = np.polyval(np.polyfit(scores, likert, 1), trend_x)
    ax.plot(trend_x, trend_y, color="red", label="Tendência")
    ax.legend()


# Two side-by-side scatter plots: weighted score on the left, unweighted on the right.
fig, (ax_weighted, ax_unweighted) = plt.subplots(1, 2, figsize=(12, 6))
_draw_likert_panel(ax_weighted, df, "Média F1 BertScore - Com peso", correlation_weights)
_draw_likert_panel(ax_unweighted, df, "Média F1 BertScore", correlation_no_weights)

fig.tight_layout()
plt.show()

# Display both correlations as the cell output.
correlation_weights, correlation_no_weights


In [None]:
from scipy.stats import pearsonr

# Statistical significance of the correlations: pearsonr yields the
# coefficient together with its p-value for each BertScore column.
(correlation_weights, pval_weights), (correlation_no_weights, pval_no_weights) = [
    pearsonr(df[score_column], df["Média Likert"])
    for score_column in ("Média F1 BertScore - Com peso", "Média F1 BertScore")
]

# Display the four numbers as a labelled mapping (cell output).
dict(
    zip(
        ("Correlação com peso", "P-valor com peso", "Correlação sem peso", "P-valor sem peso"),
        (correlation_weights, pval_weights, correlation_no_weights, pval_no_weights),
    )
)
