In [14]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Root directory for your data
root_directory = "data/corrected/"
df_llm = pd.read_csv(root_directory + 'gpt3-5_graded_corrected_questions.csv')
df_human = pd.read_csv(root_directory + 'manually_graded_corrected_questions.csv')

# List of categories to compare
categories = ["Grammatical", "Relevance", "Clarity", "Specificity", "Prettiness"]

t_tests = {}
hypothesis_results = {}

mae_scores = {}
rmse_scores = {}

for category in categories:
    llm_scores = df_llm[category]
    human_scores = df_human[category]
    
    # Z-score normalization (handling std = 0)
    llm_mean = llm_scores.mean()
    llm_std = llm_scores.std()
    if llm_std == 0:
        llm_zscores = np.zeros_like(llm_scores)
    else:
        llm_zscores = (llm_scores - llm_mean) / llm_std

    human_mean = human_scores.mean()
    human_std = human_scores.std()
    if human_std == 0:
        human_zscores = np.zeros_like(human_scores)
    else:
        human_zscores = (human_scores - human_mean) / human_std

    # Perform paired t-test on z-scores
    t_stat, p_value = ttest_rel(llm_zscores, human_zscores)
    t_tests[category] = (t_stat, p_value)
    
    alpha = 0.05
    if p_value < alpha:
        hypothesis_results[category] = "Reject H_0"
    else:
        hypothesis_results[category] = "Don't reject H_0"
    
    # Calculate MAE and RMSE (on original scores, not z-scores)
    mae = mean_absolute_error(human_scores, llm_scores)
    rmse = np.sqrt(mean_squared_error(human_scores, llm_scores))
    mae_scores[category] = mae
    rmse_scores[category] = rmse

# Output t-test results
print("T-tests (t-statistic, p-value) and Hypothesis Decision:")
for category, result in t_tests.items():
    t_stat, p_value = result
    decision = hypothesis_results[category]
    print(f"{category}: t-statistic = {t_stat}, p-value = {p_value} => {decision}")

# Output Mean Absolute Error results
print("\nMean Absolute Error (MAE):")
for category, mae in mae_scores.items():
    print(f"{category}: {mae}")

# Output Root Mean Squared Error results
print("\nRoot Mean Squared Error (RMSE):")
for category, rmse in rmse_scores.items():
    print(f"{category}: {rmse}")


T-tests (t-statistic, p-value) and Hypothesis Decision:
Grammatical: t-statistic = -3.6267285471088444e-15, p-value = 0.9999999999999971 => Don't reject H_0
Relevance: t-statistic = -5.828670879282072e-16, p-value = 0.9999999999999996 => Don't reject H_0
Clarity: t-statistic = -6.568819562365509e-16, p-value = 0.9999999999999996 => Don't reject H_0
Specificity: t-statistic = 1.6653345369377353e-16, p-value = 0.9999999999999999 => Don't reject H_0
Prettiness: t-statistic = -4.946380139329722e-17, p-value = 1.0 => Don't reject H_0

Mean Absolute Error (MAE):
Grammatical: 0.3333333333333333
Relevance: 0.2222222222222222
Clarity: 0.2222222222222222
Specificity: 0.25
Prettiness: 0.4444444444444444

Root Mean Squared Error (RMSE):
Grammatical: 0.5773502691896257
Relevance: 0.9428090415820634
Clarity: 0.9428090415820634
Specificity: 0.9574271077563381
Prettiness: 0.8819171036881969


Hi, I have questions generated by visual genome data and template questions. Some of them are still wrong. Can you correct them and grade them to score from 1 to 5 using criterias (Grammatical,Relevance,Clarity,Specificity,Prettiness,Average Score,Reason for Corrections) 

Please return the output as CSV file with
Original Question,Corrected Question,Grammatical,Relevance,Clarity,Specificity,Prettiness,Average Score,Reason for Corrections as headers

Here is the generated questions along with its context data