In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as mc

In [2]:
# Models that the analysis cells filter out of df_new before running Tukey's HSD.
exclude_models = [
    "codellama-70b-instruct@anyscale",
    "codellama-7b-instruct@together-ai",
    "command-r-plus@aws-bedrock",
    "deepseek-coder-33b-instruct@together-ai",
    "phi-3-medium-4k-instruct@deepinfra",
    "nemotron-4-340b-instruct@deepinfra",
    "codellama-13b-instruct@together-ai",
]

# Load the 'Sheet1' worksheet of overalldata.xlsx into df_new.
df_new = pd.read_excel('overalldata.xlsx', sheet_name='Sheet1')

## Tukey's HSD test to compare differences for Automated Readability Index

In [10]:
# Keep only the rows whose model is not in the exclusion list.
keep_rows = ~df_new['model'].isin(exclude_models)
filtered_df = df_new[keep_rows]

# Per-model descriptive statistics for the metric.
summary_table = filtered_df.groupby('model')['Automated Readability Index'].describe()

# Tukey's HSD test: pairwise comparison of Automated Readability Index between models.
tukey = mc.pairwise_tukeyhsd(
    filtered_df['Automated Readability Index'],
    filtered_df['model'],
)

# Build the results table once; it is printed and then exported.
tukey_table = tukey.summary()
print(tukey_table)

# The first row of the table data holds the column names; the rest are the comparisons.
header, *rows = tukey_table.data
tukey_summary_df = pd.DataFrame(data=rows, columns=header)
tukey_summary_df.to_excel('tukey_summary_Automated Readability Index.xlsx', index=True)


                                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                                 
                group1                                 group2                 meandiff p-adj   lower    upper  reject
---------------------------------------------------------------------------------------------------------------------
              claude-3-haiku@anthropic                claude-3-opus@anthropic  -0.1133    1.0  -2.4422  2.2156  False
              claude-3-haiku@anthropic              claude-3-sonnet@anthropic     3.58 0.9571  -3.0071 10.1671  False
              claude-3-haiku@anthropic            claude-3.5-sonnet@anthropic  -0.6667    1.0  -2.9956  1.6622  False
              claude-3-haiku@anthropic             gemini-1.5-flash@vertex-ai  -0.3667    1.0  -2.6956  1.9622  False
              claude-3-haiku@anthropic               gemini-1.5-pro@vertex-ai  -0.8133 0.9999  -3.1422  1.5156  False
              claude-3-haiku@anthropic             gemma

## Tukey's HSD test to compare differences for Coleman-Liau Index


In [8]:
# Drop the excluded models. Every step below reads this cell's own frame, so the
# result does not depend on a `filtered_df` left behind by another cell.
filtered_df1 = df_new[~df_new['model'].isin(exclude_models)]

# Per-model descriptive statistics for the metric.
summary_table = filtered_df1.groupby('model')['Coleman-Liau Index'].describe()

# Tukey's HSD test to compare differences in Coleman-Liau Index between models
tukey = mc.pairwise_tukeyhsd(filtered_df1['Coleman-Liau Index'], filtered_df1['model'])

# Build the results table once and reuse it for printing and for the export.
tukey_table = tukey.summary()
print(tukey_table)

# The first row of the table data is the header; the remaining rows are the pairwise comparisons.
header, *rows = tukey_table.data
tukey_summary_df = pd.DataFrame(data=rows, columns=header)
tukey_summary_df.to_excel('tukey_summary_Coleman-Liau Index.xlsx', index=True)

                                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                                 
                group1                                 group2                 meandiff p-adj   lower    upper  reject
---------------------------------------------------------------------------------------------------------------------
              claude-3-haiku@anthropic                claude-3-opus@anthropic   0.2373    1.0  -1.4731  1.9477  False
              claude-3-haiku@anthropic              claude-3-sonnet@anthropic    2.346 0.9893  -2.4917  7.1837  False
              claude-3-haiku@anthropic            claude-3.5-sonnet@anthropic   0.1627    1.0  -1.5477  1.8731  False
              claude-3-haiku@anthropic             gemini-1.5-flash@vertex-ai    0.276    1.0  -1.4344  1.9864  False
              claude-3-haiku@anthropic               gemini-1.5-pro@vertex-ai    0.276    1.0  -1.4344  1.9864  False
              claude-3-haiku@anthropic             gemma

## Tukey's HSD test to compare differences for Flesch Reading Ease

In [14]:
# Rows for every model that is not on the exclusion list.
is_excluded = df_new['model'].isin(exclude_models)
filtered_df = df_new[~is_excluded]

# Per-model descriptive statistics for the metric.
summary_table = filtered_df.groupby('model')['Flesch Reading Ease'].describe()

# Tukey's HSD test: pairwise comparison of Flesch Reading Ease between models.
tukey = mc.pairwise_tukeyhsd(
    filtered_df['Flesch Reading Ease'],
    filtered_df['model'],
)

# Render the results table a single time, then print and export it.
tukey_table = tukey.summary()
print(tukey_table)

# Split the table data into its header row and the comparison rows.
table_rows = tukey_table.data
tukey_summary_df = pd.DataFrame(data=table_rows[1:], columns=table_rows[0])
tukey_summary_df.to_excel('tukey_summary_Flesch Reading Ease.xlsx', index=True)

                                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                                 
                group1                                 group2                 meandiff p-adj   lower    upper  reject
---------------------------------------------------------------------------------------------------------------------
              claude-3-haiku@anthropic                claude-3-opus@anthropic  -1.5593    1.0 -10.3735  7.2548  False
              claude-3-haiku@anthropic              claude-3-sonnet@anthropic -10.5047 0.9986 -35.4349 14.4255  False
              claude-3-haiku@anthropic            claude-3.5-sonnet@anthropic   0.9967    1.0  -7.8175  9.8108  False
              claude-3-haiku@anthropic             gemini-1.5-flash@vertex-ai    0.814    1.0  -8.0002  9.6282  False
              claude-3-haiku@anthropic               gemini-1.5-pro@vertex-ai  -0.0133    1.0  -8.8275  8.8008  False
              claude-3-haiku@anthropic             gemma

## Tukey's HSD test to compare differences for Flesch-Kincaid Grade Level

In [15]:
# Keep only the rows whose model is not in the exclusion list.
filtered_df = df_new[~df_new['model'].isin(exclude_models)]

# Per-model descriptive statistics for the metric.
summary_table = filtered_df.groupby('model')['Flesch-Kincaid Grade Level'].describe()

# Tukey's HSD test to compare differences in Flesch-Kincaid Grade Level between models
tukey = mc.pairwise_tukeyhsd(filtered_df['Flesch-Kincaid Grade Level'], filtered_df['model'])

# Build the results table once and reuse it for printing and for the export.
tukey_table = tukey.summary()
print(tukey_table)

# The first row of the table data is the header; the remaining rows are the pairwise comparisons.
header, *rows = tukey_table.data
tukey_summary_df = pd.DataFrame(data=rows, columns=header)
# The file name follows the metric's column name, like the other tukey_summary_* exports.
tukey_summary_df.to_excel('tukey_summary_Flesch-Kincaid Grade Level.xlsx', index=True)

                                Multiple Comparison of Means - Tukey HSD, FWER=0.05                                 
                group1                                 group2                 meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------------------------------------------------------
              claude-3-haiku@anthropic                claude-3-opus@anthropic   0.0333    1.0 -1.8538  1.9204  False
              claude-3-haiku@anthropic              claude-3-sonnet@anthropic     2.54 0.9917 -2.7975  7.8775  False
              claude-3-haiku@anthropic            claude-3.5-sonnet@anthropic  -0.6533    1.0 -2.5404  1.2338  False
              claude-3-haiku@anthropic             gemini-1.5-flash@vertex-ai  -0.5867    1.0 -2.4738  1.3004  False
              claude-3-haiku@anthropic               gemini-1.5-pro@vertex-ai  -0.8267 0.9975 -2.7138  1.0604  False
              claude-3-haiku@anthropic             gemma-2-9b-it