In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as mc


In [2]:
pip install pandas xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


In [4]:
df_new = pd.read_excel('overalldata.xlsx', sheet_name='Sheet1')

exclude_models = [
    "codellama-70b-instruct@anyscale",
    "codellama-7b-instruct@together-ai",
    "command-r-plus@aws-bedrock",
    "deepseek-coder-33b-instruct@together-ai",
    "phi-3-medium-4k-instruct@deepinfra"
]

filtered_df = df_new[~df_new['model'].isin(exclude_models)]

control_group = filtered_df[filtered_df['model'] == 'original_text']
other_groups = filtered_df[filtered_df['model'] != 'original_text']
combined_df = pd.concat([control_group, other_groups])

model = ols('Q("Coleman-Liau Index") ~ C(model)', data=combined_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

comp = mc.pairwise_tukeyhsd(combined_df['Coleman-Liau Index'], combined_df['model'])
tukey_results = pd.DataFrame(data=comp.summary().data[1:], columns=comp.summary().data[0])

control_comparisons = tukey_results[tukey_results['group1'] == 'original_text']

control_comparisons_sorted = control_comparisons.sort_values(by='meandiff', key=abs)
print(control_comparisons_sorted.head())

              sum_sq     df         F        PR(>F)
C(model)  200.802861   28.0  4.561794  1.323822e-11
Residual  430.751429  274.0       NaN           NaN
            group1                         group2  meandiff  p-adj   lower  \
405  original_text  qwen-2-72b-instruct@deepinfra    3.4463    0.0  1.6887   

     upper  reject  
405  5.204    True  


In [5]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as mc

df_new = pd.read_excel('overalldata.xlsx', sheet_name='Sheet1')
exclude_models = [
    "codellama-70b-instruct@anyscale",
    "codellama-7b-instruct@together-ai",
    "command-r-plus@aws-bedrock",
    "deepseek-coder-33b-instruct@together-ai",
    "phi-3-medium-4k-instruct@deepinfra",
    "nemotron-4-340b-instruct@deepinfra",
    "codellama-13b-instruct@together-ai"
]

filtered_df = df_new[~df_new['model'].isin(exclude_models)]

summary_table = filtered_df.groupby('model')['SMOG Index'].describe()


print(summary_table)
model = ols('Q("Coleman-Liau Index") ~ C(model)', data=filtered_df).fit()

anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)
anova_table.to_excel('anova_table_Coleman-Liau Index.xlsx', index=True)

tukey = mc.pairwise_tukeyhsd(filtered_df['Coleman-Liau Index'], filtered_df['model'])
print(tukey.summary())

tukey_summary_df = pd.DataFrame(data=tukey.summary().data[1:], columns=tukey.summary().data[0])
tukey_summary_df.to_excel('tukey_summary_Coleman-Liau Index.xlsx', index=True)

                                        count       mean       std   min  \
model                                                                      
claude-3-haiku@anthropic                 15.0   9.220000  1.081137   7.4   
claude-3-opus@anthropic                  15.0   9.126667  1.181081   6.9   
claude-3-sonnet@anthropic                 1.0  11.400000       NaN  11.4   
claude-3.5-sonnet@anthropic              15.0   8.846667  1.128758   6.4   
gemini-1.5-flash@vertex-ai               15.0   8.733333  1.119098   6.8   
gemini-1.5-pro@vertex-ai                 15.0   8.433333  0.718795   7.4   
gemma-2-9b-it@fireworks-ai               15.0   8.533333  1.211650   6.2   
gemma-2b-it@together-ai                  14.0   9.207143  1.219372   7.1   
gemma-7b-it@anyscale                      1.0  10.100000       NaN  10.1   
gpt-3.5-turbo@openai                     15.0   9.300000  1.281740   7.7   
gpt-4-turbo@openai                       15.0   8.293333  0.786917   6.9   
gpt-4@openai

In [6]:
summary_table = filtered_df.groupby('model')['Coleman-Liau Index'].describe()
anova_df = pd.DataFrame(anova_table)
output_file_path = 'summary_and_anova_output_Coleman-Liau Index.xlsx'

with pd.ExcelWriter(output_file_path, engine='xlsxwriter') as writer:

    summary_table.to_excel(writer, sheet_name='Summary Statistics')
    anova_df.to_excel(writer, sheet_name='ANOVA Table')

print(f"Data has been saved to {output_file_path}")

Data has been saved to summary_and_anova_output_Coleman-Liau Index.xlsx
