In [55]:
#Load all the essential packages for the analysis
import pandas as pd
import statsmodels.api as sm

# Data

Download the replication data from https://zenodo.org/records/14878936 and place the files in the `data` folder.

# Regression analysis

We fit logistic regression models to estimate the effect of the number of reasoning tokens on response accuracy.

In [58]:
#Create dataframes with necessary metadata per question for reasoning models
#Prepare data for logistic regression analysis
from Performance_eval import parse_domain, get_dataframe_reasoning_models
num_tiers = 4

#Set to False to run the analysis without the difficulty/domain controls
include_controls = True


def prepare_logit_inputs(judge_file, num_tiers, include_controls=True):
    """Build the regression inputs for one model's judged answers.

    Parameters
    ----------
    judge_file : str
        File name handed to ``get_dataframe_reasoning_models``.
    num_tiers : int
        Number of quantile bins the ``difficulty`` column is cut into.
    include_controls : bool, default True
        If True, the difficulty-tier and domain dummy columns are used as
        regressors next to ``reasoning_tokens``. If False, ``reasoning_tokens``
        is the only regressor (a constant is added in both cases).

    Returns
    -------
    tuple
        ``(df, X, y)``: the processed dataframe with dummy columns, the design
        matrix including a constant, and the ``correctness`` column cast to int.
    """
    df = get_dataframe_reasoning_models(judge_file)

    #Bin difficulty into equal-frequency tiers labelled 'Tier 1' ... 'Tier <num_tiers>'
    df['difficulty'] = pd.qcut(
        df['difficulty'],
        q=num_tiers,
        labels=[f'Tier {i+1}' for i in range(num_tiers)]
        )

    #One row per domain entry (assumes 'domain' holds list-like values -- TODO confirm),
    #then map every entry through parse_domain and drop rows that ended up identical
    df = df.explode("domain")
    df['domain'] = df['domain'].apply(parse_domain)
    df = df[~df.duplicated()].reset_index(drop=True)

    #Precalculus is merged into the Calculus category
    df.loc[df["domain"] == "Precalculus", "domain"] = "Calculus"

    #drop_first=True omits the first category of each variable, which becomes the baseline
    df = pd.get_dummies(df, columns=['difficulty'], drop_first=True, dtype=int)
    df = pd.get_dummies(df, columns=['domain'], drop_first=True, dtype=int)

    regressors = ['reasoning_tokens']
    if include_controls:
        regressors += [col for col in df.columns if col.startswith('difficulty_')]
        regressors += [col for col in df.columns if col.startswith('domain_')]

    X = sm.add_constant(df[regressors])
    y = df['correctness'].astype(int)
    return df, X, y


#======================================================== o1-mini ========================================================
df_o1, X_o1, y_o1 = prepare_logit_inputs(
    'omni-judge_output_o1.jsonl', num_tiers, include_controls
)

#======================================================== o3-mini (m) ========================================================
df_o3, X_o3, y_o3 = prepare_logit_inputs(
    'omni-judge_output_o3.jsonl', num_tiers, include_controls
)

#======================================================== o3-mini (h) ========================================================
df_o3_high, X_o3_high, y_o3_high = prepare_logit_inputs(
    'omni-judge_output_o3_high.jsonl', num_tiers, include_controls
)

In [None]:
#Logit regression results for o1-mini
#Response: y_o1, design matrix: X_o1; standard errors use the HC1 robust covariance
logit_model_o1 = sm.Logit(
    endog=y_o1,
    exog=X_o1,
).fit(cov_type='HC1')
logit_model_o1.summary()

#Optional White test (het_white would have to be imported from statsmodels.stats.diagnostic first):
#print(f"P-val White test: {het_white(logit_model_o1.resid_pearson**2, X_o1)[1]}")
#For the reasoning_tokens coefficient we report the value in scientific notation

Optimization terminated successfully.
         Current function value: 0.576917
         Iterations 6


0,1,2,3
Dep. Variable:,correctness,No. Observations:,5535.0
Model:,Logit,Df Residuals:,5524.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 17 Mar 2025",Pseudo R-squ.:,0.1497
Time:,16:36:23,Log-Likelihood:,-3193.2
converged:,True,LL-Null:,-3755.5
Covariance Type:,HC1,LLR p-value:,2.716e-235

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.9315,0.080,24.112,0.000,1.775,2.089
reasoning_tokens,-0.0002,8e-06,-20.073,0.000,-0.000,-0.000
difficulty_Tier 2,-0.5337,0.083,-6.421,0.000,-0.697,-0.371
difficulty_Tier 3,-0.7404,0.099,-7.462,0.000,-0.935,-0.546
difficulty_Tier 4,-1.0764,0.089,-12.134,0.000,-1.250,-0.903
domain_Applied Mathematics,-0.4093,0.097,-4.200,0.000,-0.600,-0.218
domain_Calculus,0.1254,0.163,0.770,0.441,-0.194,0.445
domain_Discrete Mathematics,-0.8649,0.093,-9.319,0.000,-1.047,-0.683
domain_Geometry,-0.4618,0.092,-5.047,0.000,-0.641,-0.282


In [60]:
#Effect size of reasoning tokens on accuracy for o1-mini
#Average marginal effects, one entry per regressor (the constant is excluded);
#the first entry belongs to reasoning_tokens.
#The effect of 1000 extra tokens is that first entry times 1000 (scaling not applied here).
logit_model_o1.get_margeff(at='overall', method='dydx').margeff

array([-3.15774310e-05, -1.04883372e-01, -1.45507720e-01, -2.11529730e-01,
       -8.04300317e-02,  2.46420647e-02, -1.69959244e-01, -9.07567053e-02,
        3.50506479e-03,  9.31500244e-02])

In [61]:
#Logit regression results for o3-mini (m)
#Response: y_o3, design matrix: X_o3; standard errors use the HC1 robust covariance
logit_model_o3 = sm.Logit(
    endog=y_o3,
    exog=X_o3,
).fit(cov_type='HC1')
logit_model_o3.summary()

#Optional White test (het_white would have to be imported from statsmodels.stats.diagnostic first):
#print(f"P-val White test: {het_white(logit_model_o3.resid_pearson**2, X_o3)[1]}")

Optimization terminated successfully.
         Current function value: 0.544435
         Iterations 6


0,1,2,3
Dep. Variable:,correctness,No. Observations:,5531.0
Model:,Logit,Df Residuals:,5520.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 17 Mar 2025",Pseudo R-squ.:,0.07073
Time:,16:39:02,Log-Likelihood:,-3011.3
converged:,True,LL-Null:,-3240.5
Covariance Type:,HC1,LLR p-value:,3.4169999999999996e-92

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.0235,0.082,24.610,0.000,1.862,2.185
reasoning_tokens,-0.0001,7.11e-06,-15.242,0.000,-0.000,-9.44e-05
difficulty_Tier 2,-0.3591,0.089,-4.029,0.000,-0.534,-0.184
difficulty_Tier 3,-0.5627,0.105,-5.361,0.000,-0.768,-0.357
difficulty_Tier 4,-0.6953,0.094,-7.382,0.000,-0.880,-0.511
domain_Applied Mathematics,-0.3408,0.101,-3.363,0.001,-0.539,-0.142
domain_Calculus,0.0278,0.178,0.156,0.876,-0.321,0.376
domain_Discrete Mathematics,-0.4950,0.094,-5.270,0.000,-0.679,-0.311
domain_Geometry,-0.2114,0.096,-2.206,0.027,-0.399,-0.024


In [62]:
#Effect size of reasoning tokens on accuracy for o3-mini (m)
#Average marginal effects, one entry per regressor (the constant is excluded);
#the first entry belongs to reasoning_tokens.
#The effect of 1000 extra tokens is that first entry times 1000 (scaling not applied here).
logit_model_o3.get_margeff(at='overall', method='dydx').margeff

array([-1.96353964e-05, -6.50583589e-02, -1.01949381e-01, -1.25975474e-01,
       -6.17527028e-02,  5.02862291e-03, -8.96937407e-02, -3.83031476e-02,
        7.15054462e-03,  4.09152147e-03])

In [63]:
#Logit regression results for o3-mini (h)
#Response: y_o3_high, design matrix: X_o3_high; standard errors use the HC1 robust covariance
logit_model_o3_unlimited = sm.Logit(
    endog=y_o3_high,
    exog=X_o3_high,
).fit(cov_type='HC1')
logit_model_o3_unlimited.summary()

#Optional White test (het_white would have to be imported from statsmodels.stats.diagnostic first):
# print(f"P-val White test: {het_white(logit_model_o3_unlimited.resid_pearson**2, X_o3_high)[1]}")

Optimization terminated successfully.
         Current function value: 0.494473
         Iterations 6


0,1,2,3
Dep. Variable:,correctness,No. Observations:,5526.0
Model:,Logit,Df Residuals:,5515.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 17 Mar 2025",Pseudo R-squ.:,0.07629
Time:,16:39:27,Log-Likelihood:,-2732.5
converged:,True,LL-Null:,-2958.1
Covariance Type:,HC1,LLR p-value:,1.061e-90

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.2523,0.089,25.293,0.000,2.078,2.427
reasoning_tokens,-5.071e-05,3.27e-06,-15.512,0.000,-5.71e-05,-4.43e-05
difficulty_Tier 2,-0.1996,0.097,-2.056,0.040,-0.390,-0.009
difficulty_Tier 3,-0.3731,0.114,-3.284,0.001,-0.596,-0.150
difficulty_Tier 4,-0.6372,0.102,-6.241,0.000,-0.837,-0.437
domain_Applied Mathematics,-0.3746,0.110,-3.413,0.001,-0.590,-0.159
domain_Calculus,0.0872,0.194,0.450,0.653,-0.293,0.467
domain_Discrete Mathematics,-0.4124,0.100,-4.133,0.000,-0.608,-0.217
domain_Geometry,-0.2570,0.102,-2.515,0.012,-0.457,-0.057


In [None]:
#Effect size of reasoning tokens on accuracy for o3-mini (h)
#Average marginal effects, one entry per regressor (the constant is excluded);
#the first entry belongs to reasoning_tokens.
#The effect of 1000 extra tokens is that first entry times 1000 (scaling not applied here).
logit_model_o3_unlimited.get_margeff(at='overall', method='dydx').margeff

array([-8.10523724e-06, -3.19025183e-02, -5.96443215e-02, -1.01846965e-01,
       -5.98822673e-02,  1.39460870e-02, -6.59150200e-02, -4.10725714e-02,
        1.23140006e-02, -4.01888476e-03])