In [4]:
# Import necessary libraries for analysis and statistics
import pandas as pd
import os
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [5]:
# Read the model-performance results CSV, then print a preview, the column
# summary, and the row count as a quick structural sanity check.
results_csv = os.path.join("..", "results", "model_performance.csv")
df = pd.read_csv(results_csv)

# Visual divider reused between the printed sections.
section_break = "\n" + "=" * 50 + "\n"

print("Data Head:")
print(df.head())
print(section_break)
print("Data Info:")
df.info()  # writes its summary straight to stdout
print(section_break)
print(f"Total runs loaded: {len(df)}")

Data Head:
    dataset               model  run_id  accuracy  precision    recall  \
0  baseline  LogisticRegression       1  0.731278   0.735000  0.948387   
1  baseline  LogisticRegression       2  0.775330   0.765306  0.967742   
2  baseline  LogisticRegression       3  0.740088   0.737624  0.961290   
3  baseline  LogisticRegression       4  0.740088   0.733010  0.974194   
4  baseline  LogisticRegression       5  0.726872   0.738462  0.929032   

         f1   roc_auc  
0  0.828169  0.596192  
1  0.854701  0.700806  
2  0.834734  0.631541  
3  0.836565  0.625896  
4  0.822857  0.640054  


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   dataset    90 non-null     object 
 1   model      90 non-null     object 
 2   run_id     90 non-null     int64  
 3   accuracy   90 non-null     float64
 4   precision  90 non-null     float64
 5   

In [None]:
# Two-way ANOVA on F1 with `model` and `dataset` as factors.
# In the formula, C() marks a column as categorical and `*` expands to both
# main effects plus the model-by-dataset interaction term.
formula = "f1 ~ C(model) * C(dataset)"

# Fit by ordinary least squares, then derive the Type II ANOVA table from it.
model = ols(formula=formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Title and table are emitted on consecutive lines.
print("Two-Way ANOVA Results (Response Variable: F1-Score):", anova_table, sep="\n")

Two-Way ANOVA Results (Response Variable: F1-Score):
                       sum_sq    df          F        PR(>F)
C(model)             0.005634   2.0  24.136876  5.990649e-09
C(dataset)           0.000240   2.0   1.027335  3.625759e-01
C(model):C(dataset)  0.000101   4.0   0.215312  9.292178e-01
Residual             0.009453  81.0        NaN           NaN


In [7]:
# Post-hoc pairwise comparison of F1 between the `model` groups using
# Tukey's HSD at a 0.05 family-wise error rate.
tukey_model = pairwise_tukeyhsd(df["f1"], df["model"], alpha=0.05)

# Show the comparison table under a descriptive title.
print("Tukey's HSD Post-Hoc Test Results (Factor: Model):", tukey_model, sep="\n")

Tukey's HSD Post-Hoc Test Results (Factor: Model):
            Multiple Comparison of Means - Tukey HSD, FWER=0.05            
      group1             group2       meandiff p-adj   lower  upper  reject
---------------------------------------------------------------------------
               KNN LogisticRegression   0.0132    0.0  0.0067 0.0198   True
               KNN       RandomForest   0.0189    0.0  0.0123 0.0254   True
LogisticRegression       RandomForest   0.0056 0.1049 -0.0009 0.0122  False
---------------------------------------------------------------------------


In [8]:
# Two-way ANOVA with recall as the response: main effects of `model` and
# `dataset` (both treated as categorical via C()) plus their interaction.
formula_recall = "recall ~ C(model) * C(dataset)"

# OLS fit first; the Type II ANOVA table is computed from the fitted model.
model_recall = ols(formula=formula_recall, data=df).fit()
anova_table_recall = sm.stats.anova_lm(model_recall, typ=2)

# Title and table are emitted on consecutive lines.
print(
    "Two-Way ANOVA Results (Response Variable: Recall):",
    anova_table_recall,
    sep="\n",
)

Two-Way ANOVA Results (Response Variable: Recall):
                       sum_sq    df          F        PR(>F)
C(model)             0.066820   2.0  83.280261  2.236387e-20
C(dataset)           0.000445   2.0   0.554502  5.765226e-01
C(model):C(dataset)  0.000515   4.0   0.321058  8.631085e-01
Residual             0.032495  81.0        NaN           NaN


In [9]:
# Post-hoc pairwise comparison of recall between the `model` groups using
# Tukey's HSD at a 0.05 family-wise error rate.
recall_values = df["recall"]
model_labels = df["model"]
tukey_recall = pairwise_tukeyhsd(endog=recall_values, groups=model_labels, alpha=0.05)

# Show the comparison table under a descriptive title.
print(
    "Tukey's HSD Post-Hoc Test Results (Factor: Model, Metric: Recall):",
    tukey_recall,
    sep="\n",
)

Tukey's HSD Post-Hoc Test Results (Factor: Model, Metric: Recall):
            Multiple Comparison of Means - Tukey HSD, FWER=0.05             
      group1             group2       meandiff p-adj   lower   upper  reject
----------------------------------------------------------------------------
               KNN LogisticRegression   0.0647    0.0  0.0527  0.0768   True
               KNN       RandomForest   0.0465    0.0  0.0344  0.0585   True
LogisticRegression       RandomForest  -0.0183 0.0015 -0.0304 -0.0062   True
----------------------------------------------------------------------------


In [None]:
# Cohen's d for recall: LogisticRegression vs. RandomForest, treated as two
# independent samples.
#
# The effect size is computed directly from the pandas Series instead of via
# `pg.compute_effsize`: the notebook's import cell does not import pingouin
# (`pg`), so that call fails with NameError on a fresh kernel. The formula
# below is the standard pooled-standard-deviation Cohen's d.
lr_recalls = df.loc[df["model"] == "LogisticRegression", "recall"]
rf_recalls = df.loc[df["model"] == "RandomForest", "recall"]

# count() ignores missing values, consistent with how mean() and var() do.
n_lr, n_rf = lr_recalls.count(), rf_recalls.count()

# Pooled standard deviation built from the unbiased (ddof=1) sample variances.
pooled_sd = (
    ((n_lr - 1) * lr_recalls.var(ddof=1) + (n_rf - 1) * rf_recalls.var(ddof=1))
    / (n_lr + n_rf - 2)
) ** 0.5

# Sign convention: positive d means LogisticRegression has the higher mean.
effect_size = (lr_recalls.mean() - rf_recalls.mean()) / pooled_sd

print(f"Cohen's d for Recall (LogisticRegression vs. RandomForest): {effect_size:.4f}")

Cohen's d for Recall (LogisticRegression vs. RandomForest): 1.1003


In [13]:
# Cohen's d for F1: LogisticRegression vs. RandomForest, treated as two
# independent samples.
#
# The effect size is computed directly from the pandas Series instead of via
# `pg.compute_effsize`: the notebook's import cell does not import pingouin
# (`pg`), so that call fails with NameError on a fresh kernel. The formula
# below is the standard pooled-standard-deviation Cohen's d.
lr_f1s = df.loc[df["model"] == "LogisticRegression", "f1"]
rf_f1s = df.loc[df["model"] == "RandomForest", "f1"]

# count() ignores missing values, consistent with how mean() and var() do.
n_lr_f1, n_rf_f1 = lr_f1s.count(), rf_f1s.count()

# Pooled standard deviation built from the unbiased (ddof=1) sample variances.
pooled_sd_f1 = (
    ((n_lr_f1 - 1) * lr_f1s.var(ddof=1) + (n_rf_f1 - 1) * rf_f1s.var(ddof=1))
    / (n_lr_f1 + n_rf_f1 - 2)
) ** 0.5

# Sign convention: negative d means RandomForest has the higher mean.
effect_size_f1 = (lr_f1s.mean() - rf_f1s.mean()) / pooled_sd_f1

print(
    f"Cohen's d for F1-Score (LogisticRegression vs. RandomForest): {effect_size_f1:.4f}"
)

Cohen's d for F1-Score (LogisticRegression vs. RandomForest): -0.5826
