## Group comparison in python 

import libaries

In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pingouin as pg

Generate Example Data

In [26]:
np.random.seed(42)
data = pd.DataFrame({
    'Group': np.random.choice(['A', 'B'], 100),
    'Score': np.random.normal(50, 10, 100),
    'Subject': np.arange(100),
    'Time1': np.random.normal(50, 10, 100),
    'Time2': np.random.normal(52, 10, 100),
    'Covariate1': np.random.normal(100, 15, 100),
    'Covariate2': np.random.normal(100, 15, 100),})

# longitudinal dataframe
data_long = data.melt(id_vars=['Group', 'Subject', 'Score', 'Covariate1', 'Covariate2'], value_vars=['Time1', 'Time2'], 
                      var_name='Time', value_name='Time_Score')

Independent t-test

In [None]:
ind_ttest = stats.ttest_ind(
    data.loc[data['Group'] == 'A', 'Score'],
    data.loc[data['Group'] == 'B', 'Score'],
    equal_var=True)
print("Independent t-test:", ind_ttest)

Independent t-test: TtestResult(statistic=0.8466418913640135, pvalue=0.39925748482515977, df=98.0)


Paired t-test

In [None]:
paired_ttest = stats.ttest_rel(data['Time1'], data['Time2'])
print("Paired t-test:", paired_ttest)

Paired t-test: TtestResult(statistic=-0.9653334717494163, pvalue=0.33673000817565746, df=99)


One-way ANOVA

In [None]:
# statsmodels
anova = smf.ols('Score ~ Group', data=data).fit()
anova_result = sm.stats.anova_lm(anova, typ=2)

# pingouin
anova_result = pg.anova(data=data, dv='Score', between='Group').round(3)

print("One-way ANOVA:", anova_result)

One-way ANOVA:   Source  ddof1  ddof2      F  p-unc    np2
0  Group      1     98  0.717  0.399  0.007


Repeated Measures ANOVA

In [None]:
rm_anova = pg.rm_anova(data=data_long, dv='Time_Score', within='Time', subject='Subject', detailed=True)
print("Repeated Measures ANOVA:", rm_anova)

Repeated Measures ANOVA:   Source            SS  DF          MS         F    p-unc       ng2  eps
0   Time    100.828244   1  100.828244  0.931869  0.33673  0.005367  1.0
1  Error  10711.805208  99  108.200053       NaN      NaN       NaN  NaN


Mixed ANOVA

In [None]:
mixed_anova = pg.mixed_anova(data=data_long, dv='Time_Score', within='Time', between='Group', subject='Subject')
print("Mixed ANOVA:", mixed_anova)

Mixed ANOVA:         Source          SS  DF1  DF2          MS         F     p-unc  \
0        Group   12.363711    1   98   12.363711  0.152161  0.697326   
1         Time  100.828244    1   98  100.828244  0.922876  0.339085   
2  Interaction    4.879277    1   98    4.879277  0.044660  0.833070   

        np2  eps  
0  0.001550  NaN  
1  0.009329  1.0  
2  0.000456  NaN  


ANCOVA

In [None]:
# statsmodels
ancova = smf.ols('Score ~ Group + Covariate1 + Covariate2', data=data).fit()
ancova_result = sm.stats.anova_lm(ancova, typ=2)

# pingouin
ancova_result = pg.ancova(data=data, dv='Score', between='Group', covar=['Covariate1', 'Covariate2']).round(3) 

print("ANCOVA:", ancova_result)

ANCOVA:        Source        SS  DF      F  p-unc    np2
0       Group    39.250   1  0.454  0.502  0.005
1  Covariate1     0.571   1  0.007  0.935  0.000
2  Covariate2   349.644   1  4.041  0.047  0.040
3    Residual  8307.055  96    NaN    NaN    NaN


Linear Mixed Model (LMM)

In [None]:
lmm = smf.mixedlm('Score ~ Group + Covariate1 + Covariate2', data, groups=data['Subject']).fit()
print("Linear Mixed Model:", lmm.summary())

# longitudinal data:
lmm_long = smf.mixedlm('Time_Score ~ Time * Group + Covariate1 + Covariate2', data_long, groups=data_long['Subject']).fit()
print("Linear Mixed Model:", lmm_long.summary())



Linear Mixed Model:          Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Score    
No. Observations: 100     Method:             REML     
No. Groups:       100     Scale:              43.2659  
Min. group size:  1       Log-Likelihood:     -364.2082
Max. group size:  1       Converged:          Yes      
Mean group size:  1.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
Intercept    62.630    4.745 13.198 0.000 53.329 71.930
Group[T.B]   -1.267    1.881 -0.674 0.501 -4.954  2.420
Covariate1    0.005    0.016  0.314 0.754 -0.026  0.036
Covariate2   -0.131    0.065 -2.027 0.043 -0.258 -0.004
Group Var    43.266                                    





Linear Mixed Model:                Mixed Linear Model Regression Results
Model:                 MixedLM    Dependent Variable:    Time_Score
No. Observations:      200        Method:                REML      
No. Groups:            100        Scale:                 96.2321   
Min. group size:       2          Log-Likelihood:        -736.7335 
Max. group size:       2          Converged:             Yes       
Mean group size:       2.0                                         
-------------------------------------------------------------------
                         Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
Intercept                51.250    6.309  8.124 0.000 38.885 63.615
Time[T.Time2]             1.772    2.091  0.847 0.397 -2.327  5.872
Group[T.B]               -0.179    1.980 -0.090 0.928 -4.060  3.702
Time[T.Time2]:Group[T.B] -0.629    2.795 -0.225 0.822 -6.107  4.848
Covariate1                0.003    0.046  0



Generalized Linear Mixed Model (GLMM)

In [35]:
glmm = smf.glm('Score ~ Group + Covariate1 + Covariate2', data, groups=data['Subject'], family=sm.families.Binomial()).fit()
print("Generalized Linear Mixed Model:", glmm.summary())

# longitudinal data:
glmm_long = smf.glm('Time_Score ~ Time * Group + Covariate1 + Covariate2', data_long, groups=data_long['Subject'], family=sm.families.Binomial()).fit()
print("Generalized Linear Mixed Model:", glmm_long.summary())

Generalized Linear Mixed Model:                  Generalized Linear Model Regression Results                  
Dep. Variable:                  Score   No. Observations:                  100
Model:                            GLM   Df Residuals:                       96
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:             2.2329e+05
Date:                 Do, 13 Mär 2025   Deviance:                   3.9005e+05
Time:                        19:02:29   Pearson chi2:                 1.11e+21
No. Iterations:                     2   Pseudo R-squ. (CS):                nan
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   4.175e+1

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  n * np.log(1 - mu + 1e-20)) * var_weights
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  n * np.log(1 - mu + 1e-20)) * var_weights
