In [1]:
import statsmodels.api as sm #Import stasts package
import numpy as np
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm

In [2]:
# Load the cleaned accuracy table.
# The first CSV column appears to be a saved row index (it is read back as
# "Unnamed: 0" holding 0, 1, 2, ...), so use it as the index rather than
# carrying it along as a data column.
df = pd.read_csv("Cleaned_all_accuracy_for _parasitemia_Anemia_stats.csv", index_col=0)
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Anemic_condition,Parasitemia,Model,Accuracy
0,Non_anemic,6PRS,Model 1 (High Contrast),100.0
1,Non_anemic,0.1PRS,Model 1 (High Contrast),100.0
2,Non_anemic,0.002PRS,Model 1 (High Contrast),91.0
3,Non_anemic,0.00003PRS,Model 1 (High Contrast),92.0
4,Non_anemic,All Positive,Model 1 (High Contrast),91.0
5,Moderate_Anemia,6PRS,Model 1 (High Contrast),100.0
6,Moderate_Anemia,0.1PRS,Model 1 (High Contrast),100.0
7,Moderate_Anemia,0.002PRS,Model 1 (High Contrast),95.0
8,Moderate_Anemia,0.00003PRS,Model 1 (High Contrast),95.0
9,Moderate_Anemia,All Positive,Model 1 (High Contrast),90.0


# Let's investigate whether anemia affects the performance of MIRs-ML in malaria detection

In [6]:
# First, convert 'Anemic_condition', 'Parasitemia' and 'Model' to categorical variables
df['Anemic_condition'] = pd.Categorical(df['Anemic_condition'])
df['Parasitemia'] = pd.Categorical(df['Parasitemia'])
df['Model'] = pd.Categorical(df['Model'])

# Put 'Non_anemic' first: drop_first=True below drops the first category,
# which makes it the reference level of the dummy coding
df['Anemic_condition'] = df['Anemic_condition'].cat.reorder_categories(['Non_anemic', 'Moderate_Anemia', 'Severe_Anemia'])

# Dummy-code 'Anemic_condition' and prepend an intercept column (the design matrix).
# dtype=float keeps every column numeric: pandas >= 2.0 returns boolean dummies
# by default, and a mixed bool/float frame is not a valid numeric exog for GLM.
design_matrix = sm.add_constant(
    pd.get_dummies(df[['Anemic_condition']], drop_first=True, prefix='Anemic', dtype=float)
)

# Fit a GLM with the Gamma family (no link is passed, so the family default is used)
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())
result = model.fit()

# Print the fitted-model summary (coefficients, residual deviance, scale, ...)
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Accuracy   No. Observations:                   45
Model:                            GLM   Df Residuals:                       42
Model Family:                   Gamma   Df Model:                            2
Link Function:          inverse_power   Scale:                        0.036790
Method:                          IRLS   Log-Likelihood:                -183.99
Date:                Thu, 18 Jan 2024   Deviance:                       1.5951
Time:                        04:38:41   Pearson chi2:                     1.55
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0



In [7]:
# Exponentiated coefficients for each anaemic class, with 'Non_anemic' as the reference group
def my_model_summary(result, print_summary=True, alpha=0.05):
    """Print a fitted model's summary and a table of exponentiated coefficients.

    Parameters
    ----------
    result : fitted results object
        Must expose ``params``, ``pvalues`` and ``conf_int(alpha=...)``, plus
        ``summary2()`` when ``print_summary`` is true.
    print_summary : bool, default True
        When true, ``result.summary2()`` is printed before the table.
    alpha : float, default 0.05
        Significance level forwarded to ``result.conf_int``; the printed
        bounds are the exponentiated ends of that interval.

    Returns
    -------
    None
        The table is printed, not returned.

    Notes
    -----
    NOTE(review): the columns are labelled as odds ratios ('ORt'), but
    exp(coef) is an odds ratio only for a logit-link model. The summaries
    printed in this notebook report ``Link Function: inverse_power``, so for
    those fits the values are just exp() of coefficients on the 1/mean scale.
    Confirm the intended interpretation before reporting them.
    """
    if print_summary:
        print(result.summary2())

    coefs = result.params
    # Use the model's own interval instead of a hard-coded z = 1.96, so the
    # bounds match the interval columns of the summary and follow `alpha`.
    bounds = np.asarray(result.conf_int(alpha=alpha), dtype=float)

    all_SS = pd.DataFrame(
        {
            'ORt': np.exp(np.asarray(coefs, dtype=float)),
            'Lowert': np.exp(bounds[:, 0]),
            'Uppert': np.exp(bounds[:, 1]),
            'P-value': np.asarray(result.pvalues, dtype=float),
        },
        # Keep the coefficient names as row labels when params is a Series
        index=getattr(coefs, 'index', None),
    )

    # Display exponentiated coefficients, their interval and the p-values
    print(all_SS)

# Print the summary and the exponentiated-coefficient table for the current
# GLM fit held in `result` (the anaemic-condition model)
my_model_summary(result)


                 Results: Generalized linear model
Model:                GLM                AIC:              373.9737 
Link Function:        inverse_power      BIC:              -158.2848
Dependent Variable:   Accuracy           Log-Likelihood:   -183.99  
Date:                 2024-01-18 04:38   LL-Null:          -183.99  
No. Observations:     45                 Deviance:         1.5951   
Df Model:             2                  Pearson chi2:     1.55     
Df Residuals:         42                 Scale:            0.036790 
Method:               IRLS                                          
--------------------------------------------------------------------
                       Coef.  Std.Err.    z    P>|z|   [0.025 0.975]
--------------------------------------------------------------------
const                  0.0128   0.0006 20.1920 0.0000  0.0116 0.0141
Anemic_Moderate_Anemia 0.0001   0.0009  0.0821 0.9346 -0.0017 0.0018
Anemic_Severe_Anemia   0.0001   0.0009  0.0664 0.947

In [8]:
# Tukey HSD post-hoc test: pairwise comparison of mean accuracy between
# anaemic conditions at a family-wise error rate of 0.05
mc = pairwise_tukeyhsd(
    endog=df['Accuracy'],
    groups=df['Anemic_condition'],
    alpha=0.05,
)
print(mc.summary())

        Multiple Comparison of Means - Tukey HSD, FWER=0.05         
     group1         group2    meandiff p-adj  lower    upper  reject
--------------------------------------------------------------------
Moderate_Anemia    Non_anemic   0.4473   0.9 -12.7853   13.68  False
Moderate_Anemia Severe_Anemia   0.0853   0.9 -13.1473  13.318  False
     Non_anemic Severe_Anemia   -0.362   0.9 -13.5947 12.8707  False
--------------------------------------------------------------------


# Now investigate the impact of parasitemia independently

In [9]:
# Re-order the parasitemia classes; '6PRS' comes first, so drop_first=True
# below makes it the reference level of the dummy coding
df['Parasitemia'] = df['Parasitemia'].cat.reorder_categories(['6PRS', '0.1PRS', '0.002PRS', '0.00003PRS', "All Positive"])

# Dummy-code 'Parasitemia' and prepend an intercept column.
# dtype=float keeps every column numeric: pandas >= 2.0 returns boolean dummies
# by default, and a mixed bool/float frame is not a valid numeric exog for GLM.
design_matrix = sm.add_constant(
    pd.get_dummies(df[['Parasitemia']], drop_first=True, prefix='Parasitemia', dtype=float)
)

# Fit a GLM with the Gamma family (no link is passed, so the family default is used)
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())
result = model.fit()

# Print the fitted-model summary (coefficients, residual deviance, scale, ...)
print(result.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:               Accuracy   No. Observations:                   45
Model:                            GLM   Df Residuals:                       40
Model Family:                   Gamma   Df Model:                            4
Link Function:          inverse_power   Scale:                        0.038444
Method:                          IRLS   Log-Likelihood:                -183.93
Date:                Thu, 18 Jan 2024   Deviance:                       1.5860
Time:                        04:38:41   Pearson chi2:                     1.54
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   



In [10]:
# Exponentiated coefficients for each parasitemia class. The reference group is
# '6PRS' (the first category after the re-ordering, dropped by drop_first=True),
# not a 0% class.

def my_model_summary(result, print_summary=True, alpha=0.05):
    """Print a fitted model's summary and a table of exponentiated coefficients.

    Parameters
    ----------
    result : fitted results object
        Must expose ``params``, ``pvalues`` and ``conf_int(alpha=...)``, plus
        ``summary2()`` when ``print_summary`` is true.
    print_summary : bool, default True
        When true, ``result.summary2()`` is printed before the table.
    alpha : float, default 0.05
        Significance level forwarded to ``result.conf_int``; the printed
        bounds are the exponentiated ends of that interval.

    Returns
    -------
    None
        The table is printed, not returned.

    Notes
    -----
    NOTE(review): the columns are labelled as odds ratios ('ORt'), but
    exp(coef) is an odds ratio only for a logit-link model. The summaries
    printed in this notebook report ``Link Function: inverse_power``, so for
    those fits the values are just exp() of coefficients on the 1/mean scale.
    Confirm the intended interpretation before reporting them.
    """
    if print_summary:
        print(result.summary2())

    coefs = result.params
    # Use the model's own interval instead of a hard-coded z = 1.96, so the
    # bounds match the interval columns of the summary and follow `alpha`.
    bounds = np.asarray(result.conf_int(alpha=alpha), dtype=float)

    all_SS = pd.DataFrame(
        {
            'ORt': np.exp(np.asarray(coefs, dtype=float)),
            'Lowert': np.exp(bounds[:, 0]),
            'Uppert': np.exp(bounds[:, 1]),
            'P-value': np.asarray(result.pvalues, dtype=float),
        },
        # Keep the coefficient names as row labels when params is a Series
        index=getattr(coefs, 'index', None),
    )

    # Display exponentiated coefficients, their interval and the p-values
    print(all_SS)

# Call the function (not a loop) on the current GLM fit held in `result`
# (the parasitemia model)
my_model_summary(result)


                   Results: Generalized linear model
Model:                 GLM                 AIC:               377.8639 
Link Function:         inverse_power       BIC:               -150.6805
Dependent Variable:    Accuracy            Log-Likelihood:    -183.93  
Date:                  2024-01-18 04:38    LL-Null:           -184.05  
No. Observations:      45                  Deviance:          1.5860   
Df Model:              4                   Pearson chi2:      1.54     
Df Residuals:          40                  Scale:             0.038444 
Method:                IRLS                                            
-----------------------------------------------------------------------
                          Coef.  Std.Err.    z    P>|z|   [0.025 0.975]
-----------------------------------------------------------------------
const                     0.0127   0.0008 15.3006 0.0000  0.0111 0.0143
Parasitemia_0.1PRS        0.0004   0.0012  0.3310 0.7406 -0.0019 0.0027
Parasitemia

In [11]:
# Tukey HSD post-hoc test: pairwise comparison of mean accuracy between
# parasitemia classes at a family-wise error rate of 0.05
mc = pairwise_tukeyhsd(
    endog=df['Accuracy'],
    groups=df['Parasitemia'],
    alpha=0.05,
)
print(mc.summary())

     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
  group1      group2    meandiff p-adj  lower    upper  reject
--------------------------------------------------------------
0.00003PRS     0.002PRS  -1.2433   0.9 -21.7637  19.277  False
0.00003PRS       0.1PRS  -2.7111   0.9 -23.2315 17.8092  False
0.00003PRS         6PRS  -0.3356   0.9 -20.8559 20.1848  False
0.00003PRS All Positive     -2.6   0.9 -23.1204 17.9204  False
  0.002PRS       0.1PRS  -1.4678   0.9 -21.9881 19.0526  False
  0.002PRS         6PRS   0.9078   0.9 -19.6126 21.4281  False
  0.002PRS All Positive  -1.3567   0.9  -21.877 19.1637  False
    0.1PRS         6PRS   2.3756   0.9 -18.1448 22.8959  False
    0.1PRS All Positive   0.1111   0.9 -20.4092 20.6315  False
      6PRS All Positive  -2.2644   0.9 -22.7848 18.2559  False
--------------------------------------------------------------


# Now let's investigate the effect of the training approach (Model)

In [12]:
# Dummy-code 'Model' (the first category is dropped and acts as the reference
# level) and prepend an intercept column.
# dtype=float keeps every column numeric: pandas >= 2.0 returns boolean dummies
# by default, and a mixed bool/float frame is not a valid numeric exog for GLM.
design_matrix = sm.add_constant(pd.get_dummies(df[['Model']], drop_first=True, dtype=float))

# Fit a GLM with the Gamma family (no link is passed, so the family default is used)
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())
result = model.fit()

# Print the fitted-model summary (coefficients, residual deviance, scale, ...)
print(result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:               Accuracy   No. Observations:                   45
Model:                            GLM   Df Residuals:                       42
Model Family:                   Gamma   Df Model:                            2
Link Function:          inverse_power   Scale:                       0.0064282
Method:                          IRLS   Log-Likelihood:                -144.19
Date:                Thu, 18 Jan 2024   Deviance:                      0.27318
Time:                        04:38:41   Pearson chi2:                    0.270
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons



In [13]:
# Exponentiated coefficients for the training approaches, with the high-contrast
# training approach as the reference group
def my_model_summary(result, print_summary=True, alpha=0.05):
    """Print a fitted model's summary and a table of exponentiated coefficients.

    Parameters
    ----------
    result : fitted results object
        Must expose ``params``, ``pvalues`` and ``conf_int(alpha=...)``, plus
        ``summary2()`` when ``print_summary`` is true.
    print_summary : bool, default True
        When true, ``result.summary2()`` is printed before the table.
    alpha : float, default 0.05
        Significance level forwarded to ``result.conf_int``; the printed
        bounds are the exponentiated ends of that interval.

    Returns
    -------
    None
        The table is printed, not returned.

    Notes
    -----
    NOTE(review): the columns are labelled as odds ratios ('ORt'), but
    exp(coef) is an odds ratio only for a logit-link model. The summaries
    printed in this notebook report ``Link Function: inverse_power``, so for
    those fits the values are just exp() of coefficients on the 1/mean scale.
    Confirm the intended interpretation before reporting them.
    """
    if print_summary:
        print(result.summary2())

    coefs = result.params
    # Use the model's own interval instead of a hard-coded z = 1.96, so the
    # bounds match the interval columns of the summary and follow `alpha`.
    bounds = np.asarray(result.conf_int(alpha=alpha), dtype=float)

    all_SS = pd.DataFrame(
        {
            'ORt': np.exp(np.asarray(coefs, dtype=float)),
            'Lowert': np.exp(bounds[:, 0]),
            'Uppert': np.exp(bounds[:, 1]),
            'P-value': np.asarray(result.pvalues, dtype=float),
        },
        # Keep the coefficient names as row labels when params is a Series
        index=getattr(coefs, 'index', None),
    )

    # Display exponentiated coefficients, their interval and the p-values
    print(all_SS)

# Call the function on the current GLM fit held in `result`
# (the training-approach model)
my_model_summary(result)

                       Results: Generalized linear model
Model:                     GLM                   AIC:                 294.3831 
Link Function:             inverse_power         BIC:                 -159.6066
Dependent Variable:        Accuracy              Log-Likelihood:      -144.19  
Date:                      2024-01-18 04:38      LL-Null:             -247.03  
No. Observations:          45                    Deviance:            0.27318  
Df Model:                  2                     Pearson chi2:        0.270    
Df Residuals:              42                    Scale:               0.0064282
Method:                    IRLS                                                
-------------------------------------------------------------------------------
                                   Coef.  Std.Err.    z    P>|z|  [0.025 0.975]
-------------------------------------------------------------------------------
const                              0.0105   0.0002 48.3060 0.00

In [14]:
# Tukey HSD post-hoc test: pairwise comparison of mean accuracy between
# training approaches at a family-wise error rate of 0.05
mc = pairwise_tukeyhsd(
    endog=df['Accuracy'],
    groups=df['Model'],
    alpha=0.05,
)
print(mc.summary())

                       Multiple Comparison of Means - Tukey HSD, FWER=0.05                       
           group1                       group2            meandiff p-adj  lower    upper   reject
-------------------------------------------------------------------------------------------------
     Model 1 (High Contrast) Model 2 (All concentrations)  -19.934 0.001 -24.8209 -15.0471   True
     Model 1 (High Contrast)       Model 3 (Low Contrast) -32.5333 0.001 -37.4203 -27.6464   True
Model 2 (All concentrations)       Model 3 (Low Contrast) -12.5993 0.001 -17.4863  -7.7124   True
-------------------------------------------------------------------------------------------------


## Now let's investigate, in general, which factor influences the performance of MIRs-ML: parasitemia, anaemic condition, or training approach

* I will use the chi-square statistic and the p-value to draw statistical conclusions
* Mainly by applying the Wald test to the GLM results

In [15]:
# For each factor: fit a one-factor Gamma GLM, then run a joint Wald test that
# all non-intercept coefficients are zero. np.eye(k)[1:] is the restriction
# matrix selecting every coefficient except the first (the constant).
# dtype=float keeps the dummy columns numeric: pandas >= 2.0 returns boolean
# dummies by default, and a mixed bool/float frame is not a valid numeric exog.

# Start with the anaemic conditions
design_matrix = sm.add_constant(pd.get_dummies(df[['Anemic_condition']], drop_first=True, dtype=float))

# Fit GLM with Gamma family
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())
result = model.fit()

# Perform the Wald test on the anaemic-condition coefficients
wald_test_anemic_condition = result.wald_test(np.eye(len(design_matrix.columns))[1:])  # Exclude the intercept
print(f'Wald test for Anemic_condition:\n{wald_test_anemic_condition}')

# Now for parasitemia
design_matrix = sm.add_constant(pd.get_dummies(df[['Parasitemia']], drop_first=True, dtype=float))

# Fit GLM with Gamma family
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())
result = model.fit()
wald_test_parasitemia = result.wald_test(np.eye(len(design_matrix.columns))[1:])  # Exclude the intercept
print(f'Wald test for parasitemia:\n{wald_test_parasitemia}')

# Finally for the training approach (Model)
design_matrix = sm.add_constant(pd.get_dummies(df[['Model']], drop_first=True, dtype=float))
model = sm.GLM(df['Accuracy'], design_matrix, family=sm.families.Gamma())  # Fit GLM with Gamma family
result = model.fit()
wald_test_model = result.wald_test(np.eye(len(design_matrix.columns))[1:])  # Exclude the intercept
print(f'Wald test for Model:\n{wald_test_model}')




Wald test for Anemic_condition:
<Wald test (chi2): statistic=[[0.00760325]], p-value=0.9962055941877578, df_denom=2>
Wald test for parasitemia:
<Wald test (chi2): statistic=[[0.24275983]], p-value=0.9932032820288033, df_denom=4>
Wald test for Model:
<Wald test (chi2): statistic=[[201.61599705]], p-value=1.658221376366497e-44, df_denom=2>


