In [14]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# `ANOVA`

In [5]:
# 1. Way ANOVA
# generate dummy data
a = [2.3, 3.4, 4.5, 2.3, 3.4]
b = [11.2, 12.2, 13.2, 14.9, 15.0]
c = [22.2, 24.4, 25.6, 28.55,26.5]

# perform one way ANOVA
stats.f_oneway(a, b, c)

# print the results using if else condition
f_statistic, p_val = stats.f_oneway(a, b, c)
if p_val > 0.05:
    print(f'p-value: {p_val}, Means are equal (fail to reject H0)')
else:
    print(f'p-value: {p_val}, Means are not equal (reject H0)')



p-value: 5.70508900807336e-10, Means are not equal (reject H0)


##### ANOVA, 
##### ANOVA is a statistical test that stands for analysis of variance, 
`ANOVA was developed by statistician and evolutionary biologist Ronald Fisher. The idea behind ANOVA is to compare different groups of samples to determine whether there is a significant difference between the groups.
ANOVA is an extension of the t and the z test and was developed to compare more than two groups.
The null hypothesis of ANOVA is that there is no difference between the groups. The alternative hypothesis is that there is a difference between the groups.
ANOVA is an omnibus test, meaning it tests the data as a whole. In other words, it does not tell you which specific group
significantly different from each other; it only tells you that at least two groups were different.`

In [10]:
ferti_1 = [20, 22, 19, 24, 25]
ferti_2 = [28, 30, 27, 26, 29]
ferti_3 = [18, 20, 22, 19, 24]
# perform the one-way ANOVA
f_statistic, p_val = stats.f_oneway(ferti_1, ferti_2, ferti_3)
# print the results using if else condition
if p_val > 0.05:
    print(f'p-value: {p_val}, Means are equal (fail to reject H0)')
else:
    print(f'p-value: {p_val}, Means are not equal (reject H0)')

p-value: 0.0004515404760997283, Means are not equal (reject H0)


In [25]:
# using stats models we are going to apply miltivariate annova
# create a dataframe
df = pd.DataFrame({'fertilizer':['A']*5 + ['B']*5 + ['C']*5,
                   'growth': ferti_1 + ferti_2 + ferti_3})

# fit the model
model = ols('growth ~ fertilizer', data=df).fit()
# perform ANOVA and print the results using if and Else loop
anova_table = sm.stats.anova_lm(model, typ=1)
print(anova_table)
# print the results based on if the p-value is less than 0.05
if anova_table['PR(>F)'][0] < 0.05:
    print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')
else:
    print(f'Fail to reject null hypothesis: The means are equal, as the p-value: {anova_table["PR(>F)"][0]} is greater than 0.05')


              df      sum_sq    mean_sq          F    PR(>F)
fertilizer   2.0  154.533333  77.266667  15.662162  0.000452
Residual    12.0   59.200000   4.933333        NaN       NaN
Reject null hypothesis: The means are not equal, as the p-value: 0.00045154047609972665 is less than 0.05


  if anova_table['PR(>F)'][0] < 0.05:
  print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')


In [30]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Sample data for growth under different fertilizers and sunlight conditions
ferti_1 = [10, 12, 14, 13, 15]
ferti_2 = [20, 22, 21, 19, 18]
ferti_3 = [30, 28, 29, 31, 32]
sunlight_1 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_2 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_3 = ['Low']*5 + ['Medium']*5 + ['High']*5

# Step 1: Create a DataFrame
df = pd.DataFrame({
    'fertilizer': ['A']*15 + ['B']*15 + ['C']*15,
    'growth': ferti_1*3 + ferti_2*3 + ferti_3*3,
    'sunlight': sunlight_1 + sunlight_2 + sunlight_3
})

# Step 2: Fit the model
model = ols('growth ~ fertilizer * sunlight', data=df).fit()

# Step 3: Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=1)

# Step 4: Print the ANOVA table
print(anova_table)

# Step 5: Print the results based on the p-value
if anova_table['PR(>F)'][0] < 0.05:
    print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')
else:
    print(f'Fail to reject null hypothesis: The means are equal, as the p-value: {anova_table["PR(>F)"][0]} is greater than 0.05')

                       df        sum_sq       mean_sq             F  \
fertilizer            2.0  2.238400e+03  1.119200e+03  3.859310e+02   
sunlight              2.0  2.565770e-28  1.282885e-28  4.423742e-29   
fertilizer:sunlight   4.0  1.822269e-28  4.555672e-29  1.570921e-29   
Residual             36.0  1.044000e+02  2.900000e+00           NaN   

                           PR(>F)  
fertilizer           4.801480e-25  
sunlight             1.000000e+00  
fertilizer:sunlight  1.000000e+00  
Residual                      NaN  
Reject null hypothesis: The means are not equal, as the p-value: 4.8014797278584455e-25 is less than 0.05


  if anova_table['PR(>F)'][0] < 0.05:
  print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')


In [33]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Adjusted sample data for growth under different fertilizers, sunlight, and water conditions
ferti_1 = [10, 12, 14, 13, 15]
ferti_2 = [20, 22, 21, 19, 18]
ferti_3 = [30, 28, 29, 31, 32]
sunlight_1 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_2 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_3 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_1 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_2 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_3 = ['Low']*5 + ['Medium']*5 + ['High']*5

# Adjust growth values to create significant differences
growth_values = [
    10, 12, 14, 13, 15,  # Fertilizer A, Low Sunlight, Low Water
    15, 17, 19, 18, 20,  # Fertilizer A, Medium Sunlight, Medium Water
    20, 22, 24, 23, 25,  # Fertilizer A, High Sunlight, High Water
    25, 27, 29, 28, 30,  # Fertilizer B, Low Sunlight, Low Water
    30, 32, 34, 33, 35,  # Fertilizer B, Medium Sunlight, Medium Water
    35, 37, 39, 38, 40,  # Fertilizer B, High Sunlight, High Water
    40, 42, 44, 43, 45,  # Fertilizer C, Low Sunlight, Low Water
    45, 47, 49, 48, 50,  # Fertilizer C, Medium Sunlight, Medium Water
    50, 52, 54, 53, 55   # Fertilizer C, High Sunlight, High Water
]

# Step 1: Create a DataFrame
df = pd.DataFrame({
    'fertilizer': ['A']*15 + ['B']*15 + ['C']*15,
    'growth': growth_values,
    'sunlight': sunlight_1 + sunlight_2 + sunlight_3,
    'water': water_1 + water_2 + water_3
})

# Step 2: Fit the model
model = ols('growth ~ fertilizer * sunlight * water', data=df).fit()

# Step 3: Perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)

# Step 4: Print the ANOVA table
print(anova_table)

# Step 5: Print the results based on the p-value
if anova_table['PR(>F)'][0] < 0.05:
    print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')
else:
    print(f'Fail to reject null hypothesis: The means are equal, as the p-value: {anova_table["PR(>F)"][0]} is greater than 0.05')

                                 sum_sq    df             F        PR(>F)
fertilizer                 6.750000e+03   2.0  9.121622e+02  1.448268e-31
sunlight                   6.197318e-13   2.0  8.374754e-14  1.000000e+00
water                      6.185615e-13   2.0  8.358940e-14  1.000000e+00
fertilizer:sunlight        1.696760e-28   4.0  1.146459e-29  1.000000e+00
fertilizer:water           2.277112e-29   4.0  1.538589e-30  1.000000e+00
sunlight:water             5.765313e+01   4.0  3.895482e+00  2.941197e-02
fertilizer:sunlight:water  5.206482e-28   8.0  1.758947e-29  1.000000e+00
Residual                   1.332000e+02  36.0           NaN           NaN
Reject null hypothesis: The means are not equal, as the p-value: 1.4482683847998476e-31 is less than 0.05


  if anova_table['PR(>F)'][0] < 0.05:
  print(f'Reject null hypothesis: The means are not equal, as the p-value: {anova_table["PR(>F)"][0]} is less than 0.05')


In [34]:
# Interpretation
# In the output, you'll see p-values for:
# • The main effects of each factor (Fertilizer, Sunlight, Watering)
# • The interaction effects between two factors (e.g., Fertilizer:Sunlight)
# • The interaction effect among all three factors (Fertilizer:Sunlight:Watering)
# A p-value less than 0.05 typically suggests a statistically significant effect. However, interpreting ANOVA results can be complex, especially with interactions. You should consider the practical significance and the context of your experiment alongside the statistical result
# abc
# Remember, ANOVA makes certain assumptions (normality, homogeneity of variance, and ind DEOSuld be tested before running the analysis.
# codani.cs

In [43]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison

# Adjusted sample data for growth under different fertilizers, sunlight, and water conditions
ferti_1 = [10, 12, 14, 13, 15]
ferti_2 = [20, 22, 21, 19, 18]
ferti_3 = [30, 28, 29, 31, 32]
sunlight_1 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_2 = ['Low']*5 + ['Medium']*5 + ['High']*5
sunlight_3 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_1 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_2 = ['Low']*5 + ['Medium']*5 + ['High']*5
water_3 = ['Low']*5 + ['Medium']*5 + ['High']*5

# Adjust growth values to create significant differences
growth_values = [
    10, 12, 14, 13, 15,  # Fertilizer A, Low Sunlight, Low Water
    15, 17, 19, 18, 20,  # Fertilizer A, Medium Sunlight, Medium Water
    20, 22, 24, 23, 25,  # Fertilizer A, High Sunlight, High Water
    25, 27, 29, 28, 30,  # Fertilizer B, Low Sunlight, Low Water
    30, 32, 34, 33, 35,  # Fertilizer B, Medium Sunlight, Medium Water
    35, 37, 39, 38, 40,  # Fertilizer B, High Sunlight, High Water
    40, 42, 44, 43, 45,  # Fertilizer C, Low Sunlight, Low Water
    45, 47, 49, 48, 50,  # Fertilizer C, Medium Sunlight, Medium Water
    50, 52, 54, 53, 55   # Fertilizer C, High Sunlight, High Water
]

# Step 1: Create a DataFrame
df = pd.DataFrame({
    'fertilizer': ['A']*15 + ['B']*15 + ['C']*15,
    'growth': growth_values,
    'sunlight': sunlight_1 + sunlight_2 + sunlight_3,
    'water': water_1 + water_2 + water_3
})

# Step 2: Fit the model
model = ols('growth ~ fertilizer', data=df).fit()

# Step 3: Perform Tukey's HSD test for fertilizer
mc_fertilizer = MultiComparison(df['growth'], df['fertilizer'])
result_fertilizer = mc_fertilizer.tukeyhsd()

# Step 4: Print the results for fertilizer
print("Tukey's HSD test for fertilizer:")
print(result_fertilizer.summary())

# Step 5: Perform Tukey's HSD test for sunlight
mc_sunlight = MultiComparison(df['growth'], df['sunlight'])
result_sunlight = mc_sunlight.tukeyhsd()

# Step 6: Print the results for sunlight
print("\nTukey's HSD test for sunlight:")
print(result_sunlight.summary())

# Step 7: Perform Tukey's HSD test for water
mc_water = MultiComparison(df['growth'], df['water'])
result_water = mc_water.tukeyhsd()

# Step 8: Print the results for water
print("\nTukey's HSD test for water:")
print(result_water.summary())

Tukey's HSD test for fertilizer:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
     A      B     15.0   0.0 10.9319 19.0681   True
     A      C     30.0   0.0 25.9319 34.0681   True
     B      C     15.0   0.0 10.9319 19.0681   True
---------------------------------------------------

Tukey's HSD test for sunlight:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
  High    Low    -10.0 0.0941 -21.3568  1.3568  False
  High Medium     -5.0 0.5379 -16.3568  6.3568  False
   Low Medium      5.0 0.5379  -6.3568 16.3568  False
-----------------------------------------------------

Tukey's HSD test for water:
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
  