In [8]:
from module4 import *
from scipy.stats import f_oneway,t
import numpy as np
import pandas as pd
def print_hypothesis_anova():
    """Show the null and alternative hypotheses of the one-way ANOVA.

    Each statement is handed to ``print_latex`` in order (H_0 first, then
    H_A).  ``print_latex`` is not defined in this cell; presumably it comes
    from the ``module4`` wildcard import -- TODO confirm.
    """
    statements = (
        "$H_0 : $ All means are equal",
        "$H_A : $ means are not all equal",
    )
    for statement in statements:
        print_latex(statement)


def get_lsd(levels, mse, df_residuals, alpha=0.05,df=None):
    """Run a least significant difference (LSD) test on every pair of groups.

    For each pair (A, B) the threshold is
    ``LSD = t_critical * sqrt(mse * (1/nA + 1/nB))`` where ``t_critical`` is
    the two-sided critical value ``t.ppf(1 - alpha/2, df_residuals)``.  The
    pair is reported as significantly different when ``|mean_A - mean_B|``
    exceeds the LSD.

    Parameters
    ----------
    levels : dict or DataFrame-like
        Group name -> observations.  Only used when ``df`` is None.  Groups
        may have different numbers of observations.
    mse : float
        Mean square error (within groups) from the ANOVA.
    df_residuals : int
        Residual degrees of freedom used for the t critical value.
    alpha : float, default 0.05
        Significance level.
    df : pandas.DataFrame, optional
        One column per group; NaN entries are ignored.  When given it is used
        instead of ``levels``.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns
        ``['Group A', 'Group B', 'LSD', '|xA-xB|', 'Conclusion']``.
        The same table is also printed.
    """
    t_critical = t.ppf(1 - alpha/2, df_residuals)
    print(f"t-critical value for alpha={alpha}, df={df_residuals} is {t_critical:.4f}")
    print(f"Mean Square Error = {mse:.4f}")
    if df is None:
        if isinstance(levels, dict):
            # pd.DataFrame(dict_of_lists) raises when the lists have different
            # lengths; wrapping each group in a Series pads the shorter ones
            # with NaN, which the dropna() below then ignores.
            df = pd.DataFrame({name: pd.Series(values) for name, values in levels.items()})
        else:
            df = pd.DataFrame(levels)
    groups = list(df.columns)

    # Size and mean of each group, computed once rather than once per pair.
    group_stats = {}
    for name in groups:
        observed = df[name].dropna()
        group_stats[name] = (len(observed), observed.mean())

    lsd_results = []
    for i in range(len(groups)):
        for j in range(i+1, len(groups)):
            groupA, groupB = groups[i], groups[j]
            nA, xA = group_stats[groupA]
            nB, xB = group_stats[groupB]
            lsd = t_critical * (mse * (1/nA + 1/nB))**0.5

            diff_means = abs(xA - xB)
            conclusion = "significant difference" if lsd < diff_means else "no significant difference"

            lsd_results.append([groupA, groupB, lsd, diff_means, conclusion])

    lsd_df = pd.DataFrame(lsd_results, columns=['Group A', 'Group B', 'LSD', '|xA-xB|', 'Conclusion'])

    print("\nHere are the results of the LSD test for all pairs of groups:")
    print(lsd_df)
    return lsd_df

def get_anova(levels,alpha=0.05):
    """Run a one-way ANOVA and, if it rejects H_0, the pairwise LSD test.

    Parameters
    ----------
    levels : dict or pandas.DataFrame
        Group name -> observations.  For a DataFrame each column is a group
        and NaN entries are dropped, so groups may end up with different
        sizes.
    alpha : float, default 0.05
        Significance level for the ANOVA and for the LSD follow-up.

    Returns
    -------
    tuple
        ``(F, p, mse, df_residuals, reject_null)`` where ``mse`` is the
        within-group mean square, ``df_residuals`` is the total number of
        observations minus the number of groups, and ``reject_null`` is True
        when ``p < alpha``.
    """
    # isinstance also accepts DataFrame subclasses, unlike a type() equality check.
    if isinstance(levels, pd.DataFrame):
        levels = {col: levels[col].dropna().values for col in levels.columns}
    print(levels)

    # Work on float arrays so list inputs and array inputs behave the same.
    samples = {name: np.asarray(values, dtype=float) for name, values in levels.items()}

    F,p=f_oneway(*samples.values())
    print(f"F={F:.4f}, p={p:.4f}")

    if p < alpha:
        print(f"Since, {p:.4f} < {alpha}, we reject the null hypothesis that all means are equal.")
        reject_null = True
    else:
        print(f"Since, {p:.4f} > {alpha}, we fail to reject the null hypothesis that all means are equal.")
        reject_null = False

    for name, sample in samples.items():
        print(f"Participants in the {name} group had a mean of {np.mean(sample):.2f} and a standard deviation of {np.std(sample,ddof=1):.2f}.")

    # Mean square error: squared deviations from each group's own mean,
    # pooled over all groups and divided by the residual degrees of freedom.
    ss_residuals = sum(np.sum((sample - sample.mean())**2) for sample in samples.values())
    n_total = sum(sample.size for sample in samples.values())
    df_residuals = n_total - len(samples)
    mse = ss_residuals / df_residuals

    if reject_null:
        print("\nSince we have rejected the null hypothesis in the ANOVA test, let's perform the LSD test.")
        # Groups may have different sizes (e.g. after dropna on a DataFrame);
        # building the frame from Series pads the shorter groups with NaN
        # instead of raising on unequal lengths.
        padded = pd.DataFrame({name: pd.Series(sample) for name, sample in samples.items()})
        get_lsd(levels, mse, df_residuals, alpha, df=padded)
    else:
        print("\nSince we have failed to reject the null hypothesis in the ANOVA test, there's no need to perform the LSD test.")

    return F, p, mse, df_residuals, reject_null

# Observations for each diet group, keyed by group name.
levels = {
    "Low-Calorie": [8, 9, 6, 7, 3],
    "Low-Fat": [2, 4, 3, 5, 1],
    "Low-Carbohydrate": [3, 5, 4, 2, 3],
    "Control": [2, 2, -1, 0, 3],
}
# Only F and p are named here; the remaining return values land in `extra`.
F, p, *extra = get_anova(levels, alpha=0.05)

{'Low-Calorie': [8, 9, 6, 7, 3], 'Low-Fat': [2, 4, 3, 5, 1], 'Low-Carbohydrate': [3, 5, 4, 2, 3], 'Control': [2, 2, -1, 0, 3]}
F=8.5593, p=0.0013
Since, 0.0013 < 0.05, we reject the null hypothesis that all means are equal.
Participants in the Low-Calorie group had a mean of 6.60 and a standard deviation of 2.30.
Participants in the Low-Fat group had a mean of 3.00 and a standard deviation of 1.58.
Participants in the Low-Carbohydrate group had a mean of 3.40 and a standard deviation of 1.14.
Participants in the Control group had a mean of 1.20 and a standard deviation of 1.64.

Since we have rejected the null hypothesis in the ANOVA test, let's perform the LSD test.
t-critical value for alpha=0.05, df=16 is 2.1199
Mean Square Error = 2.9500

Here are the results of the LSD test for all pairs of groups:
            Group A           Group B       LSD  |xA-xB|  \
0       Low-Calorie           Low-Fat  2.302807      3.6   
1       Low-Calorie  Low-Carbohydrate  2.302807      3.2   
2    

In [9]:
# Observations for each bone-density group, keyed by group name.
levels = {
    "Normal Bone Density": [1200, 1000, 980, 900, 750, 800],
    "Osteopenia": [1000, 1100, 700, 800, 500, 700],
    "Osteoporosis": [890, 650, 1100, 900, 400, 350],
}
print_hypothesis_anova()
# Tested at the 1% level; values after F and p are collected in `extra`.
F, p, *extra = get_anova(levels, alpha=0.01)

<IPython.core.display.Latex object>

<IPython.core.display.Latex object>

{'Normal Bone Density': [1200, 1000, 980, 900, 750, 800], 'Osteopenia': [1000, 1100, 700, 800, 500, 700], 'Osteoporosis': [890, 650, 1100, 900, 400, 350]}
F=1.3949, p=0.2782
Since, 0.2782 > 0.01, we fail to reject the null hypothesis that all means are equal.
Participants in the Normal Bone Density group had a mean of 938.33 and a standard deviation of 161.30.
Participants in the Osteopenia group had a mean of 800.00 and a standard deviation of 219.09.
Participants in the Osteoporosis group had a mean of 715.00 and a standard deviation of 299.92.

Since we have failed to reject the null hypothesis in the ANOVA test, there's no need to perform the LSD test.


Steps in doing a least significant difference test (LSD)

Let A and B be the two groups that you are comparing.
Let MSE be the mean square error (also called the mean square within groups), obtained from the ANOVA table.
Let $n_A$ and $n_B$ be the number of elements in groups A and B, respectively.


In [10]:
from scipy.stats import f_oneway, t
import numpy as np
import pandas as pd

# Same diet data as before, this time keeping every value get_anova returns.
levels = {
    "Low-Calorie": [8, 9, 6, 7, 3],
    "Low-Fat": [2, 4, 3, 5, 1],
    "Low-Carbohydrate": [3, 5, 4, 2, 3],
    "Control": [2, 2, -1, 0, 3],
}

# Tabular view of the groups; get_anova below is given the dict, not this frame.
df = pd.DataFrame(levels)
alpha = 0.05
print_hypothesis_anova()
F, p, mse, df_residuals, reject_null = get_anova(levels, alpha)





<IPython.core.display.Latex object>

<IPython.core.display.Latex object>

{'Low-Calorie': [8, 9, 6, 7, 3], 'Low-Fat': [2, 4, 3, 5, 1], 'Low-Carbohydrate': [3, 5, 4, 2, 3], 'Control': [2, 2, -1, 0, 3]}
F=8.5593, p=0.0013
Since, 0.0013 < 0.05, we reject the null hypothesis that all means are equal.
Participants in the Low-Calorie group had a mean of 6.60 and a standard deviation of 2.30.
Participants in the Low-Fat group had a mean of 3.00 and a standard deviation of 1.58.
Participants in the Low-Carbohydrate group had a mean of 3.40 and a standard deviation of 1.14.
Participants in the Control group had a mean of 1.20 and a standard deviation of 1.64.

Since we have rejected the null hypothesis in the ANOVA test, let's perform the LSD test.
t-critical value for alpha=0.05, df=16 is 2.1199
Mean Square Error = 2.9500

Here are the results of the LSD test for all pairs of groups:
            Group A           Group B       LSD  |xA-xB|  \
0       Low-Calorie           Low-Fat  2.302807      3.6   
1       Low-Calorie  Low-Carbohydrate  2.302807      3.2   
2    

In [11]:
# Solved problem 1: one column per group, tested at the 1% level.
df = pd.read_csv('module7_solved_problem1.csv')
print_hypothesis_anova()
F, p, mse, df_residuals, reject_null = get_anova(df, alpha=0.01)


<IPython.core.display.Latex object>

<IPython.core.display.Latex object>

{'Location 1': array([5.7, 6.3, 6.1, 6. , 5.8, 6.2]), 'Location 2': array([6.2, 5.3, 5.7, 6. , 5.2, 5.5]), 'Location 3': array([5.4, 5. , 6. , 5.6, 4.9, 5.2]), 'Location 4': array([3.7, 3.2, 3.9, 4. , 3.5, 3.6])}
F=57.3837, p=0.0000
Since, 0.0000 < 0.01, we reject the null hypothesis that all means are equal.
Participants in the Location 1 group had a mean of 6.02 and a standard deviation of 0.23.
Participants in the Location 2 group had a mean of 5.65 and a standard deviation of 0.39.
Participants in the Location 3 group had a mean of 5.35 and a standard deviation of 0.41.
Participants in the Location 4 group had a mean of 3.65 and a standard deviation of 0.29.

Since we have rejected the null hypothesis in the ANOVA test, let's perform the LSD test.
t-critical value for alpha=0.01, df=20 is 2.8453
Mean Square Error = 0.1147

Here are the results of the LSD test for all pairs of groups:
      Group A     Group B       LSD   |xA-xB|                 Conclusion
0  Location 1  Location 2 

In [12]:
# Solved problem 2: one column per group, tested at the 5% level.
df = pd.read_csv('module7_solved_problem2.csv')
print_hypothesis_anova()
F, p, mse, df_residuals, reject_null = get_anova(df, alpha=0.05)

<IPython.core.display.Latex object>

<IPython.core.display.Latex object>

{'City A': array([2610, 2550, 2480, 2600, 2630]), 'City B': array([2530, 2560, 2580, 2550, 2460]), 'City C': array([2500, 2450, 2550, 2480, 2600])}
F=1.4015, p=0.2838
Since, 0.2838 > 0.05, we fail to reject the null hypothesis that all means are equal.
Participants in the City A group had a mean of 2574.00 and a standard deviation of 60.25.
Participants in the City B group had a mean of 2536.00 and a standard deviation of 46.15.
Participants in the City C group had a mean of 2516.00 and a standard deviation of 59.41.

Since we have failed to reject the null hypothesis in the ANOVA test, there's no need to perform the LSD test.


In [13]:
# Load the exercise data and transpose it so that each "Method" value
# becomes a column (one column per group).
df = (
    pd.read_csv('module7_exercise_problem1.csv')
    .set_index("Method")
    .T
)
print(df)
# The ANOVA call is left disabled, as in the original cell.
# NOTE(review): the groups here have unequal sizes -- presumably why it was
# switched off; confirm before re-enabling.
# print_hypothesis_anova()
# F, p, mse, df_residuals, reject_null=get_anova(df,alpha=0.05)

Method                     A     B     C
Scores on the HLT Test  73.0  54.0  79.0
Unnamed: 2              83.0  74.0  95.0
Unnamed: 3              76.0  71.0  87.0
Unnamed: 4              68.0   NaN   NaN
Unnamed: 5              80.0   NaN   NaN


In [14]:
# Long-format data: one row per diamond with its carat and cut.
df = pd.read_csv('diamonds-carat-cut.csv')
# groupby returns a new object and leaves df unchanged, so a bare
# `df.groupby('cut')` has no effect.  Keep the grouped carat values, one
# array per cut, so they can be used as the groups of an ANOVA.
carat_by_cut = {cut: group['carat'].to_numpy() for cut, group in df.groupby('cut')}
print(df)



    carat        cut
0    0.22       Fair
1    0.86       Fair
2    0.96       Fair
3    0.23       Good
4    0.31       Good
..    ...        ...
95   0.26  Very Good
96   0.26  Very Good
97   0.71  Very Good
98   0.78  Very Good
99   0.73  Very Good

[100 rows x 2 columns]
