In [1]:
import pandas as pd

# The data as a list of lists
data = [
    ['male', 'C', 12.40],
    ['female', 'A', 7.69],
    ['male', 'C', 14.01],
    ['female', 'A', 9.69],
    ['male', 'C', 11.65],
    ['female', 'A', 8.89],
    ['female', 'A', 6.94],
    ['female', 'A', 2.13],
    ['female', 'A', 7.26],
    ['female', 'A', 5.87],
    ['male', 'B', 12.93],
    ['female', 'C', 12.19],
    ['female', 'A', 7.20],
    ['male', 'C', 13.88],
    ['male', 'A', 8.18],
    ['male', 'B', 16.64],
    ['female', 'C', 9.41],
    ['male', 'C', 11.21],
    ['female', 'B', 8.35],
    ['male', 'A', 7.24],
    ['female', 'A', 6.81],
    ['male', 'B', 9.81],
    ['female', 'A', 6.67],
    ['female', 'A', 6.98],
    ['female', 'A', 7.07],
    ['female', 'C', 2.40],
    ['male', 'B', 7.84],
    ['female', 'B', 3.84],
    ['male', 'B', 9.42],
    ['male', 'A', 7.00],
    ['male', 'A', 7.00],
    ['female', 'A', 5.00],
    ['male', 'A', 8.00]
]

# Create the DataFrame
df = pd.DataFrame(data, columns=['Gender', 'Group', 'Value'])

# Convert 'Value' column to numeric type
df['Value'] = pd.to_numeric(df['Value'])

# Display the first few rows of the DataFrame
print(df.head())

# Display basic information about the DataFrame
print(df.info())

# Display summary statistics of the DataFrame
print(df.describe())


   Gender Group  Value
0    male     C  12.40
1  female     A   7.69
2    male     C  14.01
3  female     A   9.69
4    male     C  11.65
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  33 non-null     object 
 1   Group   33 non-null     object 
 2   Value   33 non-null     float64
dtypes: float64(1), object(2)
memory usage: 924.0+ bytes
None
           Value
count  33.000000
mean    8.533333
std     3.253118
min     2.130000
25%     6.980000
50%     7.840000
75%     9.810000
max    16.640000


In [None]:
from utilities import group_by2factors
import statistics as stat

d=group_by2factors(df["Gender"],df["Group"],df["Value"])

In [41]:
from scipy.stats import f as f_stat
from prettytable import PrettyTable as PT
import statistics as stat
######## Own Modules ##########
from utilities import group_by2factors

def two_way_anova(data,factor_listA=None,
factor_listB=None, print_out=False,name_a="A",name_b="B"):
    """
    The data input should be a 3D list, the first two dims are the
    two factors. If the factor_listA and B are inputted, the data should be a 
    1D list.
    """
    if factor_listA is not None and factor_listB is not None:
        d=group_by2factors(factor_listA,factor_listB,data)
    else:
        d=data
    flat_list=[k for i in d for j in i for k in j]
    n=len(flat_list)
    a=len(d)
    b=len(d[0])
    y_bar_a=[stat.mean([k for j in i for k in j]) for i in d]
    y_bar_b=[stat.mean([k for j in range(a) for k in d[j][i]])
            for i in range(b)]
    n_a=[len([k for j in i for k in j]) for i in d]
    n_b=[len([k for j in range(a) for k in d[j][i]])
            for i in range(b)]
    y_bar=stat.mean(flat_list)
    SSE=sum([sum((x-stat.mean(k))**2 for x in k) for j in d for k in j])
    SST=sum((x-y_bar)**2 for j in d for k in j for x in k)
#     SSA=n*b*sum((i-y_bar)**2 for i in y_bar_a)
    SSA=sum(j*(i-y_bar)**2 for i,j in zip(y_bar_a,n_a))
#     SSB=n*a*sum((i-y_bar)**2 for i in y_bar_b)
    SSB=sum(j*(i-y_bar)**2 for i,j in zip(y_bar_b,n_b))
    Re_SS_B=sum([(k-y_bar_b[bi])**2 for ai in range(a)
                for bi in range(b) for k in d[ai][bi]])
    print(SST-Re_SS_B)
    
    SSAB=SST-SSE-SSA-SSB
    DFA=a-1
    DFB=b-1
    DFAB=DFA*DFB
    DFT=n-1
    DFE=DFT-DFA-DFB-DFAB
    MSA=SSA/DFA
    MSB=SSB/DFB
    MSAB=SSAB/DFAB
    MSE=SSE/DFE
    FA=MSA/MSE
    FB=MSB/MSE
    FAB=MSAB/MSE
    PA=1-f_stat.cdf(FA,DFA,DFE)
    PB=1-f_stat.cdf(FB,DFB,DFE)
    PAB=1-f_stat.cdf(FAB,DFAB,DFE)
    print(y_bar,y_bar_a,y_bar_b,n_b,n_a)
    
    if print_out:
        print("Two Way ANOVA\n\nFactor Info.")
        t=PT()
        t.field_names=["Factor","Levels"]
        t.add_row([name_a, a])
        t.add_row([name_b, b])
        print(t)
        print("\nAnalysis of Variance")
        t=PT()
        t.field_names=["Source","DF","Adj SS","Adj MS",
        "F-value","p-value"]
        t.add_row([name_a,DFA,"%.3f"%SSA,"%.3f"%MSA,"%.3f"%FA,"%.3f"%PA])
        t.add_row([name_b,DFB,"%.3f"%SSB,"%.3f"%MSB,"%.3f"%FB,"%.3f"%PB])
        t.add_row([name_a+"*"+name_b,DFAB,"%.3f"%SSAB,"%.3f"%MSAB,"%.3f"%FAB,"%.3f"%PAB])
        t.add_row(["Error", DFE, "%.3f"%SSE,"%.3f"%MSE," "," "])
        t.add_row(["Total",DFT,"%.3f"%SST," "," "," "])
        print(t)\
    
    return {"N":n,"SSE":SSE,"MSE":MSE,"SSA":SSA,"MSA":MSA,
    "SSB":SSB,"SSAB":SSAB,"MSB":MSB,"MSAB":MSAB,"DF A":DFA,
    "DF B":DFB, "DF AB":DFAB,"F A":FA,"F B":FB,"F AB":FAB,
    "p A":PA,"p B":PB,"p AB":PAB}
    


In [42]:
_=two_way_anova(d,print_out=True)

99.8872251984127
8.533333333333333 [6.910555555555556, 10.480666666666666] [6.978888888888889, 9.832857142857144, 10.89375] [18, 7, 8] [18, 15]
Two Way ANOVA

Factor Info.
+--------+--------+
| Factor | Levels |
+--------+--------+
|   A    |   2    |
|   B    |   3    |
+--------+--------+

Analysis of Variance
+--------+----+---------+---------+---------+---------+
| Source | DF |  Adj SS |  Adj MS | F-value | p-value |
+--------+----+---------+---------+---------+---------+
|   A    | 1  | 104.283 | 104.283 |  17.857 |  0.000  |
|   B    | 2  |  99.887 |  49.944 |  8.552  |  0.001  |
|  A*B   | 2  | -23.202 | -11.601 |  -1.986 |  1.000  |
| Error  | 27 | 157.681 |  5.840  |         |         |
| Total  | 32 | 338.649 |         |         |         |
+--------+----+---------+---------+---------+---------+


In [37]:
data = [
    # Variety 1
    [
        [7.8, 9.1, 10.6],   # 5 k/ha
        [11.2, 12.7, 13.3], # 10 k/ha
        [12.1, 12.5, 14.1], # 15 k/ha
        [9.1, 10.7, 12.6]   # 20 k/ha
    ],
    # Variety 2
    [
        [8.0, 8.7, 10.0],   # 5 k/ha
        [11.3, 12.9, 13.8], # 10 k/ha
        [13.8, 14.3, 15.4], # 15 k/ha
        [11.3, 12.7, 14.3]  # 20 k/ha
    ],
    # Variety 3
    [
        [15.3, 16.0, 17.6], # 5 k/ha
        [16.8, 18.3, 19.2], # 10 k/ha
        [17.9, 21.0, 20.7], # 15 k/ha
        [17.2, 18.3, 19.1]  # 20 k/ha
    ]
]
two_way_anova(data,print_out=True)

13.880555555555556 [11.316666666666666, 12.208333333333334, 18.116666666666667] [11.455555555555556, 14.38888888888889, 15.755555555555555, 13.922222222222222]
Two Way ANOVA

Factor Info.
+--------+--------+
| Factor | Levels |
+--------+--------+
|   A    |   3    |
|   B    |   4    |
+--------+--------+

Analysis of Variance
+--------+----+---------+---------+---------+---------+
| Source | DF |  Adj SS |  Adj MS | F-value | p-value |
+--------+----+---------+---------+---------+---------+
|   A    | 2  | 327.774 | 163.887 | 100.476 |  0.000  |
|   B    | 3  |  86.907 |  28.969 |  17.760 |  0.000  |
|  A*B   | 6  |  8.068  |  1.345  |  0.824  |  0.562  |
| Error  | 24 |  39.147 |  1.631  |         |         |
| Total  | 35 | 461.896 |         |         |         |
+--------+----+---------+---------+---------+---------+


{'N': 36,
 'SSE': 39.146666666666675,
 'MSE': 1.6311111111111114,
 'SSA': 327.7738888888889,
 'MSA': 163.88694444444445,
 'SSB': 86.90749999999997,
 'SSAB': 8.068333333333328,
 'MSB': 28.969166666666656,
 'MSAB': 1.3447222222222213,
 'DF A': 2,
 'DF B': 3,
 'DF AB': 6,
 'F A': 100.47564713896456,
 'F B': 17.760388283378738,
 'F AB': 0.8244209809264298,
 'p A': 2.175037927543144e-12,
 'p B': 2.7384098582050598e-06,
 'p AB': 0.5623027998578893}

In [44]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming we have a DataFrame 'df' with columns 'Y' (dependent variable), 'A' (Factor A), and 'B' (Factor B)
data = [
    ['male', 'C', 12.40],
    ['female', 'A', 7.69],
    ['male', 'C', 14.01],
    ['female', 'A', 9.69],
    ['male', 'C', 11.65],
    ['female', 'A', 8.89],
    ['female', 'A', 6.94],
    ['female', 'A', 2.13],
    ['female', 'A', 7.26],
    ['female', 'A', 5.87],
    ['male', 'B', 12.93],
    ['female', 'C', 12.19],
    ['female', 'A', 7.20],
    ['male', 'C', 13.88],
    ['male', 'A', 8.18],
    ['male', 'B', 16.64],
    ['female', 'C', 9.41],
    ['male', 'C', 11.21],
    ['female', 'B', 8.35],
    ['male', 'A', 7.24],
    ['female', 'A', 6.81],
    ['male', 'B', 9.81],
    ['female', 'A', 6.67],
    ['female', 'A', 6.98],
    ['female', 'A', 7.07],
    ['female', 'C', 2.40],
    ['male', 'B', 7.84],
    ['female', 'B', 3.84],
    ['male', 'B', 9.42],
    ['male', 'A', 7.00],
    ['male', 'A', 7.00],
    ['female', 'A', 5.00],
    ['male', 'A', 8.00]
]

# Create the DataFrame
df = pd.DataFrame(data, columns=['A', 'B', 'Y'])

# Step 1: Fit the full model
full_model = ols("Y ~ C(A, Sum) + C(B, Sum) + C(A, Sum):C(B, Sum)",
                 data=df).fit()

# Step 2: Fit the reduced model (without Factor A)
reduced_model = ols("Y ~ C(B, Sum) + C(A, Sum):C(B, Sum)",
                    ldata=df).fit()

# Step 3: Calculate Type 3 SS for Factor A
ss_full = ((full_model.fittedvalues - df['Y'].mean())**2).sum()
ss_reduced = ((reduced_model.fittedvalues - df['Y'].mean())**2).sum()
type3_ss_A = ss_full - ss_reduced

# Print the result
print(f"Type 3 Sum of Squares for Factor A: {type3_ss_A}")

# To get the complete ANOVA table with Type 3 SS
anova_table = sm.stats.anova_lm(full_model, typ=3)
print(anova_table)


Type 3 Sum of Squares for Factor A: -5.684341886080802e-14
                          sum_sq    df           F        PR(>F)
Intercept            1812.640170   1.0  310.382069  2.470703e-16
C(A, Sum)              73.870879   1.0   12.649061  1.412004e-03
C(B, Sum)              51.032793   2.0    4.369225  2.270277e-02
C(A, Sum):C(B, Sum)    30.603340   2.0    2.620136  9.120704e-02
Residual              157.680773  27.0         NaN           NaN
