<a href="https://colab.research.google.com/github/KenDaupsey/ANALYSIS-OF-VARIANCE-ANALYSIS-OF-COVARIANCE/blob/main/ANALYSIS_OF_VARIANCE_%26_ANALYSIS_OF_COVARIANCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*** ANALYSIS OF VARIANCE***
*** ANALYSIS OF COVARIANCE***

*ANALYSIS OF VARIANCE - ANOVA*

In [12]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [13]:
# Load the dataset from GitHub URL
url = "https://raw.githubusercontent.com/KenDaupsey/ANALYSIS-OF-VARIANCE-ANALYSIS-OF-COVARIANCE/main/hsb2%7Edata.csv"
df = pd.read_csv(url)

In [14]:
# Print columns
print(df.columns)

Index(['id', 'female', 'race', 'ses', 'schtyp', 'prog', 'read', 'write',
       'math', 'science', 'socst'],
      dtype='object')


In [15]:
### Import statistical analysis library
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [16]:
### We carry out the ANOVA procedure:

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming df is your DataFrame
# One-way ANOVA
model = ols('write ~ prog', df).fit()
aov_table = sm.stats.anova_lm(model)

# Round the p-values to four decimal places
aov_table['PR(>F)'] = aov_table['PR(>F)'].round(4)

# Print the ANOVA table with rounded p-values
print(aov_table)

             df        sum_sq      mean_sq          F  PR(>F)
prog        2.0   3175.697857  1587.848929  21.274738     0.0
Residual  197.0  14703.177143    74.635417        NaN     NaN


In [17]:
### Check for degree of freedom
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming df is your DataFrame
# One-way ANOVA
model = ols('write ~ prog', df).fit()
aov_table = sm.stats.anova_lm(model)

# Calculate degrees of freedom
total_df = len(df) - 1
between_df = len(df['prog'].unique()) - 1
within_df = total_df - between_df

# Print degrees of freedom
print(f"Between-group (prog) degrees of freedom: {between_df}")
print(f"Within-group (Residual) degrees of freedom: {within_df}")

# Print the ANOVA table
print(aov_table)

Between-group (prog) degrees of freedom: 2
Within-group (Residual) degrees of freedom: 197
             df        sum_sq      mean_sq          F        PR(>F)
prog        2.0   3175.697857  1587.848929  21.274738  4.310163e-09
Residual  197.0  14703.177143    74.635417        NaN           NaN


In [18]:
### Summary of ANOVA
import pandas as pd

# Assuming df is your DataFrame
# Group by 'prog' and summarize 'write'
summary_table = df.groupby('prog')['write'].describe()

# Print the summary table
print(summary_table)

          count       mean       std   min    25%   50%   75%   max
prog                                                               
academic  105.0  56.257143  7.943343  33.0  52.00  59.0  62.0  67.0
general    45.0  51.333333  9.397775  31.0  44.00  54.0  59.0  67.0
vocation   50.0  46.760000  9.318754  31.0  40.25  46.0  53.5  67.0


*Two-way ANOVA*

In [19]:
### TWO-WAY ANOVA without Interaction
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming 'df' is your DataFrame
model = ols('write ~ C(ses) + C(prog)', data=df).fit()

anova_table = sm.stats.anova_lm(model, typ=2)
# Round the p-values to 4 decimal places
anova_table['PR(>F)'] = anova_table['PR(>F)'].round(4)

print(anova_table)

                sum_sq     df          F  PR(>F)
C(ses)      274.229561    2.0   1.853038  0.1595
C(prog)    2591.211977    2.0  17.509466  0.0000
Residual  14428.947582  195.0        NaN     NaN


In [20]:
### nclude the interaction term between 'ses' and 'prog' in the model
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming 'df' is your DataFrame
model = ols('write ~ C(ses) * C(prog)', data=df).fit()

# Alternatively, to explicitly include the interaction term
#model = ols('write ~ C(ses) + C(prog) + C(ses):C(prog)', data=df).fit()

anova_table = sm.stats.anova_lm(model, typ=2)
# Round the p-values to 4 decimal places
anova_table['PR(>F)'] = anova_table['PR(>F)'].round(4)

print(anova_table)

                      sum_sq     df          F  PR(>F)
C(ses)            274.229561    2.0   1.835742  0.1623
C(prog)          2591.211977    2.0  17.346042  0.0000
C(ses):C(prog)    162.825721    4.0   0.544992  0.7029
Residual        14266.121861  191.0        NaN     NaN


*Analysis of covariance (ANCOVA)*

In [21]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assuming df is your DataFrame with the given structure

# Define the ANACOVA model with a continuous covariate
formula = 'write ~ female + math'

# Fit the ANCOVA model
model = ols(formula, data=df).fit()

# Display the ANCOVA summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.451
Method:                 Least Squares   F-statistic:                     82.81
Date:                Sat, 02 Mar 2024   Prob (F-statistic):           7.94e-27
Time:                        19:04:06   Log-Likelihood:                -672.08
No. Observations:                 200   AIC:                             1350.
Df Residuals:                     197   BIC:                             1360.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         21.8321      2.865      7.