In [5]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [8]:
df = pd.read_csv("Datasets/Vaccination_Coverage_among_Young_Children__0-35_Months__20241101.csv")


In [11]:
df = df[df['Vaccine'] == 'DTaP']

df

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263.0
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293.0
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556.0
6,DTaP,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143.0
16,DTaP,≥3 Doses,States/Local Areas,North Dakota,2020-2021,Age,13 Months,89.5,84.7 to 92.8,391.0
...,...,...,...,...,...,...,...,...,...,...
128162,DTaP,≥1 Dose,States/Local Areas,New Hampshire,2021,Age,3 Months,93.7,88.3 to 96.7,126.0
128165,DTaP,≥4 Doses,States/Local Areas,New Hampshire,2021,Age,24 Months,89.1,80.1 to 95.2,126.0
128172,DTaP,≥4 Doses,States/Local Areas,New Hampshire,2020-2021,Age,35 Months,94.6,91.2 to 97.1,315.0
128173,DTaP,≥4 Doses,States/Local Areas,New Hampshire,2021,Age,19 Months,78.7,68.5 to 86.3,126.0


In [25]:
import pandas as pd
from scipy.stats import f  # Import the F distribution for p-value calculation
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Filter for DTaP data only
df = df[df['Vaccine'] == 'DTaP']

# Define the state-to-region mapping
state_to_region = {
    # Middle Atlantic
    'New York': 'Middle Atlantic', 'NY-Rest of state': 'Middle Atlantic', 'NY-City of New York': 'Middle Atlantic', 
    'New Jersey': 'Middle Atlantic', 'Pennsylvania': 'Middle Atlantic', 
    'PA-Philadelphia': 'Middle Atlantic', 'PA-Rest of state': 'Middle Atlantic',
    
    # New England
    'Vermont': 'New England', 'New Hampshire': 'New England', 'Massachusetts': 'New England', 
    'Connecticut': 'New England', 'Rhode Island': 'New England', 'Maine': 'New England',

    # East North Central
    'Indiana': 'East North Central', 'Illinois': 'East North Central', 'Michigan': 'East North Central', 
    'Ohio': 'East North Central', 'Wisconsin': 'East North Central', 
    'IL-City of Chicago': 'East North Central', 'IL-Rest of state': 'East North Central',

    # West North Central
    'Iowa': 'West North Central', 'Kansas': 'West North Central', 'Minnesota': 'West North Central',
    'Missouri': 'West North Central', 'Nebraska': 'West North Central', 
    'North Dakota': 'West North Central', 'South Dakota': 'West North Central',

    # South Atlantic
    'Delaware': 'South Atlantic', 'District of Columbia': 'South Atlantic', 'Florida': 'South Atlantic', 
    'Georgia': 'South Atlantic', 'Maryland': 'South Atlantic', 'North Carolina': 'South Atlantic', 
    'South Carolina': 'South Atlantic', 'Virginia': 'South Atlantic', 'West Virginia': 'South Atlantic',

    # East South Central
    'Alabama': 'East South Central', 'Kentucky': 'East South Central', 
    'Mississippi': 'East South Central', 'Tennessee': 'East South Central', 

    # West South Central
    'Arkansas': 'West South Central', 'Louisiana': 'West South Central', 'Oklahoma': 'West South Central', 
    'Texas': 'West South Central', 'TX-City of Houston': 'West South Central', 'TX-Rest of state': 'West South Central',
    'TX-Dallas County': 'West South Central', 'TX-Bexar County': 'West South Central', 
    'TX-El Paso County': 'West South Central', 'TX-Hidalgo County': 'West South Central', 
    'TX-Tarrant County': 'West South Central',

    # Mountain
    'Arizona': 'Mountain', 'Colorado': 'Mountain', 'Idaho': 'Mountain', 'New Mexico': 'Mountain', 
    'Montana': 'Mountain', 'Utah': 'Mountain', 'Nevada': 'Mountain', 'Wyoming': 'Mountain',

    # Pacific
    'Alaska': 'Pacific', 'California': 'Pacific', 'Hawaii': 'Pacific', 
    'Oregon': 'Pacific', 'Washington': 'Pacific'
}

# Map Census Region and fill any missing values with 'Unknown' to identify unmapped entries
df['Census Region'] = df['Geography'].map(state_to_region).fillna('Unknown')

# Check for unmapped states
print("Unmapped states:")
print(df[df['Census Region'] == 'Unknown']['Geography'].unique())

# Extract Age Milestone from Dimension (assuming it contains age information like "Age 4 Months")
df['Age Milestone (months)'] = df['Dimension'].str.extract(r'(\d+)').astype(float)

# Drop rows with missing or unassigned data
df = df.dropna(subset=['Age Milestone (months)', 'Census Region', 'Estimate (%)'])

# Convert Estimate (%) to numeric and coerce errors to NaN, then drop NaNs
df['Estimate (%)'] = pd.to_numeric(df['Estimate (%)'], errors='coerce')
df = df.dropna(subset=['Estimate (%)'])

# Verify the final data shape and column types
print("Data shape after cleaning:", df.shape)
print(df.dtypes)

# Run 1-way ANOVA for Region x Age using Census Region as the factor
model = ols('Q("Estimate (%)") ~ Q("Census Region")', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Calculate F-statistic and p-value manually
# Extract values for calculations
SS_between = anova_table.loc['Q("Census Region")', 'sum_sq']  # Sum of Squares Between (for Census Region)
df_between = anova_table.loc['Q("Census Region")', 'df']      # Degrees of Freedom Between
SS_within = anova_table.loc['Residual', 'sum_sq']             # Sum of Squares Within (Residuals)
df_within = anova_table.loc['Residual', 'df']                 # Degrees of Freedom Within

# Calculate Mean Squares
MS_between = SS_between / df_between
MS_within = SS_within / df_within

# Calculate F-statistic
F_statistic = MS_between / MS_within

# Calculate p-value
p_value = 1 - f.cdf(F_statistic, df_between, df_within)

# Display the ANOVA table and manual F-statistic and p-value results
print("\nANOVA Table:\n", anova_table)
print("\nCalculated F-statistic:", F_statistic)
print("Calculated p-value:", p_value)



Unmapped states:
[]
Data shape after cleaning: (12356, 12)
Vaccine                     object
Dose                        object
Geography Type              object
Geography                   object
Birth Year/Birth Cohort     object
Dimension Type              object
Dimension                   object
Estimate (%)               float64
95% CI (%)                  object
Sample Size                float64
Census Region               object
Age Milestone (months)     float64
dtype: object

ANOVA Table:
                           sum_sq       df          F         PR(>F)
Q("Census Region")  4.298086e+04      8.0  64.720934  1.882880e-104
Residual            1.024948e+06  12347.0        NaN            NaN

Calculated F-statistic: 64.72093446125822
Calculated p-value: 1.1102230246251565e-16
