In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols

In [None]:
import tensorflow as tf

# `tf.version` is a submodule, so printing it shows a module repr rather than
# the release number; `tf.__version__` is the actual version string.
print(tf.__version__)

In [None]:
# Print the installed TensorFlow version string (`tf.version` itself is a
# module, so printing it would only show the module repr).
print(tf.__version__)

In [None]:
# Read the vaccination-coverage CSV into a pandas DataFrame, then print the
# first five rows as a quick preview of the columns and values.
file_path = "C:\\Users\\Kerry\\Documents\\RNN_COVID_GROUP\\Datasets\\Vaccination_Coverage_among_Young_Children__0-35_Months__20241101.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

In [None]:
# Drop rows whose 'Estimate (%)' is missing (NaN) so they cannot interfere
# with later calculations or plots.
# `.copy()` makes df_clean an independent frame: later cells add and overwrite
# columns on it, which would otherwise raise SettingWithCopyWarning because
# the result of dropna() is treated as a slice of `df`.
df_clean = df.dropna(subset=['Estimate (%)']).copy()


In [None]:
# Normalise the 'Dimension' labels in a single pass:
#   1. trim leading/trailing whitespace
#   2. convert to Title Case (e.g. "5 months" -> "5 Months")
df_clean['Dimension'] = df_clean['Dimension'].str.strip().str.title()

In [None]:
# U.S. Census Bureau regions (50 states + District of Columbia).
# The lists used to end in a literal `...` placeholder, which left every state
# other than the eight spelled out without a region.
regions = {
    'Northeast': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'New Jersey',
        'New York', 'Pennsylvania', 'Rhode Island', 'Vermont',
    ],
    'Midwest': [
        'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Michigan', 'Minnesota',
        'Missouri', 'Nebraska', 'North Dakota', 'Ohio', 'South Dakota',
        'Wisconsin',
    ],
    'South': [
        'Alabama', 'Arkansas', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Kentucky', 'Louisiana', 'Maryland', 'Mississippi',
        'North Carolina', 'Oklahoma', 'South Carolina', 'Tennessee', 'Texas',
        'Virginia', 'West Virginia',
    ],
    'West': [
        'Alaska', 'Arizona', 'California', 'Colorado', 'Hawaii', 'Idaho',
        'Montana', 'Nevada', 'New Mexico', 'Oregon', 'Utah', 'Washington',
        'Wyoming',
    ],
}

# Reverse index (state -> region) so each lookup is a single dict access.
_STATE_TO_REGION = {
    state: region
    for region, states in regions.items()
    for state in states
}


def assign_region(state):
    """Return the Census region name for ``state``.

    Parameters
    ----------
    state : str
        A geography label, compared exactly (case- and whitespace-sensitive)
        against the state names listed in ``regions``.

    Returns
    -------
    str or None
        'Northeast', 'Midwest', 'South' or 'West'; ``None`` when the label is
        not one of the listed states (including missing values).
    """
    return _STATE_TO_REGION.get(state)

# Label every row with its region; geographies that assign_region does not
# recognise end up with a missing Region.
df_clean['Region'] = df_clean['Geography'].map(assign_region)

In [None]:
# Stratified sample: draw a reproducible 20% of the rows from every
# (Region, Geography) group. Rows with a missing Region or Geography belong to
# no group and are therefore left out.
#
# The groups are sampled explicitly instead of through DataFrameGroupBy.apply:
# applying a function over the grouping columns is deprecated in pandas, and
# where they are excluded the following reset_index(drop=True) would discard
# 'Region' and 'Geography' altogether. Iterating keeps every column and draws
# the same rows (each group is sampled with the same seed, in the same sorted
# group order).
sampled_groups = [
    group.sample(frac=0.2, random_state=42)
    for _, group in df_clean.groupby(['Region', 'Geography'])
]
df_sampled = (
    pd.concat(sampled_groups).reset_index(drop=True)
    if sampled_groups
    else df_clean.iloc[0:0].copy()  # no groups at all -> empty frame, same columns
)


In [None]:
# Two-way ANOVA of 'Estimate (%)' on two categorical factors, Region and
# Dimension, fitted on the sampled frame (df_sampled).
# NOTE: these two imports repeat the ones in the notebook's first cell.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Formula notes:
#   Q("Estimate (%)")  quotes a column name that is not a valid Python identifier
#   C(...)             treats the column as a categorical factor
#   A * B              expands to both main effects plus their interaction
model = ols('Q("Estimate (%)") ~ C(Region) * C(Dimension)', data=df_sampled).fit()
# typ=2 requests the Type II ANOVA table.
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)


In [None]:
# Inputs for the post-hoc tests: each frame keeps only the rows where both the
# response and the grouping column are present.

# Region comparison: needs 'Estimate (%)' and 'Region'
df_tukey_region = df_clean.dropna(axis=0, how='any', subset=['Estimate (%)', 'Region'])

# Dimension comparison: needs 'Estimate (%)' and 'Dimension'
df_tukey_age = df_clean.dropna(axis=0, how='any', subset=['Estimate (%)', 'Dimension'])


In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD: pairwise comparison of mean coverage between regions
tukey_region = pairwise_tukeyhsd(
    endog=df_tukey_region['Estimate (%)'],
    groups=df_tukey_region['Region'],
)
print(tukey_region)

# Tukey HSD: pairwise comparison of mean coverage between 'Dimension' levels
# (the age milestones)
tukey_age = pairwise_tukeyhsd(
    endog=df_tukey_age['Estimate (%)'],
    groups=df_tukey_age['Dimension'],
)
print(tukey_age)



In [None]:
# Filter for specific age milestones (e.g., 6 months, 12 months, 18 months, 24 months)
age_milestones = ['6 Months', '12 Months', '18 Months', '24 Months']
df_filtered = df_clean[df_clean['Dimension'].isin(age_milestones)].copy()

# lineplot sorts its x values, and plain strings sort lexicographically
# ('12 Months' < '18 Months' < '24 Months' < '6 Months'). An ordered
# categorical makes the axis follow the chronological order listed above.
df_filtered['Dimension'] = pd.Categorical(
    df_filtered['Dimension'], categories=age_milestones, ordered=True
)

sns.lineplot(data=df_filtered, x='Dimension', y='Estimate (%)', hue='Region', marker='o')
plt.title('Vaccine Coverage by Selected Age Milestones and Region')
plt.xlabel('Age Milestone')
plt.ylabel('Vaccine Coverage (%)')
plt.show()


In [None]:
# Distribution of coverage estimates within each region.
ax = sns.boxplot(x='Region', y='Estimate (%)', data=df_clean)
ax.set_title('Vaccine Coverage Distribution by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Vaccine Coverage (%)')
plt.show()


In [None]:
# Define the list of age milestones you want to keep, in chronological order
# NOTE(review): a later cell uses '0-3 Days' rather than '0 Days' -- confirm
# which label the dataset actually contains.
age_milestones = [
    '0 Days', '2 Months', '4 Months', '6 Months', '9 Months', '12 Months', 
    '15 Months', '18 Months', '19 Months', '24 Months', '35 Months'
]

# Filter the DataFrame to include only these age milestones
df_age_only = df_clean[df_clean['Dimension'].isin(age_milestones)]

# Without an explicit order the boxes appear in whatever order the labels
# occur in the data. Keep the chronological order from the list above, and
# skip milestones that have no rows so no empty slots are drawn.
present_milestones = set(df_age_only['Dimension'])
milestone_order = [m for m in age_milestones if m in present_milestones]

# Plot the filtered data
sns.boxplot(
    data=df_age_only,
    x='Dimension',
    y='Estimate (%)',
    order=milestone_order or None,  # empty list -> let seaborn decide
)
plt.title('Vaccine Coverage Distribution by Age Milestone')
plt.xlabel('Age Milestone')
plt.ylabel('Vaccine Coverage (%)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Create broader age categories: every milestone label listed here maps to
# its bucket; any other 'Dimension' value falls into 'Other'.
age_group_by_dimension = {
    '0 Days': '0-6 Months',
    '2 Months': '0-6 Months',
    '4 Months': '0-6 Months',
    '6 Months': '0-6 Months',
    '9 Months': '6-12 Months',
    '12 Months': '6-12 Months',
    '15 Months': '12-18 Months',
    '18 Months': '12-18 Months',
    '19 Months': '18-24 Months',
    '24 Months': '18-24 Months',
}
df_clean['Age Group'] = df_clean['Dimension'].map(age_group_by_dimension).fillna('Other')

# Calculate the average coverage for each region and age group
df_grouped = df_clean.groupby(['Age Group', 'Region'])['Estimate (%)'].mean().reset_index()

# As plain strings the age groups sort lexicographically ('0-6', '12-18',
# '18-24', '6-12', 'Other'), which scrambles the x axis. An ordered
# categorical puts the line in chronological order.
age_group_order = ['0-6 Months', '6-12 Months', '12-18 Months', '18-24 Months', 'Other']
df_grouped['Age Group'] = pd.Categorical(
    df_grouped['Age Group'], categories=age_group_order, ordered=True
)
df_grouped = df_grouped.sort_values(['Age Group', 'Region']).reset_index(drop=True)

sns.lineplot(data=df_grouped, x='Age Group', y='Estimate (%)', hue='Region', marker='o')
plt.title('Average Vaccine Coverage by Age Group and Region')
plt.xlabel('Age Group')
plt.ylabel('Average Vaccine Coverage (%)')
plt.show()


In [None]:
# Check unique values in the 'Dimension' column to see available age milestones.
# Missing values are dropped first: sorted() cannot compare NaN (a float)
# with the string labels and would raise TypeError.
unique_age_milestones = df_clean['Dimension'].dropna().unique()
print(sorted(unique_age_milestones))

In [None]:
# Define a new list of available age milestones starting from "0-3 Days",
# in chronological order
# (Adjust based on your dataset's actual contents)
age_milestones_from_0_3_days = [
    '0-3 Days', '1 Month', '2 Months', '3 Months', '4 Months', '5 Months',
    '6 Months', '7 Months', '8 Months', '9 Months', '12 Months', 
    '15 Months', '18 Months', '19 Months', '24 Months', '35 Months'
]

# Filter the DataFrame again based on this list
df_filtered = df_clean[df_clean['Dimension'].isin(age_milestones_from_0_3_days)]

# Without an explicit order the boxes follow the order in which labels occur
# in the data. Use the chronological order from the list above, restricted to
# milestones that actually have rows so no empty slots are drawn.
present_milestones = set(df_filtered['Dimension'])
milestone_order = [m for m in age_milestones_from_0_3_days if m in present_milestones]

# Plot the filtered data
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=df_filtered,
    x='Dimension',
    y='Estimate (%)',
    order=milestone_order or None,  # empty list -> let seaborn decide
)

# Set the title and labels
plt.title('Vaccine Coverage Distribution by Age Milestone (Starting from 0-3 Days)')
plt.xlabel('Age Milestone')
plt.ylabel('Vaccine Coverage (%)')

# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()
