In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Data Preprocessing

## Preparation

In [None]:
# Read the Chronic Disease Indicators CSV (path is resolved against the current working directory)
csv_path = 'U.S._Chronic_Disease_Indicators.csv'
chronic = pd.read_csv(csv_path)

# Bare trailing expression so the notebook renders the loaded frame as the cell output
chronic

## Data Cleaning

Predictive Question: Predict the crude prevalence of diabetes based on variables such as year, location, and demographic stratifications.

In [None]:
# Report how many values are missing in each column before any cleaning
for col, missing_count in chronic.isna().sum().items():
    print(f"{col}: {missing_count} missing values")

# Drop columns that are not used anywhere later in this notebook
# (second/third stratification levels, ID columns, Geolocation, YearEnd, DataValueAlt, Response)
chronic = chronic.drop(columns=[
    'Response', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3',
    'Stratification3', 'ResponseID', 'StratificationCategoryID2', 'StratificationID2',
    'StratificationCategoryID3', 'StratificationID3', 'Geolocation', 'LocationID', 'TopicID',
    'QuestionID', 'DataValueTypeID', 'StratificationCategoryID1', 'StratificationID1',
    'YearEnd', 'DataValueAlt',
])

# Fill a missing DataValue with the midpoint of its confidence interval, but only
# where both limits are present. This is a vectorised equivalent of looping over
# iterrows(): the same rows are touched and the same values are written.
can_impute = (
    chronic['DataValue'].isna()
    & chronic['LowConfidenceLimit'].notna()
    & chronic['HighConfidenceLimit'].notna()
)
chronic.loc[can_impute, 'DataValue'] = (
    chronic.loc[can_impute, 'LowConfidenceLimit']
    + chronic.loc[can_impute, 'HighConfidenceLimit']
) / 2

# Keep only rows where DataValue AND both confidence limits are present.
# NOTE(review): how='any' also discards rows that have a DataValue but lack a
# confidence limit; narrow `subset` to ['DataValue'] if those rows should be kept.
chronic = chronic.dropna(
    subset=['DataValue', 'LowConfidenceLimit', 'HighConfidenceLimit'], how='any'
)

# Geolocation was already dropped above, so there is nothing left to impute for it.

# Remove the footnote columns (irrelevant to the analysis) and the confidence
# limits, which were only needed for the imputation and row filter above
chronic = chronic.drop(columns=[
    'DataValueFootnoteSymbol', 'DataValueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit',
])

In [None]:
# Re-run the per-column missing-value report on the cleaned frame
for col, missing_count in chronic.isna().sum().items():
    print(f"{col}: {missing_count} missing values")

# Result recorded by the authors for their run: no missing values remain, and the
# data has 22 columns and 311745 observations (re-verify if the cleaning step changes)

## Preprocessing

In [None]:
# Slice out the crude-prevalence rows for the two questions of interest and
# drop any row whose DataValue is missing
is_crude = chronic['DataValueType'] == 'Crude Prevalence'
diabetes_df = chronic[(chronic['Question'] == 'Diabetes among adults') & is_crude].dropna(subset=['DataValue'])
obesity_df = chronic[(chronic['Question'] == 'Obesity among adults') & is_crude].dropna(subset=['DataValue'])

# Pair each diabetes row with the obesity row sharing the same year, location and
# stratification; the diabetes value becomes the target, the obesity value a feature
merge_keys = ['YearStart', 'LocationAbbr', 'StratificationCategory1', 'Stratification1']
df = diabetes_df.merge(obesity_df, on=merge_keys, how='inner')
df = df.rename(columns={'DataValue_x': 'Target', 'DataValue_y': 'Obesity'})

# Columns that feed the model
features = df[merge_keys + ['Obesity']]

# One-hot encode the categorical columns into a dense array
categorical_cols = ['LocationAbbr', 'StratificationCategory1', 'Stratification1']
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(features[categorical_cols])

# Numeric columns as (n, 1) arrays so they can be stacked next to the encoded block
year_data = features[['YearStart']].to_numpy()
obesity_data = features[['Obesity']].to_numpy()
final_features = np.hstack([year_data, obesity_data, encoded_features])

# Assemble the design matrix X and the single-column target frame y
feature_names = ['YearStart', 'Obesity', *encoder.get_feature_names_out(categorical_cols)]
X = pd.DataFrame(final_features, columns=feature_names)
y = df['Target'].reset_index(drop=True).to_frame()

# Sanity check: count missing cells in X
print(f'There are {X.isna().sum().sum()} missing values')

# EDA

## Summary Statistics

In [None]:
# Print the structure of the cleaned frame
# (the authors recorded 22 columns and 311745 observations for their run)
chronic.info()

# Descriptive statistics as returned by DataFrame.describe()
summary_stats = chronic.describe()
print(summary_stats)

# Distinct topics and questions present in the data
# (the authors recorded 19 topics and 109 questions for their run)
topics = chronic['Topic'].unique()
print(topics)
questions = chronic['Question'].unique()
print(questions)

# One sub-frame per topic, keyed as "<topic>_data", for further analysis
topic_dataframes = {
    f'{topic}_data': chronic[chronic['Topic'] == topic]
    for topic in topics
}

## Visualizations

How do common chronic diseases in the U.S. rank by crude prevalence?

In [None]:
# Map each dataset question to the short label shown on the chart
questions = {
    "Diabetes among adults": "Diabetes",
    "High cholesterol among adults who have been screened": "High Cholesterol",
    "High blood pressure among adults": "High Blood Pressure",
    "Depression among adults": "Depression",
    "Current asthma among adults": "Asthma",
    "Arthritis among adults": "Arthritis",
    "Obesity among adults": "Obesity"
}

# Rows for the overall population reported as crude prevalence
is_overall_crude = (
    (chronic['StratificationCategory1'] == 'Overall')
    & (chronic['DataValueType'] == 'Crude Prevalence')
)

# Mean DataValue per condition (cast to float before averaging)
mean_values = {
    simple_name: chronic.loc[
        (chronic['Question'] == question) & is_overall_crude, 'DataValue'
    ].astype(float).mean()
    for question, simple_name in questions.items()
}

# Tabulate the means and order them from highest to lowest
common_diseases_prevalence = pd.DataFrame(
    list(mean_values.items()), columns=['Disease', 'Average Prevalence']
).sort_values(by='Average Prevalence', ascending=False)

# Interactive bar chart, one colour per disease
fig = px.bar(
    common_diseases_prevalence,
    x='Disease',
    y='Average Prevalence',
    title='Average Prevalence of Chronic Diseases Among Adults',
    labels={'Average Prevalence': 'Prevalence (%)'},
    color='Disease',
    barmode='group',
)

# Centre the title, fix the figure size and set the font
fig.update_layout(
    title=dict(text="Average Prevalence of Chronic Diseases Among Adults", x=0.5, xanchor='center'),
    template='plotly_white',
    autosize=False,
    width=800,
    height=600,
    font={'family': "Courier New, monospace", 'size': 14, 'color': "RebeccaPurple"},
)

fig.show()

The above plot highlights that the three most prevalent conditions are high cholesterol, high blood pressure, and obesity. Given our domain knowledge, these conditions significantly contribute to diabetes. Yet, the crude prevalence of diabetes is notably lower compared to these other diseases. Consequently, this prompts further investigation into the underlying factors influencing diabetes crude prevalence in the U.S., as well as the development of predictive analyses based on this data.

What is the distribution of diabetes prevalence by state? (We assume the 2019 data are representative.)

In [None]:
# Filter to 2019 BRFSS rows for "Diabetes among adults", overall population,
# reported as crude prevalence.
# The DataSource condition must be an explicit comparison: using the raw string
# column as a mask only tests truthiness and does not select a data source.
# .copy() makes the result an independent frame so the column assignment below
# does not write into a slice of `chronic`.
Diabetes_Overall_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Diabetes among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Overall') &
    (chronic['DataValueType'] == 'Crude Prevalence')
].copy()

# Drop rows with missing 'DataValue' as they cannot be plotted
Diabetes_Overall_2019 = Diabetes_Overall_2019.dropna(subset=['DataValue'])

# Convert DataValue to float for plotting
Diabetes_Overall_2019['DataValue'] = Diabetes_Overall_2019['DataValue'].astype(float)

# Plot the state-level map with Plotly Express; range_color fixes the colour scale to 6-17
fig = px.choropleth(
    Diabetes_Overall_2019,
    locations='LocationAbbr',
    locationmode='USA-states',
    color='DataValue',
    color_continuous_scale='Viridis',
    scope='usa',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Overall Diabetes Prevalence in USA States',
    hover_name='LocationDesc',
    hover_data={'Stratification1': True, 'DataValue': True},
    range_color = [6, 17]
)

fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()

# Cell output: the observed (min, max) of the plotted values
Diabetes_Overall_2019['DataValue'].min(), Diabetes_Overall_2019['DataValue'].max()

The map displayed above indicates that the crude prevalence of diabetes is 14% or higher in three states: West Virginia (15.7%), Mississippi (14.8%), and Alabama (14%). In contrast, states such as Colorado (7%) and Montana (7.6%) exhibit relatively lower prevalence rates. There is a discernible regional trend, with states in the Eastern and Southeastern U.S. demonstrating higher crude prevalence rates, whereas Northern and Western states generally show lower rates. This regional disparity may be attributed to a variety of factors, including differences in lifestyle, socioeconomic status, access to healthcare, and possibly environmental influences such as weather conditions. Further analysis is needed to understand the complex interplay of these factors and their impact on diabetes prevalence across different states.

What is the distribution of diabetes prevalence by race?

In [None]:
# Crude diabetes prevalence rows stratified by race/ethnicity, across all locations and years
race_mask = (
    (chronic['Question'] == 'Diabetes among adults')
    & (chronic['StratificationCategory1'] == 'Race/Ethnicity')
    & (chronic['DataValueType'] == 'Crude Prevalence')
)
# Independent copy so the column assignment below does not touch `chronic`
Diabetes_By_Race = chronic[race_mask].copy()

# Coerce DataValue to numeric; unparseable entries become NaN
Diabetes_By_Race['DataValue'] = pd.to_numeric(Diabetes_By_Race['DataValue'], errors='coerce')

# Category order = order of first appearance in the filtered data
race_order = Diabetes_By_Race['Stratification1'].unique().tolist()

# Box plot of prevalence per race/ethnicity group, one colour per group
fig = px.box(
    Diabetes_By_Race,
    x='Stratification1',
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)', 'Stratification1': 'Race/Ethnicity'},
    title='Diabetes Prevalence by Race',
    color='Stratification1',
    category_orders={"Stratification1": race_order},
)

# Axis titles and margins
# NOTE(review): colours here are discrete per group, so the colour-bar title
# presumably has no visible effect - confirm before relying on it
fig.update_layout(
    xaxis_title="Race/Ethnicity",
    yaxis_title="Diabetes Prevalence (%)",
    coloraxis_colorbar=dict(title='Prevalence %'),
    margin=dict(r=0, t=50, l=0, b=0),
)

fig.show()

 - The median prevalence of diabetes is highest among the American Indian or Alaska Native, non-Hispanic group, with a median of 17.6%. This group also shows the widest interquartile range (IQR), indicating considerable variation within this population.
 - The non-Hispanic Black population has the second-highest median prevalence, which is just above 15%, with a narrower IQR compared to the American Indian or Alaska Native group, suggesting less variability.
 - Hispanic individuals exhibit a median prevalence slightly lower than non-Hispanic Black individuals but have a comparable IQR.
 - The non-Hispanic White and Multiracial groups have lower median prevalence rates, both around the 10% mark, with Multiracial individuals displaying slightly more variation.
 - Non-Hispanic Asian and non-Hispanic Hawaiian or Pacific Islander populations have the lowest median diabetes prevalence, which is 10.05%.
 - Outliers are present in most groups, indicating that there are subpopulations with prevalence rates significantly different from the main group. These are especially notable in the American Indian or Alaska Native, non-Hispanic group.
 

In conclusion, the prevalence of diabetes differs across race/ethnicity categories, with American Indian or Alaska Native, non-Hispanic individuals being the most affected and Asian, non-Hispanic and Hawaiian or Pacific Islander, non-Hispanic individuals being the least affected. The data suggests that racial and ethnic factors may influence the prevalence of diabetes, which can be crucial for targeted healthcare interventions.

How does diabetes prevalence among males vary by state? (We assume the 2019 data are representative.)

In [None]:
# Filter to 2019 BRFSS rows for "Diabetes among adults", males only, crude prevalence.
# The DataSource condition must be an explicit comparison: using the raw string
# column as a mask only tests truthiness and does not select a data source.
Diabetes_Male_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Diabetes among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Sex') &
    (chronic['Stratification1'] == 'Male') &
    (chronic['DataValueType'] == 'Crude Prevalence')
]

# Plotting the bar chart using Plotly Express
fig = px.bar(
    Diabetes_Male_2019.sort_values('DataValue'),  # Pre-sort rows by value
    x='LocationDesc',  # Use the full location description for clarity
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Diabetes Prevalence of Male in USA States',
    color='DataValue',  # Color the bars by the data value
    color_continuous_scale='Viridis',  # Same colour scale as the other 2019 charts
    hover_data=['LocationAbbr', 'DataValue']
)

fig.update_layout(
    xaxis_title="State",
    yaxis_title="Diabetes Prevalence (%)",
    xaxis={'categoryorder':'total descending'},  # Bars are displayed from highest to lowest value
    coloraxis_colorbar={
        'title': 'Prevalence %'
    },
    margin={"r":0,"t":30,"l":0,"b":0}
)

fig.show()

We can see Puerto Rico and West Virginia exhibit the highest prevalence rates, surpassing 16%, which significantly stands out. This high prevalence suggests a critical public health concern in these regions. Following these, a descending trend is evident, with several states like Tennessee, Alabama, and Kentucky also showing notably high rates, above 13%.

As the bars transition from green to blue, indicating lower prevalence, we see states such as Hawaii, Nevada, and Georgia hovering around the 11% mark. This gradual decrease continues with states like Idaho, California, and Connecticut displaying even lower prevalence rates, falling below 11%. The lowest prevalence rates are observed in Alaska and the District of Columbia, both of which are below 9%.

Assuming the data from 2019 is representative, the plot suggests a significant variation in diabetes prevalence among males at the state level. 

How does diabetes prevalence among females vary by state? (We assume the 2019 data are representative.)

In [None]:
# Filter to 2019 BRFSS rows for "Diabetes among adults", females only, crude prevalence.
# The DataSource condition must be an explicit comparison: using the raw string
# column as a mask only tests truthiness and does not select a data source.
Diabetes_Female_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Diabetes among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Sex') &
    (chronic['Stratification1'] == 'Female') &
    (chronic['DataValueType'] == 'Crude Prevalence')
]

# Plotting the bar chart using Plotly Express
fig = px.bar(
    Diabetes_Female_2019.sort_values('DataValue'),  # Pre-sort rows by value
    x='LocationDesc',  # Use the full location description for clarity
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Diabetes Prevalence of Female in USA States',
    color='DataValue',  # Color the bars by the data value
    color_continuous_scale='Viridis',  # Same colour scale as the other 2019 charts
    hover_data=['LocationAbbr', 'DataValue']
)

fig.update_layout(
    xaxis_title="State",
    yaxis_title="Diabetes Prevalence (%)",
    xaxis={'categoryorder':'total descending'},  # Bars are displayed from highest to lowest value
    coloraxis_colorbar={
        'title': 'Prevalence %'
    },
    margin={"r":0,"t":30,"l":0,"b":0}
)

fig.show()

Similar to the trends observed in males, the chart indicates significant geographical disparities in diabetes prevalence. Puerto Rico stands out with the highest prevalence, well above 17%, while Mississippi emerges as the second-highest, suggesting that these areas may require targeted public health interventions for women.

Descending from these peaks, states like West Virginia, Tennessee, and Arkansas also show high prevalence rates, exceeding 13%. The prevalence rates gradually taper off through states like Delaware, Texas, and Ohio, with these states hovering around the 11% mark.

As we move towards states represented in blue, such as Idaho, Hawaii, and Iowa, the prevalence continues to decrease, falling below 10%. The trend extends to the lowest rates observed in North Dakota, Wyoming, and Colorado, with Colorado showing the lowest prevalence among females, under 7%.

This trend for females, while displaying some variations, is generally similar to that observed in males, underscoring the pervasiveness of diabetes across both genders. 

What is the trend of diabetes prevalence over the years?

In [None]:
# Crude-prevalence rows for "Diabetes among adults"; copied so the column
# assignment below does not write into a slice of `chronic`
diabetes_data = chronic[
    (chronic['DataValueType'] == 'Crude Prevalence')
    & (chronic['Question'] == 'Diabetes among adults')
].copy()

# Coerce DataValue to numeric; unparseable entries become NaN
diabetes_data['DataValue'] = pd.to_numeric(diabetes_data['DataValue'], errors='coerce')

# Mean prevalence per year, as a two-column frame (YearStart, DataValue)
diabetes_trend = diabetes_data.groupby('YearStart')['DataValue'].mean().reset_index()

# Line chart of the yearly mean, with a marker on every point
fig = px.line(
    diabetes_trend,
    x='YearStart',
    y='DataValue',
    title='Trend of Diabetes Prevalence Over Years',
    labels={'YearStart': 'Year', 'DataValue': 'Diabetes Prevalence (%)'},
    markers=True,
)

# Shared tick-label font for both axes
axis_tick_font = {'family': 'Arial', 'size': 12, 'color': 'black'}

# X axis: one tick per year, starting at the first year present in the data
x_axis_style = {
    'title': 'Year',
    'showline': True,
    'showgrid': False,
    'showticklabels': True,
    'linecolor': 'black',
    'linewidth': 2,
    'ticks': 'outside',
    'tickfont': axis_tick_font,
    'tickmode': 'linear',
    'tick0': diabetes_trend['YearStart'].min(),
    'dtick': 1,
}

# Y axis: fixed 10-15 range so year-to-year changes look less dramatic
y_axis_style = {
    'title': 'Average Prevalence (%)',
    'showgrid': True,
    'gridcolor': 'gray',
    'showline': True,
    'linewidth': 2,
    'linecolor': 'black',
    'tickfont': axis_tick_font,
    'range': [10, 15],
}

# Apply the layout: white background, styled axes, centred title, global font
fig.update_layout(
    plot_bgcolor='white',
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    title=dict(
        text='Trend of Diabetes Prevalence Over Years',
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    font={'family': "Courier New, monospace", 'size': 12, 'color': "RebeccaPurple"},
)

fig.show()

To elucidate the trend in diabetes prevalence from the visual data provided, it can be observed that the average prevalence of diabetes remained relatively stable at approximately 12.2% from 2019 to 2021. However, there was a notable increase to around 13.1% from 2021 to 2022. This uptick may be attributable to a multitude of factors including, but not limited to, changes in diagnostic criteria, heightened awareness and screening for diabetes, alterations in lifestyle or environmental factors, and potentially the impacts of the COVID-19 pandemic on health behaviors and access to care. 

What is the distribution of diabetes prevalence by age group?

In [None]:
# All rows for the "Diabetes among adults" question
# (note: this rebinds `diabetes_data` without a DataValueType filter)
is_adult_diabetes = chronic['Question'] == 'Diabetes among adults'
diabetes_data = chronic[is_adult_diabetes]

# Keep only the columns needed for the age-group trend
diabetes_trends = diabetes_data.loc[:, ['YearStart', 'DataValue', 'Stratification1']]

# Restrict to the three age strata
age_groups = ['Age 18-44', 'Age 45-64', 'Age >=65']
age_specific_data = diabetes_trends[diabetes_trends['Stratification1'].isin(age_groups)]

# Mean prevalence per (year, age group), pivoted so each age group is a column
age_trend_data = (
    age_specific_data
    .groupby(['YearStart', 'Stratification1'])
    .mean()
    .unstack()
)

# Flatten: take the DataValue block and turn YearStart back into a column
age_trend_data_reset = age_trend_data['DataValue'].reset_index()

# Long format for Plotly: one row per (year, age group)
plotly_data = pd.melt(
    age_trend_data_reset,
    id_vars='YearStart',
    var_name='Age Group',
    value_name='Prevalence',
)

# One line per age group, with markers
fig = px.line(
    plotly_data,
    x='YearStart',
    y='Prevalence',
    color='Age Group',
    title='Interactive Trend of Diabetes Prevalence by Age Group',
    labels={'Prevalence': 'Diabetes Prevalence (%)', 'YearStart': 'Year'},
    markers=True,
)
# One x tick per year, starting at the earliest year
fig.update_xaxes(tick0=min(plotly_data['YearStart']), dtick=1)
fig.show()

The trend depicted in the plot aligns with conventional medical understanding that the risk of developing various diseases, including diabetes, escalates with age. Notably, the data indicates that the crude prevalence of diabetes among the middle-aged population is consistently more than 10 percentage points higher than among younger individuals across the observed years. For the elderly, in turn, the prevalence surpasses that of the middle-aged cohort by approximately 7 percentage points. These statistics underscore the significance of age as a pivotal determinant in the prevalence of diabetes. It's imperative to highlight that while age is a non-modifiable risk factor, this trend reinforces the importance of age-specific healthcare strategies in managing and preventing diabetes.

 What is the relationship between obesity prevalence and diabetes prevalence?

In [None]:
# Pair the diabetes target with the obesity feature, column-wise
# (relies on `X` and `y` built in the preprocessing cell)
diabetes_pre = y.copy()
obesity_pre = pd.Series(X['Obesity'])
scatter_df = pd.concat(objs=[diabetes_pre, obesity_pre], axis=1)

# Scatter of obesity prevalence (x) against diabetes prevalence (y)
fig = px.scatter(
    scatter_df,
    x='Obesity',
    y='Target',
    title='Scatter Plot Between Crude Diabetes and Obesity Prevalence',
)

# Larger markers with a dark outline
fig.update_traces(marker={'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}})

# Axis titles and a fixed figure size
fig.update_layout(
    autosize=False,
    width=900,
    height=600,
    xaxis_title="Obesity Prevalence (%)",
    yaxis_title="Diabetes Prevalence (%)",
)

fig.show()

The scatterplot above highlights several points worth noting:
 - There appears to be a positive correlation between obesity prevalence and diabetes prevalence. As the percentage of obesity increases, there is a general trend of increased diabetes prevalence.
 - The data points are densely clustered in the area where obesity prevalence ranges between approximately 20% and 40%, with the corresponding diabetes prevalence mostly falling between 10% and 20%.
 - A few outliers can be observed, particularly in the higher ranges of both obesity and diabetes prevalence. These outliers indicate that there may be instances where the prevalence of diabetes is high despite a lower prevalence of obesity, and vice versa.
 - The trend suggests that while obesity is likely a significant factor in diabetes prevalence, there are other factors at play, given that the relationship is not perfectly linear.


In summary, the scatter plot suggests a positive association between obesity and diabetes prevalence, supporting the notion that obesity is one of the risk factors for diabetes. However, the variation in the data also indicates the influence of additional factors contributing to the prevalence of diabetes that merit further investigation.

What are the correlations among all numerical variables, including the target variable?

In [None]:
# Add the year column to the target/obesity frame
# (relies on `scatter_df` from the scatter-plot cell and `X` from preprocessing)
corr_df = pd.concat(objs=[scatter_df, X['YearStart']], axis=1)

# Pairwise correlation between the columns
corr_matrix = corr_df.corr()

# Annotated heatmap of the correlation matrix
axis_labels = corr_matrix.columns
fig = px.imshow(
    corr_matrix,
    x=axis_labels,
    y=axis_labels,
    labels={'x': "Features", 'y': "Features", 'color': "Correlation"},
    text_auto=True,
    title="Correlation Heatmap of Features",
)

# Fixed square figure
fig.update_layout(autosize=False, width=700, height=700)

fig.show()

The target variable shows a modest positive correlation with obesity, indicated by a correlation coefficient of approximately 0.277. This suggests that as obesity prevalence increases, there is a slight tendency for the target variable to increase as well. However, the correlation is not strong, pointing to the influence of other factors in addition to obesity. There is a negligible correlation between the target variable and the year, as well as between obesity and the year, with coefficients close to 0.059 and 0.089, respectively. This indicates that there is very little to no linear relationship between the year the data was collected and both obesity prevalence and the target variable. The low correlations involving the year suggest that time, as represented by the YearStart variable, does not have a significant linear effect on the prevalence of obesity or the target outcome within the range of years studied.

Does the distribution of obesity prevalence by state align with diabetes prevalence?

In [None]:
# Filter to 2019 BRFSS rows for "Obesity among adults", overall population,
# reported as crude prevalence.
# .copy() makes the result an independent frame so the column assignment below
# does not write into a slice of `chronic`.
Obesity_Overall_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Obesity among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Overall') &
    (chronic['DataValueType'] == 'Crude Prevalence')
].copy()

# Drop rows with missing 'DataValue' as they cannot be plotted
Obesity_Overall_2019 = Obesity_Overall_2019.dropna(subset=['DataValue'])

# Convert DataValue to float for plotting
Obesity_Overall_2019['DataValue'] = Obesity_Overall_2019['DataValue'].astype(float)

# Plot the state-level map with Plotly Express; range_color fixes the colour scale to 22-41
fig = px.choropleth(
    Obesity_Overall_2019,
    locations='LocationAbbr',
    locationmode='USA-states',
    color='DataValue',
    color_continuous_scale='Viridis',
    scope='usa',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Overall Obesity Prevalence in USA States',
    hover_name='LocationDesc',
    hover_data={'Stratification1': True, 'DataValue': True},
    range_color = [22, 41]
)

fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()

The data presents a geographical pattern of obesity prevalence that mirrors the distribution observed for diabetes, with the Western United States showing a relatively lower crude prevalence of obesity, whereas higher rates are seen in the Eastern regions. However, there is an exception noted in Florida, where the prevalence of obesity is unexpectedly lower, deviating from the pattern observed in other Eastern states. Additionally, the Northeast also reports a relatively low crude prevalence of obesity. These variations suggest that regional factors, possibly including lifestyle, cultural influences, and socioeconomic conditions, could be contributing to the differences in obesity rates beyond the simple East-West divide. 

How does obesity prevalence among males vary by state? (We assume the 2019 data are representative.)

In [None]:
# Filter to 2019 BRFSS rows for "Obesity among adults", males only, crude prevalence.
# The DataSource condition must be an explicit comparison: using the raw string
# column as a mask only tests truthiness and does not select a data source.
Obesity_Male_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Obesity among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Sex') &
    (chronic['Stratification1'] == 'Male') &
    (chronic['DataValueType'] == 'Crude Prevalence')
]

# Plotting the bar chart using Plotly Express
fig = px.bar(
    Obesity_Male_2019.sort_values('DataValue'),  # Pre-sort rows by value
    x='LocationDesc',  # Use the full location description for clarity
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Obesity Prevalence of Male in USA States',
    color='DataValue',  # Color the bars by the data value
    color_continuous_scale='Viridis',  # Same colour scale as the other 2019 charts
    hover_data=['LocationAbbr', 'DataValue']
)

fig.update_layout(
    xaxis_title="State",
    yaxis_title="Obesity Prevalence (%)",
    xaxis={'categoryorder':'total descending'},  # Bars are displayed from highest to lowest value
    coloraxis_colorbar={
        'title': 'Prevalence %'
    },
    margin={"r":0,"t":30,"l":0,"b":0}
)

fig.show()

Given the established connection between obesity and diabetes, the states with higher obesity rates, such as West Virginia and Tennessee, also face greater challenges with diabetes management and prevention among their male residents, which we can tell from previous plots. The data underscores the importance of targeted interventions aimed at obesity reduction, which could, in turn, have a beneficial impact on diabetes prevalence rates. These findings also indicate that lifestyle, dietary habits, and healthcare accessibility likely play roles in the varying rates of obesity and consequently diabetes across the states.

How does obesity prevalence among females vary by state? (We assume the 2019 data are representative.)

In [None]:
# Filter to 2019 BRFSS rows for "Obesity among adults", females only, crude prevalence.
# The DataSource condition must be an explicit comparison: using the raw string
# column as a mask only tests truthiness and does not select a data source.
Obesity_Female_2019 = chronic[
    (chronic['YearStart'] == 2019) &
    (chronic['Question'] == 'Obesity among adults') &
    (chronic['DataSource'] == 'BRFSS') &
    (chronic['StratificationCategory1'] == 'Sex') &
    (chronic['Stratification1'] == 'Female') &
    (chronic['DataValueType'] == 'Crude Prevalence')
]

# Plotting the bar chart using Plotly Express
fig = px.bar(
    Obesity_Female_2019.sort_values('DataValue'),  # Pre-sort rows by value
    x='LocationDesc',  # Use the full location description for clarity
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)'},
    title='2019 Crude Obesity Prevalence of Female in USA States',
    color='DataValue',  # Color the bars by the data value
    color_continuous_scale='Viridis',  # Same colour scale as the other 2019 charts
    hover_data=['LocationAbbr', 'DataValue']
)

fig.update_layout(
    xaxis_title="State",
    yaxis_title="Obesity Prevalence (%)",
    xaxis={'categoryorder':'total descending'},  # Bars are displayed from highest to lowest value
    coloraxis_colorbar={
        'title': 'Prevalence %'
    },
    margin={"r":0,"t":30,"l":0,"b":0}
)

fig.show()

When relating this data to diabetes, it is crucial to consider that obesity is a significant risk factor for the condition. The higher obesity prevalence in states like Mississippi and West Virginia suggests these areas might also see higher rates of diabetes among women. The chart underscores the necessity for public health initiatives that address obesity, which may also help mitigate diabetes rates. Additionally, it highlights the importance of addressing the socioeconomic and lifestyle factors that contribute to the disparity in obesity rates between states.

What is the distribution of obesity prevalence by race?

In [None]:
# Crude obesity prevalence rows stratified by race/ethnicity, across all locations and years
race_mask = (
    (chronic['Question'] == 'Obesity among adults')
    & (chronic['StratificationCategory1'] == 'Race/Ethnicity')
    & (chronic['DataValueType'] == 'Crude Prevalence')
)
# Independent copy so the column assignment below does not touch `chronic`
Obesity_By_Race = chronic[race_mask].copy()

# Coerce DataValue to numeric; unparseable entries become NaN
Obesity_By_Race['DataValue'] = pd.to_numeric(Obesity_By_Race['DataValue'], errors='coerce')

# Category order = order of first appearance in the filtered data
race_order = Obesity_By_Race['Stratification1'].unique().tolist()

# Box plot of prevalence per race/ethnicity group, one colour per group
fig = px.box(
    Obesity_By_Race,
    x='Stratification1',
    y='DataValue',
    labels={'DataValue': 'Crude Prevalence (%)', 'Stratification1': 'Race/Ethnicity'},
    title='Obesity Prevalence by Race',
    color='Stratification1',
    category_orders={"Stratification1": race_order},
)

# Axis titles and margins
# NOTE(review): colours here are discrete per group, so the colour-bar title
# presumably has no visible effect - confirm before relying on it
fig.update_layout(
    xaxis_title="Race/Ethnicity",
    yaxis_title="Obesity Prevalence (%)",
    coloraxis_colorbar=dict(title='Prevalence %'),
    margin=dict(r=0, t=50, l=0, b=0),
)

fig.show()

 - American Indian or Alaska Native, non-Hispanic individuals exhibit the third highest median obesity prevalence, with values about 39.6%. The distribution for this group also shows a substantial range, indicating varied obesity rates within this population.
 - Non-Hispanic Black individuals have the second highest median obesity prevalence, slightly above that of the American Indian or Alaska Native group. This group also shows a wide interquartile range (IQR), denoting variability within the population.
 - Hispanic individuals show a median obesity prevalence in a similar range to that of non-Hispanic Black individuals, with a somewhat narrower IQR, indicating a less varied spread within this demographic.
 - Non-Hispanic White and Multiracial groups have comparatively lower median obesity prevalence, with Multiracial individuals demonstrating a narrower IQR, suggesting more consistency in obesity rates within this group.
 - The lowest median obesity prevalence is observed in the non-Hispanic Asian group, which also shows particularly low variability.
 - The highest median obesity prevalence is observed in Hawaiian or Pacific Islander groups.


When connecting these findings to diabetes, it's important to consider that obesity is a known risk factor for developing diabetes. The groups with higher obesity prevalence — such as Hawaiian or Pacific Islander, American Indian or Alaska Native, and non-Hispanic Black populations — could be at a correspondingly higher risk for diabetes. This is consistent with the earlier scatter plot analysis, which indicated a positive correlation between obesity and diabetes prevalence. The data suggests that targeted interventions to reduce obesity in these high-prevalence groups could also impact diabetes prevalence and overall public health outcomes.

Among different age groups, what is the distribution of diabetes prevalence by gender?

In [None]:
# NOTE: each row carries a single stratification (StratificationCategory1 is
# either 'Age' or 'Sex', never both), so prevalence for a specific age-by-sex
# combination cannot be derived from this data. Merging the age rows with the
# sex rows on year/location pairs every age group with every sex and merely
# repeats the sex-level mean in each age row. This cell therefore reports the
# mean prevalence of each age group and of each sex separately.

# Crude diabetes prevalence for the age and sex stratifications
diabetes_data = chronic[
    (chronic['Question'] == 'Diabetes among adults') &
    (chronic['StratificationCategory1'].isin(['Sex', 'Age'])) &
    (chronic['DataValueType'] == 'Crude Prevalence')  # do not average crude and age-adjusted values together
].copy()

# Make sure the values are numeric before averaging (unparseable entries become NaN)
diabetes_data['DataValue'] = pd.to_numeric(diabetes_data['DataValue'], errors='coerce')

# Mean prevalence per stratum in tidy (long) form, which Plotly Express expects
plotly_data = (
    diabetes_data
    .dropna(subset=['DataValue'])
    .groupby(['StratificationCategory1', 'Stratification1'], as_index=False)['DataValue']
    .mean()
    .rename(columns={'DataValue': 'Prevalence'})
)

# One horizontal bar per stratum, coloured by the stratification it belongs to
fig = px.bar(
    plotly_data,
    x='Prevalence',
    y='Stratification1',
    color='StratificationCategory1',
    orientation='h',
    text_auto='.2f',
    labels={
        'Prevalence': 'Mean Crude Prevalence (%)',
        'Stratification1': 'Group',
        'StratificationCategory1': 'Stratification'
    },
    title="Mean Diabetes Prevalence by Age Group and by Gender"
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600
)

fig.show()

The figure summarizes the prevalence of diabetes for the age and gender stratifications. These numbers need careful interpretation: each record in the dataset is stratified by either age or sex, never both, so the data cannot provide the prevalence for a specific age–sex combination. Pairing the age records with the sex records by year and location simply repeats the sex-level averages for every age group, which is why the values obtained are virtually identical across age brackets: 11.71% for males and 10.85% for females at ages 18-44, 11.70% compared to 10.87% in the 45-64 age range, and 11.70% and 10.87% in the senior population, aged 65 and above. What the data does support is that, on average, males show a higher prevalence of diabetes than females (about 11.7% versus 10.9%), which underlines the importance of considering gender differences in diabetes risk assessments and prevention strategies. Whether this gap holds within each age group cannot be determined from this dataset.

Among different ethnicity groups, what is the distribution of diabetes prevalence by gender?

In [None]:
# NOTE: each row carries a single stratification (StratificationCategory1 is
# either 'Race/Ethnicity' or 'Sex', never both), so prevalence for a specific
# ethnicity-by-sex combination cannot be derived from this data. Merging the
# race/ethnicity rows with the sex rows on year/location only averages the
# sex-level values over the years and locations in which each ethnic group has
# data. This cell therefore reports the mean prevalence of each ethnic group
# and of each sex separately.

# Crude diabetes prevalence for the race/ethnicity and sex stratifications
diabetes_data = chronic[
    (chronic['Question'] == 'Diabetes among adults') &
    (chronic['StratificationCategory1'].isin(['Sex', 'Race/Ethnicity'])) &
    (chronic['DataValueType'] == 'Crude Prevalence')  # do not average crude and age-adjusted values together
].copy()

# Make sure the values are numeric before averaging (unparseable entries become NaN)
diabetes_data['DataValue'] = pd.to_numeric(diabetes_data['DataValue'], errors='coerce')

# Mean prevalence per stratum in tidy (long) form, which Plotly Express expects
plotly_data = (
    diabetes_data
    .dropna(subset=['DataValue'])
    .groupby(['StratificationCategory1', 'Stratification1'], as_index=False)['DataValue']
    .mean()
    .rename(columns={'DataValue': 'Prevalence'})
)

# One horizontal bar per stratum, coloured by the stratification it belongs to
fig = px.bar(
    plotly_data,
    x='Prevalence',
    y='Stratification1',
    color='StratificationCategory1',
    orientation='h',
    text_auto='.2f',
    labels={
        'Prevalence': 'Mean Crude Prevalence (%)',
        'Stratification1': 'Group',
        'StratificationCategory1': 'Stratification'
    },
    title="Mean Diabetes Prevalence by Race/Ethnicity and by Gender"
)

fig.update_layout(
    autosize=False,
    width=800,
    height=900
)

fig.show()

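The figure summarizes diabetes prevalence for the gender and ethnicity stratifications. When the sex-level records are paired with each ethnic group's records by year and location, the average for males is higher than the average for females in every ethnic category, and the size of the gap varies between groups. These values need careful interpretation: each record in the dataset is stratified by either sex or race/ethnicity, never both, so the male and female figures quoted below for a given ethnic group are the overall male and female prevalence averaged over the years and locations in which that ethnic group has data, not the prevalence among men or women of that ethnicity.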

For American Indian or Alaska Native, non-Hispanic individuals, the prevalence in males is marginally higher than in females, with males at approximately 10.83% compared to females at about 9.66%. The Asian, non-Hispanic group shows a similar trend, with males having a prevalence of around 11.18% while females are at roughly 9.77%. Non-Hispanic Black males exhibit a prevalence of 11.48%, which is higher than that of their female counterparts at 10.59%. Hawaiian or Pacific Islander, non-Hispanic males show the most considerable difference, with a prevalence of 12.44%, significantly higher than the 10.80% seen in females.

Hispanic, Multiracial, and White, non-Hispanic groups all show a consistent pattern with higher diabetes prevalence in males than in females. For Hispanics, it is approximately 11.10% in males versus 10.02% in females; for Multiracial individuals, about 11.27% in males compared to 10.09% in females; and for White, non-Hispanics, around 11.03% in males against 10.00% in females.

The heatmap reinforces the importance of gender consideration in addressing and managing diabetes prevalence, as well as the need for culturally sensitive approaches given the variability across different ethnic groups.