In [3]:
# import libraries
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd 
import plotly.express as px

In [4]:
# set the url for the data
a_url = "https://chronicdata.cdc.gov/resource/jiwm-ppbh.json"

In [5]:
# set the parameters for the data
limit = 100000
offset = 0 
params = {"$limit": limit, "$offset": offset}

# get the data
response = requests.get(a_url, params=params)
data = response.json()


In [6]:
df = pd.DataFrame(data)
df_kb= df[(df['stratification1'] == 'Overall') & (df['stratification2'] == 'Overall')]

In [7]:
df_kb.dropna(subset=['y_lat', 'x_lon'], inplace=True)
df_kb['data_value'] = df_kb['data_value'].astype(float)
df_kb['x_lon'] = df_kb['x_lon'].astype(float)
df_kb['y_lat'] = df_kb['y_lat'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kb.dropna(subset=['y_lat', 'x_lon'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kb['data_value'] = df_kb['data_value'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kb['x_lon'] = df_kb['x_lon'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [8]:
df_kb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3282 entries, 0 to 59076
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   year                        3282 non-null   object 
 1   locationabbr                3282 non-null   object 
 2   locationdesc                3282 non-null   object 
 3   geographiclevel             3282 non-null   object 
 4   datasource                  3282 non-null   object 
 5   class                       3282 non-null   object 
 6   topic                       3282 non-null   object 
 7   data_value                  3275 non-null   float64
 8   data_value_unit             3282 non-null   object 
 9   data_value_type             3282 non-null   object 
 10  stratificationcategory1     3282 non-null   object 
 11  stratification1             3282 non-null   object 
 12  stratificationcategory2     3282 non-null   object 
 13  stratification2             3282

In [9]:
new_df = df_kb[['y_lat', 'x_lon', 'data_value']].copy()
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3282 entries, 0 to 59076
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   y_lat       3282 non-null   float64
 1   x_lon       3282 non-null   float64
 2   data_value  3275 non-null   float64
dtypes: float64(3)
memory usage: 102.6 KB


In [10]:
n_df = df_kb[['locationabbr', 'locationdesc','y_lat', 'x_lon', 'data_value']].copy()
unique_locations = n_df.groupby('locationabbr').agg({'y_lat': 'first', 'x_lon': 'first', 'data_value': 'mean'}).reset_index()
unique_locations.head(5)

Unnamed: 0,locationabbr,y_lat,x_lon,data_value
0,AK,55.442393,-161.95993,258.37037
1,AL,32.539426,-86.641296,471.527941
2,AR,34.291259,-91.367868,446.952632
3,AS,-14.301754,-170.719474,9.1
4,AZ,35.394866,-109.48687,286.93125


In [12]:
# Tạo custom color scale từ màu đỏ nhạt đến màu đỏ đậm
custom_scale = [
    [0, 'rgb(255, 255, 255)'],  # Màu trắng cho giá trị nhỏ
    [265 / unique_locations['data_value'].max(), 'rgb(255, 230, 230)'],  # Màu đỏ nhạt cho giá trị < 265
    [1, 'rgb(255,0,0)']  # Màu đỏ đậm cho giá trị > 300
]

fig = px.choropleth(
    unique_locations,
    color=unique_locations['data_value'],
    locations=unique_locations['locationabbr'],
    locationmode="USA-states",
    scope="usa",
    color_continuous_scale=custom_scale,
    range_color=[unique_locations['data_value'].min(), unique_locations['data_value'].max()],  # Đảm bảo mọi giá trị đều được hiển thị
    hover_name=unique_locations['locationabbr'],
    title='USA',
    width=1000,
    height=600,
)

fig.show()


In [13]:
df.columns

Index(['year', 'locationabbr', 'locationdesc', 'geographiclevel', 'datasource',
       'class', 'topic', 'data_value', 'data_value_unit', 'data_value_type',
       'stratificationcategory1', 'stratification1', 'stratificationcategory2',
       'stratification2', 'topicid', 'locationid', 'y_lat', 'x_lon',
       'data_value_footnote_symbol', 'data_value_footnote'],
      dtype='object')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59094 entries, 0 to 59093
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   year                        59094 non-null  object
 1   locationabbr                59094 non-null  object
 2   locationdesc                59094 non-null  object
 3   geographiclevel             59094 non-null  object
 4   datasource                  59094 non-null  object
 5   class                       59094 non-null  object
 6   topic                       59094 non-null  object
 7   data_value                  33087 non-null  object
 8   data_value_unit             59094 non-null  object
 9   data_value_type             59094 non-null  object
 10  stratificationcategory1     59094 non-null  object
 11  stratification1             59094 non-null  object
 12  stratificationcategory2     59094 non-null  object
 13  stratification2             59094 non-null  ob

In [15]:
# check missing values
df.isnull().sum()

year                              0
locationabbr                      0
locationdesc                      0
geographiclevel                   0
datasource                        0
class                             0
topic                             0
data_value                    26007
data_value_unit                   0
data_value_type                   0
stratificationcategory1           0
stratification1                   0
stratificationcategory2           0
stratification2                   0
topicid                           0
locationid                        0
y_lat                            18
x_lon                            18
data_value_footnote_symbol    33087
data_value_footnote           33087
dtype: int64

In [16]:
df = df.dropna(subset=['data_value'])

In [17]:
df = df.drop(columns=['geographiclevel', 'datasource', 'class', 'topic', 'data_value_footnote_symbol', 'data_value_footnote', 'stratificationcategory1', 'stratificationcategory2', 'stratificationcategory3', 'topicid'])
df

KeyError: "['stratificationcategory3'] not found in axis"

In [None]:
column_names = {'stratification1': 'age_group', 'stratification2': 'race/ethnicityage_group', 'stratification3': 'sex'}
df = df.rename(columns=column_names)
df

Unnamed: 0,locationid,year,locationabbr,locationdesc,data_value_unit,data_value_type,age_group,race/ethnicityage_group,sex,x_long,y_lat,data_value,confidence_limit_low,confidence_limit_high
44,01001,2000,AL,Autauga,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,76.9,61.5,99.6
45,01001,2001,AL,Autauga,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,83.4,69,126.7
46,01001,2002,AL,Autauga,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,86.2,66.7,122.1
47,01001,2003,AL,Autauga,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,81.6,58.5,117
48,01001,2004,AL,Autauga,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,84.7,63.7,114.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999973,51053,2019,VA,Dinwiddie,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 65+ years,Black,Overall,-77.63220543,37.0760146,334.5,272.5,459.4
999974,51053,2000-2010,VA,Dinwiddie,%,Total percent change,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,-14,-39.8,14.8
999975,51053,2010-2019,VA,Dinwiddie,%,Total percent change,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,-13,-34.6,20.1
999998,51053,2000,VA,Dinwiddie,"per 100,000","Age-Standardized, Spatiotemporally Smoothed Rate",Ages 65+ years,Overall,Female,-77.63220543,37.0760146,433.1,342.3,525.8


In [None]:
total_percent_df = df[df['data_value_type'] == 'Total percent change'] 
total_percent_df = total_percent_df.drop(columns=['data_value_type', 'data_value_unit'])
total_percent_df.reset_index(drop=True, inplace=True)
total_percent_df

Unnamed: 0,locationid,year,locationabbr,locationdesc,age_group,race/ethnicityage_group,sex,x_long,y_lat,data_value,confidence_limit_low,confidence_limit_high
0,01001,2000-2010,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,-7.5,-37.7,31.7
1,01001,2010-2019,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,2.6,-25.8,31.2
2,01001,2000-2010,AL,Autauga,Ages 35-64 years,Overall,Female,-86.64295911,32.53530561,17.1,-19.8,73.5
3,01001,2010-2019,AL,Autauga,Ages 35-64 years,Overall,Female,-86.64295911,32.53530561,8.3,-19.1,62.1
4,01001,2000-2010,AL,Autauga,Ages 35-64 years,Overall,Male,-86.64295911,32.53530561,19.7,-10.2,98
...,...,...,...,...,...,...,...,...,...,...,...,...
47295,51053,2010-2019,VA,Dinwiddie,Ages 35-64 years,Overall,Overall,-77.63220543,37.0760146,20.4,-9.6,62.2
47296,51053,2000-2010,VA,Dinwiddie,Ages 35-64 years,White,Overall,-77.63220543,37.0760146,-20.1,-47.7,50.3
47297,51053,2010-2019,VA,Dinwiddie,Ages 35-64 years,White,Overall,-77.63220543,37.0760146,32.6,-4.4,99.1
47298,51053,2000-2010,VA,Dinwiddie,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,-14,-39.8,14.8


In [None]:
smoothed_rate_df = df[df['data_value_type'] == 'Age-Standardized, Spatiotemporally Smoothed Rate']
smoothed_rate_df = smoothed_rate_df.drop(columns=['data_value_type', 'data_value_unit'])
smoothed_rate_df.reset_index(drop=True, inplace=True)
smoothed_rate_df

Unnamed: 0,locationid,year,locationabbr,locationdesc,age_group,race/ethnicityage_group,sex,x_long,y_lat,data_value,confidence_limit_low,confidence_limit_high
0,01001,2000,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,76.9,61.5,99.6
1,01001,2001,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,83.4,69,126.7
2,01001,2002,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,86.2,66.7,122.1
3,01001,2003,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,81.6,58.5,117
4,01001,2004,AL,Autauga,Ages 35-64 years,Black,Overall,-86.64295911,32.53530561,84.7,63.7,114.6
...,...,...,...,...,...,...,...,...,...,...,...,...
472677,51053,2017,VA,Dinwiddie,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,338.8,286,402.8
472678,51053,2018,VA,Dinwiddie,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,321.9,261.5,415.1
472679,51053,2019,VA,Dinwiddie,Ages 65+ years,Black,Overall,-77.63220543,37.0760146,334.5,272.5,459.4
472680,51053,2000,VA,Dinwiddie,Ages 65+ years,Overall,Female,-77.63220543,37.0760146,433.1,342.3,525.8


In [None]:
total_percent_df.to_csv('Data/total_percent_df.csv', index=False)
smoothed_rate_df.to_csv('Data/smoothed_rate_df.csv', index=False)