In [17]:
import requests
import pandas as pd

# Define the base URL for the CDC API
url = "https://data.cdc.gov/resource/waxm-p5qv.json"

# Send a GET request to the CDC API
try:
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Load data into a DataFrame if the request was successful
    data = response.json()
    df = pd.DataFrame(data)
    
    # Display the first few rows of the dataset
    print(df.head())

except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")


   year locationabbr                                       locationdesc  \
0  2010        15804                   Camden, NJ Metropolitan Division   
1  2010        40380        Rochester, NY Metropolitan Statistical Area   
2  2010        20500  Durham-Chapel Hill, NC Metropolitan Statistica...   
3  2010        23104     Fort Worth-Arlington, TX Metropolitan Division   
4  2010        41940  San Jose-Sunnyvale-Santa Clara, CA Metropolita...   

           class           topic                     question   response  \
0  Health Status  Overall Health  How is your general health?  Excellent   
1  Health Status  Overall Health  How is your general health?  Excellent   
2  Health Status  Overall Health  How is your general health?  Excellent   
3  Health Status  Overall Health  How is your general health?  Excellent   
4  Health Status  Overall Health  How is your general health?  Excellent   

  break_out break_out_category sample_size  ...  classid  topicid locationid  \
0   Overall 

In [21]:
# Check for columns with dictionary or list entries
for column in df.columns:
    if isinstance(df[column][0], dict):
        # Flatten the dictionary column into separate columns
        expanded_df = pd.json_normalize(df[column])
        df = pd.concat([df.drop(columns=[column]), expanded_df], axis=1)
    elif isinstance(df[column][0], list):
        # Convert list columns to strings, or handle differently if needed
        df[column] = df[column].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

# Now df should be free of unhashable types and ready for further processing
print(df.head())

   year locationabbr                                       locationdesc  \
0  2010        15804                   Camden, NJ Metropolitan Division   
1  2010        40380        Rochester, NY Metropolitan Statistical Area   
2  2010        20500  Durham-Chapel Hill, NC Metropolitan Statistica...   
3  2010        23104     Fort Worth-Arlington, TX Metropolitan Division   
4  2010        41940  San Jose-Sunnyvale-Santa Clara, CA Metropolita...   

           class           topic                     question   response  \
0  Health Status  Overall Health  How is your general health?  Excellent   
1  Health Status  Overall Health  How is your general health?  Excellent   
2  Health Status  Overall Health  How is your general health?  Excellent   
3  Health Status  Overall Health  How is your general health?  Excellent   
4  Health Status  Overall Health  How is your general health?  Excellent   

  break_out break_out_category sample_size  ...  topicid locationid  \
0   Overall          

In [29]:
print(df.columns) 

Index(['year', 'locationabbr', 'locationdesc', 'class', 'topic', 'question',
       'response', 'break_out', 'break_out_category', 'sample_size',
       'data_value', 'confidence_limit_low', 'confidence_limit_high',
       'display_order', 'data_value_unit', 'data_value_type', 'datasource',
       'classid', 'topicid', 'locationid', 'breakoutid', 'breakoutcategoryid',
       'questionid', 'responseid', 'data_value_footnote', 'latitude',
       'longitude'],
      dtype='object')


In [27]:
# Inspect data types, missing values, and duplicates
print(df.info())
print(df.head())
print(df.duplicated().sum())

# Handle missing values
df.drop(columns=['data_value_footnote_symbol'], inplace=True)
df['numeric_column'] = df['numeric_column'].fillna(df['numeric_column'].mean())
df['categorical_column'] = df['categorical_column'].fillna('Unknown')

# Convert data types
df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')
df['numeric_column'] = pd.to_numeric(df['numeric_column'], errors='coerce')

# Standardize categorical values
df['category_column'] = df['category_column'].str.lower()
df['category_column'] = df['category_column'].astype('category')

# Remove outliers (example using z-scores)
from scipy import stats
df = df[(np.abs(stats.zscore(df['numeric_column'])) < 3)]

# Rename columns
df.rename(columns={'old_name': 'new_name'}, inplace=True)

# Save the cleaned dataset
df.to_csv("cleaned_cdc_data.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   year                        1000 non-null   object
 1   locationabbr                1000 non-null   object
 2   locationdesc                1000 non-null   object
 3   class                       1000 non-null   object
 4   topic                       1000 non-null   object
 5   question                    1000 non-null   object
 6   response                    1000 non-null   object
 7   break_out                   1000 non-null   object
 8   break_out_category          1000 non-null   object
 9   sample_size                 1000 non-null   object
 10  data_value                  999 non-null    object
 11  confidence_limit_low        999 non-null    object
 12  confidence_limit_high       999 non-null    object
 13  display_order               1000 non-null   objec

KeyError: 'numeric_column'