In [40]:
# import all dependencies
import pandas as pd
import requests
import json
import scipy.stats as st
from scipy.stats import linregress
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas

# store and import any api keys needed
#from angie_config.py import geopaify_key ### this is not working?? ###

# turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [2]:
# read health data from csv
health_df = pd.read_csv('obesity_diabetes_csv.csv')



In [3]:
# revise csv to show only needed columns
health_df = health_df[['Year', 'LocationName', 'Data_Value', 'Geolocation', 'Short_Question_Text']]

# split geolocation column into long/lat columns
health_df[['point','Longitude', 'Latitude']] = health_df['Geolocation'].str.split(' ', expand=True)

#clean up new long/lat columns
health_df['Longitude'] = health_df['Longitude'].str.replace("(", "")
health_df['Latitude'] = health_df['Latitude'].str.replace(")", "")

health_df.head()




Unnamed: 0,Year,LocationName,Data_Value,Geolocation,Short_Question_Text,point,Longitude,Latitude
0,2021,73002,13.1,POINT (-97.73097606 34.94935021),Diabetes,POINT,-97.73097606,34.94935021
1,2021,73002,40.6,POINT (-97.73097606 34.94935021),Obesity,POINT,-97.73097606,34.94935021
2,2021,73003,8.9,POINT (-97.49717346 35.66897845),Diabetes,POINT,-97.49717346,35.66897845
3,2021,73003,34.4,POINT (-97.49717346 35.66897845),Obesity,POINT,-97.49717346,35.66897845
4,2021,73004,10.2,POINT (-97.905542 35.13636931),Diabetes,POINT,-97.905542,35.13636931


In [4]:
# Clean up! Delete unnecessary columns, rename columns for useability, and reorder columns
health_df = health_df.drop(['Geolocation', 'point'], axis=1)

In [5]:
# Rename columns for useability
health_df = health_df.rename(columns={"LocationName":"zip_code", "Short_Question_Text":"Indicator"})

health_df.head()




Unnamed: 0,Year,zip_code,Data_Value,Indicator,Longitude,Latitude
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021
1,2021,73002,40.6,Obesity,-97.73097606,34.94935021
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845
3,2021,73003,34.4,Obesity,-97.49717346,35.66897845
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931


In [7]:
# TEST BOX- Pull grocery stores for a single location to check data

latitude = 36.11893133
longitude = -95.806359
zipcode = 74134

categories = "commercial.supermarket"
radius = 8000

filters = f"circle:{longitude},{latitude},{radius}"
MyapiKey = "a81866f6eb0e4a30a1e9f0301a460b49"

params = {
    "categories":categories,
    "filter":filters,
    "apiKey":MyapiKey
    
}


base_url = "https://api.geoapify.com/v2/places"

response = requests.get(base_url, params=params)

grocery_data = response.json()
print(json.dumps(grocery_data, indent=4, sort_keys=True))


{
    "features": [
        {
            "geometry": {
                "coordinates": [
                    -95.77579920672153,
                    36.07625146375778
                ],
                "type": "Point"
            },
            "properties": {
                "address_line1": "Walmart Neighborhood Market",
                "address_line2": "1300 East Albany Street, Broken Arrow, OK 74012, United States of America",
                "categories": [
                    "building",
                    "building.commercial",
                    "commercial",
                    "commercial.supermarket",
                    "wheelchair",
                    "wheelchair.yes"
                ],
                "city": "Broken Arrow",
                "country": "United States",
                "country_code": "us",
                "county": "Tulsa County",
                "datasource": {
                    "attribution": "\u00a9 OpenStreetMap contributors",
                    

In [12]:
#create a smaller df to reduce the pull time for geoapify

#copy health_df & remove duplicated zip codes
zips_df = health_df.drop_duplicates(subset=['zip_code'], keep='first')

#add blank column to store count of grocery stores
zips_df["Store Count"] = ""

zips_df.head()


Unnamed: 0,Year,zip_code,Data_Value,Indicator,Longitude,Latitude,Store Count
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021,
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845,
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931,
6,2021,73005,15.3,Diabetes,-98.23818097,35.05783297,
8,2021,73006,13.7,Diabetes,-98.40269923,34.90892372,


In [13]:
#NOTE: this takes 8-10 minutes to run

#use zips dataframe to find grocery stores within 5 miles (about 8000 m)

# set parameters
radius = 8000

categories = "commercial.supermarket"

params = {
    "categories":categories,
    "apiKey":MyapiKey
}


for index, row in zips_df.iterrows():
    #get lat and long
    latitude = zips_df.loc[index, "Latitude"]
    longitude = zips_df.loc[index, "Longitude"]
    
    #add filter & bias params with lat/long to params dictionary
    params["filter"] = f"circle:{longitude},{latitude},{radius}"
    params["bias"] = f"proximity:{longitude},{latitude}"
    
    #set base url
    base_url = "https://api.geoapify.com/v2/places"
    
    #make request using params dictionary
    grocery_stores = requests.get(base_url, params=params)
    
    #convert to json
    grocery_stores = grocery_stores.json()
    
    #append list
    zips_df.loc[index, "Store Count"] = (len(grocery_stores["features"]))

    

In [14]:
# display sample of new DF
zips_df.head()


Unnamed: 0,Year,zip_code,Data_Value,Indicator,Longitude,Latitude,Store Count
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021,0
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845,12
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931,0
6,2021,73005,15.3,Diabetes,-98.23818097,35.05783297,1
8,2021,73006,13.7,Diabetes,-98.40269923,34.90892372,0


In [28]:
# merge zips df with health_df

full_health_df = pd.merge(health_df, zips_df, on='zip_code')

full_health_df.head()


Unnamed: 0,Year_x,zip_code,Data_Value_x,Indicator_x,Longitude_x,Latitude_x,Year_y,Data_Value_y,Indicator_y,Longitude_y,Latitude_y,Store Count
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021,2021,13.1,Diabetes,-97.73097606,34.94935021,0
1,2021,73002,40.6,Obesity,-97.73097606,34.94935021,2021,13.1,Diabetes,-97.73097606,34.94935021,0
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845,2021,8.9,Diabetes,-97.49717346,35.66897845,12
3,2021,73003,34.4,Obesity,-97.49717346,35.66897845,2021,8.9,Diabetes,-97.49717346,35.66897845,12
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931,2021,10.2,Diabetes,-97.905542,35.13636931,0


In [29]:
# remove duplicated y-columns
full_health_df = full_health_df.drop(['Year_y', 'Data_Value_y', 'Indicator_y', 'Longitude_y', 'Latitude_y'], axis=1)

#rename columns to drop 'x'
full_health_df = full_health_df.rename(columns = {
    "Year_x":"Year",
    "Data_Value_x":"Data_Value",
    "Indicator_x":"Indicator",
    "Longitude_x":"Longitude",
    "Latitude_x":"Latitude"
})

full_health_df.head()


Unnamed: 0,Year,zip_code,Data_Value,Indicator,Longitude,Latitude,Store Count
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021,0
1,2021,73002,40.6,Obesity,-97.73097606,34.94935021,0
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845,12
3,2021,73003,34.4,Obesity,-97.49717346,35.66897845,12
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931,0


In [30]:
# split data frame into one for obesity and one for diabetes
obesity_df = pd.DataFrame(full_health_df.loc[full_health_df['Indicator'] == "Obesity", :])
diabetes_df = pd.DataFrame(full_health_df.loc[full_health_df['Indicator'] == "Diabetes", :])

#display samples
obesity_df.head()
diabetes_df.head()




Unnamed: 0,Year,zip_code,Data_Value,Indicator,Longitude,Latitude,Store Count
0,2021,73002,13.1,Diabetes,-97.73097606,34.94935021,0
2,2021,73003,8.9,Diabetes,-97.49717346,35.66897845,12
4,2021,73004,10.2,Diabetes,-97.905542,35.13636931,0
6,2021,73005,15.3,Diabetes,-98.23818097,35.05783297,1
8,2021,73006,13.7,Diabetes,-98.40269923,34.90892372,0


In [34]:
# maps- grocery store count
grocery_map = obesity_df.hvplot.points('Longitude', 'Latitude', geo = True, tiles = "OSM")

grocery_map


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

:Overlay
   .Tiles.I  :Tiles   [x,y]
   .Points.I :Points   [Longitude,Latitude]

In [35]:
# maps- obesity count
obesity_map = obesity_df.hvplot.points("Longitude", "Latitude", geo = True, tiles = "OSM", size = "Data_Value",
                                      color = 'blue', hover_cols = ['zip_code', 'Data_Value'])

obesity_map


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

:Overlay
   .Tiles.I  :Tiles   [x,y]
   .Points.I :Points   [Longitude,Latitude]   (Data_Value,zip_code)

In [36]:
# maps- diabetes count
diabetes_map = diabetes_df.hvplot.points("Longitude", "Latitude", geo = True, tiles = "OSM", size = "Data_Value",
                                      color = 'blue', hover_cols = ['zip_code', 'Data_Value'])

diabetes_map


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

:Overlay
   .Tiles.I  :Tiles   [x,y]
   .Points.I :Points   [Longitude,Latitude]   (Data_Value,zip_code)

In [41]:
# scatter plots and linear regressions
# define a function for reg plots to use for diabetes and obesity

def reg_plot():
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,(100, 100),xycoords='figure pixels',fontsize=15,color="red")
    plt.xlabel('Store Count (<= 5 mi)') #this will stay the same for all plotsd, keep in code
    plt.ylabel(y_label) #this will change for each north/south set, so the variable will be defined before the function is run
    print(f"The r-squared is: {rvalue**2}")
    plt.show()



In [42]:
#linear regression- obesity
x_values = obesity_df['Store Count']
y_values = obesity_df['Data_Value']
y_label =  "Obesity Rate"

reg_plot()


AttributeError: 'float' object has no attribute 'shape'

In [43]:
#linear regression- diabetes
x_values = diabetes_df['Store Count']
y_values = diabetes_df['Data_Value']
y_label =  "Diabetes Rate"

reg_plot()

AttributeError: 'float' object has no attribute 'shape'