In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

Let's first bring in and explore Farmers Market data

In [None]:
# Load the farmers market CSV into a DataFrame.
fmarket_df = pd.read_csv(filepath_or_buffer="../data/DOHMH_Farmers_Markets.csv")

In [None]:
# Preview the first five rows.
fmarket_df.head(n=5)

In [None]:
# Call info(): the bare attribute only echoes the bound method instead of
# printing the column / non-null / dtype summary.
fmarket_df.info()

In [None]:
# (rows, columns) of the farmers market frame.
fmarket_df.shape

In [None]:
# Column dtypes; shown as the cell's output value rather than via print().
fmarket_df.dtypes

In [None]:
# Summary statistics from describe() with its default settings.
fmarket_df.describe()

In [None]:
# Correlate numeric columns only: since pandas 2.0, corr() no longer drops
# text columns silently and raises when it meets one.
fmarket_df.select_dtypes(include="number").corr()

Let's get some info on the Farmers Markets

In [None]:
# Tally of each Accepts_EBT value.
fmarket_df["Accepts_EBT"].value_counts()

In [None]:
# Label every wedge with its category and percentage; an unlabeled pie
# gives no way to tell which slice is which.
ebt_counts = fmarket_df.Accepts_EBT.value_counts()
fig, ax = plt.subplots()
ax.pie(ebt_counts, labels=ebt_counts.index, autopct="%1.1f%%")
ax.set_title("Farmers markets: Accepts_EBT")
plt.show()

In [None]:
# Tally of each Open_Year_Round value.
fmarket_df["Open_Year_Round"].value_counts()

In [None]:
# Label every wedge with its category and percentage so the chart can be
# read without the value_counts table.
year_round_counts = fmarket_df.Open_Year_Round.value_counts()
fig, ax = plt.subplots()
ax.pie(year_round_counts, labels=year_round_counts.index, autopct="%1.1f%%")
ax.set_title("Farmers markets: Open_Year_Round")
plt.show()

In [None]:
# Tally of each Stellar_Cooking_Demonstrations value.
fmarket_df["Stellar_Cooking_Demonstrations"].value_counts()

In [None]:
# Label every wedge with its category and percentage so the chart can be
# read without the value_counts table.
cooking_demo_counts = fmarket_df.Stellar_Cooking_Demonstrations.value_counts()
fig, ax = plt.subplots()
ax.pie(cooking_demo_counts, labels=cooking_demo_counts.index, autopct="%1.1f%%")
ax.set_title("Farmers markets: Stellar_Cooking_Demonstrations")
plt.show()

In [None]:
# Tally of each Food_Activities_for_Kids value.
fmarket_df["Food_Activities_for_Kids"].value_counts()

In [None]:
# Label every wedge with its category and percentage so the chart can be
# read without the value_counts table.
kids_activity_counts = fmarket_df.Food_Activities_for_Kids.value_counts()
fig, ax = plt.subplots()
ax.pie(kids_activity_counts, labels=kids_activity_counts.index, autopct="%1.1f%%")
ax.set_title("Farmers markets: Food_Activities_for_Kids")
plt.show()

95.6% of NYC farmers markets accept EBT and 73.2% are open year round

None of the Farmers Markets have cooking demonstrations or activities for kids.

In [None]:
# Tally of each Days_of_Operation value, most frequent first.
fmarket_df["Days_of_Operation"].value_counts()

Saturday is the most popular day for Farmers Markets to be open in NYC.

In [None]:
# Number of farmers markets per borough.
fmarket_df["Borough"].value_counts()

In [None]:
# Borough is categorical, so draw one bar per borough from its counts
# instead of pushing the labels through histogram binning.
fmarket_borough_counts = fmarket_df.Borough.value_counts()
fig, ax = plt.subplots()
fmarket_borough_counts.plot.bar(ax=ax)
ax.set(title="Farmers markets by borough", xlabel="Borough", ylabel="Count")
plt.show()

Brooklyn has the most farmers markets

Now let's look into the community districts

In [None]:
# Number of farmers markets per community district.
fmarket_df["Community_District"].value_counts()

Brooklyn Community District 16 has the most Farmers Markets in NYC

We will need this community district info later for further analysis, so let's turn it into a dataframe.

In [None]:
# Per-district market counts as a two-column frame; reset_index() already
# returns a DataFrame, so no extra constructor call is needed.
fmarket_cd = fmarket_df["Community_District"].value_counts().reset_index()

In [None]:
# Show the counts frame as produced by reset_index().
fmarket_cd

In [None]:
# Give the two columns readable names (position-based: district, then count).
fmarket_cd = fmarket_cd.set_axis(['Community District', 'Farmers Markets'], axis=1)

In [None]:
# Show the frame again to confirm the new column names.
fmarket_cd

Let's bring in the SNAP Centers data

In [None]:
# Load the SNAP centers CSV into a DataFrame.
snapcenter_df = pd.read_csv(filepath_or_buffer="../data/Directory_of_SNAP_Centers.csv")

In [None]:
# Preview the first five rows.
snapcenter_df.head(n=5)

In [None]:
# Call info(): the bare attribute only echoes the bound method instead of
# printing the column / non-null / dtype summary.
snapcenter_df.info()

In [None]:
# (rows, columns) of the SNAP centers frame.
snapcenter_df.shape

In [None]:
# Column dtypes; shown as the cell's output value rather than via print().
snapcenter_df.dtypes

In [None]:
# Summary statistics from describe() with its default settings.
snapcenter_df.describe()

In [None]:
# Correlate numeric columns only: since pandas 2.0, corr() no longer drops
# text columns silently and raises when it meets one.
snapcenter_df.select_dtypes(include="number").corr()

Let's get some info on the SNAP Centers

In [None]:
# Number of SNAP centers per borough.
snapcenter_df["Borough"].value_counts()

In [None]:
# Borough is categorical, so draw one bar per borough from its counts
# instead of pushing the labels through histogram binning.
snap_borough_counts = snapcenter_df.Borough.value_counts()
fig, ax = plt.subplots()
snap_borough_counts.plot.bar(ax=ax)
ax.set(title="SNAP centers by borough", xlabel="Borough", ylabel="Count")
plt.show()

Brooklyn has the most SNAP Centers

In [None]:
# Number of SNAP centers per community district.
snapcenter_df["Community_District"].value_counts()

Brooklyn Community District 13 has the most SNAP Centers. It's the only district in the city that has more than one.

We will need this community district info later for further analysis, so let's turn it into a dataframe.

In [None]:
# Per-district SNAP center counts as a two-column frame; reset_index()
# already returns a DataFrame, so no extra constructor call is needed.
snapcenter_cd = snapcenter_df["Community_District"].value_counts().reset_index()

In [None]:
# Show the counts frame as produced by reset_index().
snapcenter_cd

In [None]:
# Give the two columns readable names (position-based: district, then count).
snapcenter_cd = snapcenter_cd.set_axis(['Community District', 'SNAP Centers'], axis=1)

In [None]:
# Show the frame again to confirm the new column names.
snapcenter_cd

Let's bring in the Healthy Stores data

In [None]:
# Load the recognized Shop Healthy stores CSV into a DataFrame.
healthystores_df = pd.read_csv(filepath_or_buffer="../data/Recognized_Shop_Healthy_Stores.csv")

In [None]:
# Preview the first five rows.
healthystores_df.head(n=5)

In [None]:
# Call info(): the bare attribute only echoes the bound method instead of
# printing the column / non-null / dtype summary.
healthystores_df.info()

In [None]:
# (rows, columns) of the healthy stores frame.
healthystores_df.shape

In [None]:
# Column dtypes; shown as the cell's output value rather than via print().
healthystores_df.dtypes

In [None]:
# Summary statistics from describe() with its default settings.
healthystores_df.describe()

In [None]:
# Correlate numeric columns only: since pandas 2.0, corr() no longer drops
# text columns silently and raises when it meets one.
healthystores_df.select_dtypes(include="number").corr()

Let's get some info on the Healthy Stores

In [None]:
# Number of recognized healthy stores per borough.
healthystores_df["Borough"].value_counts()

In [None]:
# Borough is categorical, so draw one bar per borough from its counts
# instead of pushing the labels through histogram binning.
healthy_borough_counts = healthystores_df.Borough.value_counts()
fig, ax = plt.subplots()
healthy_borough_counts.plot.bar(ax=ax)
ax.set(title="Recognized healthy stores by borough", xlabel="Borough", ylabel="Count")
plt.show()

The Bronx had the most healthy stores. There were no recognized healthy stores in Queens or Staten Island. That is both interesting and surprising. I'm going to look deeper into the history of the recognized healthy shops to see if there is a reason for that.

In [None]:
# Number of recognized healthy stores per community district.
healthystores_df["Community_District"].value_counts()

Manhattan District 11 had the most recognized healthy stores.

We will need this community district info later for further analysis, so let's turn it into a dataframe.

In [None]:
# Per-district healthy store counts as a two-column frame; reset_index()
# already returns a DataFrame, so no extra constructor call is needed.
healthystores_cd = healthystores_df["Community_District"].value_counts().reset_index()

In [None]:
# Show the counts frame as produced by reset_index().
healthystores_cd

In [None]:
# Give the two columns readable names (position-based: district, then count).
healthystores_cd = healthystores_cd.set_axis(['Community District', 'Healthy Stores'], axis=1)

In [None]:
# Show the frame again to confirm the new column names.
healthystores_cd

Now that we have a value-counts DataFrame for each of the Farmers Markets, Healthy Stores, and SNAP Centers, let's merge them all into one DataFrame.

In [None]:
# Outer join keeps every district that appears in either frame; a district
# missing from one side gets NaN in that side's count column.
marketsnap_df = fmarket_cd.merge(snapcenter_cd, how='outer', on=['Community District'])

In [None]:
# Preview the first five merged rows.
marketsnap_df.head(n=5)

In [None]:
# Outer-join the healthy store counts onto the markets + SNAP frame, again
# keeping districts that appear on only one side.
healthy_facilities = marketsnap_df.merge(healthystores_cd, how='outer', on=['Community District'])

In [None]:
# Preview the first five rows of the combined frame.
healthy_facilities.head(n=5)

Now that all the value-count DataFrames are merged into one, let's add a column which represents the sum of farmers markets, healthy stores, and SNAP centers for each community district.

In [None]:
# Row-wise sum of the three count columns. NaN propagates through the
# addition, so a district with a missing count in any column gets NaN here.
healthy_facilities['Total_Facilities'] = (
    healthy_facilities['Farmers Markets']
    .add(healthy_facilities['SNAP Centers'])
    .add(healthy_facilities['Healthy Stores'])
    .round(0)
)

In [None]:
# Inspect the frame with the new Total_Facilities column.
healthy_facilities

We will need to fix the NULL problem in order to get the correct calculations.

Let's replace the NULLs with zeros to get the correct calculations in the healthy facilities dataframe

In [None]:
# Assign the filled column back. fillna(inplace=True) on a column selection
# is chained assignment: it warns in pandas 2.x and leaves the frame
# unchanged once copy-on-write is enabled.
healthy_facilities['Farmers Markets'] = healthy_facilities['Farmers Markets'].fillna(0)

In [None]:
# Inspect the frame after filling 'Farmers Markets'.
healthy_facilities

In [None]:
# Assign the filled column back. fillna(inplace=True) on a column selection
# is chained assignment: it warns in pandas 2.x and leaves the frame
# unchanged once copy-on-write is enabled.
healthy_facilities['SNAP Centers'] = healthy_facilities['SNAP Centers'].fillna(0)

In [None]:
# Inspect the frame after filling 'SNAP Centers'.
healthy_facilities

In [None]:
# Assign the filled column back. fillna(inplace=True) on a column selection
# is chained assignment: it warns in pandas 2.x and leaves the frame
# unchanged once copy-on-write is enabled.
healthy_facilities['Healthy Stores'] = healthy_facilities['Healthy Stores'].fillna(0)

In [None]:
# Inspect the frame after filling 'Healthy Stores'.
healthy_facilities

In [None]:
# Assign the filled column back rather than using fillna(inplace=True) on a
# column selection, which warns in pandas 2.x and does nothing under
# copy-on-write. The 0 is only a placeholder for rows whose total was NaN,
# not a real total.
healthy_facilities['Total_Facilities'] = healthy_facilities['Total_Facilities'].fillna(0)

In [None]:
# Inspect the frame after filling 'Total_Facilities'.
healthy_facilities

In [None]:
# Row-wise sum of the three count columns, stored under a new name.
healthy_facilities['Facilities'] = (
    healthy_facilities['Farmers Markets']
    .add(healthy_facilities['SNAP Centers'])
    .add(healthy_facilities['Healthy Stores'])
    .round(0)
)

In [None]:
# Inspect the frame with the new 'Facilities' column.
healthy_facilities

We can drop the Total_Facilities column now that we've fixed our NULL issue.

In [None]:
# drop() returns a new frame, so rebind the name; otherwise the column is
# never removed. errors='ignore' keeps the cell safe to re-run.
healthy_facilities = healthy_facilities.drop(columns=['Total_Facilities'], errors='ignore')
healthy_facilities

Now that we have the total number of facilities, let's find the top & bottom 5 community districts.

In [None]:
# Top 5 rows by 'Facilities', shown with their district codes. nlargest on
# the Series alone lists only row index labels, not the districts.
healthy_facilities.nlargest(5, 'Facilities')[['Community District', 'Facilities']]

In [None]:
# Full frame ordered from fewest to most facilities (returns a sorted copy).
healthy_facilities.sort_values(by='Facilities')

In [None]:
# Bottom 5 rows by 'Facilities', shown with their district codes. nsmallest
# on the Series alone lists only row index labels, not the districts.
# NOTE(review): several districts appear to tie at the minimum; with ties,
# nsmallest keeps the first rows it encounters.
healthy_facilities.nsmallest(5, 'Facilities')[['Community District', 'Facilities']]

Top 5: 111, 205, 206, 316, 305

Bottom 5: 210, 106, 209, 211, 315

Note: 355 is a special district. It's not a community, it's actually a park.
It's also important to note that 16 communities only have one facility.

We will need the list of these community districts to create dataframes for the top & bottom 5 community districts once we add the demographic data.