# Explore New York city geographical coordinates dataset


In [1]:
import numpy as n 
import pandas as p
p.set_option('display.max_columns', None)
p.set_option('display.max_rows', None)
import json 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests # library to handle requests
from pandas.io.json import json_normalize 
# Matplotlib,plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering
import csv # read and write tabular data in CSV form
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [2]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


# Loading  and exploring  the data

In [3]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [4]:
neighbor_data = newyork_data['features']

In [5]:
neighbor_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [7]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
# instantiate the dataframe
neighbors = p.DataFrame(columns=column_names)

In [8]:
neighbors

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [13]:
for data in neighbor_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)


In [14]:
neighbors.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [15]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 612 neighborhoods.


In [16]:
neighbors.to_csv('BON1_NYC_GEO.csv',index=False)

In [17]:
address = 'New York City, NY'
geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [18]:
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

In [19]:
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    map_NewYork

# Web scrapping of Population and Demographics data of New York city from Wikipedia

In [20]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # package for parsing HTML and XML documents
import csv
print('Libraries imported.')

Libraries imported.


In [21]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
headers = [header.text for header in table.find_all('th')]
table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)
with open('BON2_POPULATION1.csv', 'w') as file:
   writer = csv.writer(file)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [24]:
Popul_data=p.read_csv('BON2_POPULATION1.csv')
Popul_data.drop(Popul_data.columns[[7,8,9,10,11]], axis=1,inplace=True)

In [25]:
Popul_data.columns = Popul_data.columns.str.replace(' ', '')
Popul_data.columns = Popul_data.columns.str.replace('\'','')
Popul_data.rename(columns={'Borough':'persons_sq_mi','County':'persons_sq_km'}, inplace=True)
Popul_data

Unnamed: 0,NewYorkCitysfiveboroughsvte,Jurisdiction,Population,GrossDomesticProduct,Landarea,Density,persons_sq_mi,squarekm,persons/sq.mi,persons/km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,"30,100\n",42.10\n,109.04\n,,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,"35,800\n",70.82\n,183.42\n,,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,"368,500\n",22.83\n,59.13\n,,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,"41,400\n",108.53\n,281.09\n,,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,"30,500\n",58.37\n,151.18\n,,,
5,City of New York,8336817,842.343,101000,302.64,783.83,27547,,,
6,State of New York,19453561,1731.910,89000,47214,122284,412,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,


In [26]:
Popul_data.rename(columns = {'NewYorkCitysfiveboroughsvte\n' : 'Borough',
                   'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2017', 
                   'Landarea\n':'square_miles',
                    'Density\n':'square_km'}, inplace=True)
Popul_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,squarekm,persons/sq.mi,persons/km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,"30,100\n",42.10\n,109.04\n,,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,"35,800\n",70.82\n,183.42\n,,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,"368,500\n",22.83\n,59.13\n,,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,"41,400\n",108.53\n,281.09\n,,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,"30,500\n",58.37\n,151.18\n,,,
5,City of New York,8336817,842.343,101000,302.64,783.83,27547,,,
6,State of New York,19453561,1731.910,89000,47214,122284,412,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,


In [30]:
Popul_data['Borough']=Popul_data['Borough'].replace(to_replace='\n', value='', regex=True)
Popul_data['County']=Popul_data['County'].replace(to_replace='\n', value='', regex=True)
Popul_data['Estimate_2017']=Popul_data['Estimate_2017'].replace(to_replace='\n', value='', regex=True)
Popul_data['square_miles']=Popul_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Popul_data['square_km']=Popul_data['square_km'].replace(to_replace='\n', value='', regex=True)
Popul_data['persons_sq_mi']=Popul_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Popul_data['persons_sq_km']=Popul_data['squarekm'].replace(to_replace='\n', value='', regex=True)
Popul_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,squarekm,persons/sq.mi,persons/km2,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.695\n,30100.0,42.1,109.04,,,,
1,Brooklyn,Kings,2559903.0,91.559\n,35800.0,70.82,183.42,,,,
2,Manhattan,New York,1628706.0,600.244\n,368500.0,22.83,59.13,,,,
3,Queens,Queens,2253858.0,93.310\n,41400.0,108.53,281.09,,,,
4,Staten Island,Richmond,476143.0,14.514\n,30500.0,58.37,151.18,,,,
5,City of New York,8336817,842.343,101000,302.64,783.83,27547.0,,,,
6,State of New York,19453561,1731.91,89000,47214.0,122284.0,412.0,,,,
7,Sources:[14] and see individual borough articles,,,,,,,,,,


In [31]:
Popul_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Popul_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Popul_data.loc[5:,['square_km','persons_sq_mi']] = Popul_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Popul_data.loc[5:,['square_miles','square_km']] = Popul_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Popul_data.loc[5:,['Estimate_2017','square_miles']] = Popul_data.loc[2:,['Estimate_2017','square_miles']].shift(1,axis=1)
Popul_data.loc[5:,['County','Estimate_2017']] = Popul_data.loc[2:,['County','Estimate_2017']].shift(1,axis=1)
Popul_data.loc[5:,['Borough','County']] = Popul_data.loc[2:,['Borough','County']].shift(1,axis=1)
Popul_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,squarekm,persons/sq.mi,persons/km2,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.695\n,30100.0,42.1,109.04,,,,
1,Brooklyn,Kings,2559903.0,91.559\n,35800.0,70.82,183.42,,,,
2,Manhattan,New York,1628706.0,600.244\n,368500.0,22.83,59.13,,,,
3,Queens,Queens,2253858.0,93.310\n,41400.0,108.53,281.09,,,,
4,Staten Island,Richmond,476143.0,14.514\n,30500.0,58.37,151.18,,,,
5,,City of New York,8336817.0,101000,842.343,302.64,783.83,,,,
6,,State of New York,19453561.0,89000,1731.91,47214.0,122284.0,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,


In [32]:
Popul_data = Popul_data.fillna('')
Popul_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,squarekm,persons/sq.mi,persons/km2,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.695\n,30100.0,42.1,109.04,,,,
1,Brooklyn,Kings,2559903.0,91.559\n,35800.0,70.82,183.42,,,,
2,Manhattan,New York,1628706.0,600.244\n,368500.0,22.83,59.13,,,,
3,Queens,Queens,2253858.0,93.310\n,41400.0,108.53,281.09,,,,
4,Staten Island,Richmond,476143.0,14.514\n,30500.0,58.37,151.18,,,,
5,,City of New York,8336817.0,101000,842.343,302.64,783.83,,,,
6,,State of New York,19453561.0,89000,1731.91,47214.0,122284.0,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,


In [33]:
i = Popul_data[((Popul_data.County == 'Sources: [2] and see individual borough articles'))].index
Popul_data.drop(i)

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,squarekm,persons/sq.mi,persons/km2,persons_sq_km
0,The Bronx,Bronx,1418207.0,42.695\n,30100.0,42.1,109.04,,,,
1,Brooklyn,Kings,2559903.0,91.559\n,35800.0,70.82,183.42,,,,
2,Manhattan,New York,1628706.0,600.244\n,368500.0,22.83,59.13,,,,
3,Queens,Queens,2253858.0,93.310\n,41400.0,108.53,281.09,,,,
4,Staten Island,Richmond,476143.0,14.514\n,30500.0,58.37,151.18,,,,
5,,City of New York,8336817.0,101000,842.343,302.64,783.83,,,,
6,,State of New York,19453561.0,89000,1731.91,47214.0,122284.0,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,


In [57]:
Popul_data.to_csv('BON2_POPULATION.csv',index=False)

In [1]:
import requests
from bs4 import BeautifulSoup
website_url = requests.get('https://en.wikipedia.org/wiki/New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable collapsible'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('NYC_DEMO.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [52]:
Sample_data=p.read_csv('NYC_DEMO.csv')

In [53]:
Sample_data

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2019)[152],billions(US$)[153],per capita(US$),square miles,squarekm,persons / sq. mi,persons /km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,"30,100\n",42.10\n,109.04\n,"33,867\n","13,006\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,"35,800\n",70.82\n,183.42\n,"36,147\n","13,957\n",,,,,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,"368,500\n",22.83\n,59.13\n,"71,341\n","27,544\n",,,,,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,"41,400\n",108.53\n,281.09\n,"20,767\n","8,018\n",,,,,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,"30,500\n",58.37\n,151.18\n,"8,157\n","3,150\n",,,,,,
5,City of New York,8336817,842.343,101000,302.64,783.83,27547,"10,636\n",,,,,,,
6,State of New York,19453561,1731.910,89000,47214,122284,412,159\n,,,,,,,
7,Sources:[154] and see individual borough artic...,,,,,,,,,,,,,,


In [54]:
Sample_data.columns

Index(['New York City's five boroughsvte\n', 'Jurisdiction\n', 'Population\n',
       'Gross Domestic Product\n', 'Land area\n', 'Density\n', 'Borough',
       'County', 'Estimate (2019)[152]', 'billions(US$)[153]',
       'per capita(US$)', 'square miles', 'squarekm', 'persons / sq. mi',
       'persons /km2\n'],
      dtype='object')

In [56]:
Sample_data.rename(columns = {'2010[237]' : '2010',
                   '1990[239]':'1990',
                   '1970[239]':'1970', 
                   '1940[239]\n':'1940',
                    }, inplace=True)
Sample_data

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2019)[152],billions(US$)[153],per capita(US$),square miles,squarekm,persons / sq. mi,persons /km2
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,"30,100\n",42.10\n,109.04\n,"33,867\n","13,006\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,"35,800\n",70.82\n,183.42\n,"36,147\n","13,957\n",,,,,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,"368,500\n",22.83\n,59.13\n,"71,341\n","27,544\n",,,,,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,"41,400\n",108.53\n,281.09\n,"20,767\n","8,018\n",,,,,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,"30,500\n",58.37\n,151.18\n,"8,157\n","3,150\n",,,,,,
5,City of New York,8336817,842.343,101000,302.64,783.83,27547,"10,636\n",,,,,,,
6,State of New York,19453561,1731.910,89000,47214,122284,412,159\n,,,,,,,
7,Sources:[154] and see individual borough artic...,,,,,,,,,,,,,,


# Download and Explore New York city and its Boroughs Cuisine dataset

In [11]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from PIL import Image # converting images into arrays

%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot') # optional: for ggplot-like style

# check for latest version of Matplotlib
print ('Matplotlib version: ', mpl.__version__) # >= 2.0.0

# install wordcloud
!conda install -c conda-forge wordcloud==1.4.1 --yes

# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')


Matplotlib version:  3.0.2
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - wordcloud==1.4.1


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    wordcloud-1.4.1            |           py36_0         324 KB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.7 MB

The following NEW packages will be INSTALLED:

    python_abi:      3.6-1_cp36m       conda-forge
    wordcloud:       1.4.1-py36_0      conda-forge

The follow

In [7]:
import project as pro
file1 = pro.get-file("BON3_NYC_CUISINE.csv")
# Read the CSV data file from the object storage into a pandas DataFrame
file1.seek(0)
NYC_CUISINE=pd.read_csv(file1)
NYC_CUISINE.drop(NYC_CUISINE.columns[[3,4,5,6,7]], axis=1,inplace=True) 
NYC_CUISINE.head()

ModuleNotFoundError: No module named 'project'

# Explore Farmers Market dataset

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker

# notice: installing seaborn might takes a few minutes
!conda install -c anaconda seaborn -y
import seaborn as sns

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

In [None]:
# Data from website - https://data.cityofnewyork.us/dataset/DOHMH-Farmers-Markets-and-Food-Boxes/8vwk-6iz2
my_file = project.get_file("DOHMH_Farmers_Markets_and_Food_Boxes.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
my_file.seek(0)
FM_NYC=pd.read_csv(my_file)

In [None]:
FM_NYC.head()

In [None]:
FM_NYC.rename(columns={'Service Type':'Service_Type'}, inplace=True)
print(FM_NYC.Service_Type.unique())

In [None]:
FM_NYC['Service_Type'].value_counts().to_frame()

In [None]:
fig,ax = plt.subplots(1, 1, figsize=(5, 5))
sns.countplot(x='Service_Type',data=FM_NYC)
ax.set_title("Service_Type")
for t in ax.patches:
    if (np.isnan(float(t.get_height()))):
        ax.annotate('', (t.get_x(), 0))
    else:
        ax.annotate(str(format(int(t.get_height()), ',d')), (t.get_x(), t.get_height()*1.01))
    
plt.show();

In [None]:
FM_NYC_filtered = FM_NYC[FM_NYC['Service_Type'] == 'Farmers Markets'].copy()
FM_NYC_filtered ['Borough'] = FM_NYC_filtered['Borough'].map(lambda x: x.strip())
print(FM_NYC_filtered.shape)
FM_NYC_filtered.head()

In [None]:
fig,ax = plt.subplots(1, 1, figsize=(5, 5))
sns.countplot(x='Borough',data=FM_NYC_filtered)
ax.set_title("Borough")
for t in ax.patches:
    if (np.isnan(float(t.get_height()))):
        ax.annotate('', (t.get_x(), 0))
    else:
        ax.annotate(str(format(int(t.get_height()), ',d')), (t.get_x(), t.get_height()*1.01))
        ax.set_xticklabels([t.get_text().split("T")[0] for t in ax.get_xticklabels()])

# This sets the yticks "upright" with 0, as opposed to sideways with 90.
plt.xticks(rotation=90) 
plt.show()

In [None]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

In [None]:
map_markets = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, FacilityName, borough in zip(FM_NYC_filtered['Latitude'], FM_NYC_filtered['Longitude'], FM_NYC_filtered['FacilityName'], FM_NYC_filtered['Borough']):
            label = '{}, {}'.format(FacilityName, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color='green',
                fill=True,
                fill_color='green',
                fill_opacity=0.7,
                parse_html = False).add_to(map_markets)  

map_markets