# Capstone Project - Battle Of Neighborhoods

## Comparison of the Neighborhoods of Berlin, London and New York

Import Section

In [1]:
import pandas as pd
import numpy as np
!pip install BeautifulSoup4
from bs4 import BeautifulSoup
import requests
!pip install lxml
!pip install geopandas
!pip install geopy
import geopy as geo
import geopandas as gpd
import folium 
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

pd.options.mode.chained_assignment = None  # default='warn'
print('Imports done!')

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 5.9MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.0.1
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/64/28/0b761b64ecbd63d272ed0e7a6ae6e4402fc37886b59181bfdf274424d693/lxml-4.6.1-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 5.2MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.6.1
Collecting geopandas
[?25l  Downloading ht

### Berlin Data

In [2]:
url='http://www.places-in-germany.com/14356-places-within-a-radius-of-15km-around-berlin.html'
req=requests.get(url)
soup=BeautifulSoup(req.text,"html.parser")
table = soup.find_all('table')
df=pd.read_html(str(table), header=0)[0]


In [3]:
df

Unnamed: 0,Distance,Route,Postal code / Place,Population
0,1.2 km (0.8 miles),,10115 Mitte,79582
1,2.1 km (1.3 miles),,10119 Prenzlauer Berg,140881
2,2.8 km (1.7 miles),,10115 Mitte,333534
3,2.9 km (1.8 miles),,13347 Gesundbrunnen,82110
4,3.3 km (2.0 miles),,10243 Friedrichshain-Kreuzberg,269398
...,...,...,...,...
93,14.6 km (9.1 miles),,12459 Köpenick,59201
94,14.8 km (9.2 miles),,12524 Altglienicke,26101
95,14.9 km (9.3 miles),,16341 Schwanebeck bei Bernau bei Berlin,-
96,15.0 km (9.3 miles),,12305 Lichtenrade,49451


We only need the column with the postal codes

In [4]:
df.rename({'Postal code / Place':'Borough'},axis=1, inplace=True)

In [5]:
berlin=df.Borough.str.split(expand=True) # Split Zip-Code and Place name

In [6]:
# Splitted Zip-Code and Borough leads to also splitted Borough names. These will be concatenated again
borough=[]
for name, values in berlin.iterrows():
    #print(name, values[0], values[1],values[2])
    if values[2] is None:
        borough.append(values[1])
    else:
        borough.append(values[1] + ' ' + values[2])

In [7]:
berlin['Borough']=borough

In [8]:
berlin.rename({0:'Zipcode'}, axis=1, inplace=True)

In [9]:
berlin

Unnamed: 0,Zipcode,1,2,3,4,5,Borough
0,10115,Mitte,,,,,Mitte
1,10119,Prenzlauer,Berg,,,,Prenzlauer Berg
2,10115,Mitte,,,,,Mitte
3,13347,Gesundbrunnen,,,,,Gesundbrunnen
4,10243,Friedrichshain-Kreuzberg,,,,,Friedrichshain-Kreuzberg
...,...,...,...,...,...,...,...
93,12459,Köpenick,,,,,Köpenick
94,12524,Altglienicke,,,,,Altglienicke
95,16341,Schwanebeck,bei,Bernau,bei,Berlin,Schwanebeck bei
96,12305,Lichtenrade,,,,,Lichtenrade


In [10]:
#berlin.drop({3,4,5}, axis=1, inplace=True) #Keep only the columns with Zipcode and Borough
berlin=berlin[['Zipcode', 'Borough']]
berlin.shape

(98, 2)

In [11]:
berlin.drop_duplicates(inplace=True) #erase duplicates

In [12]:
berlin.shape

(97, 2)

In [13]:
berlin.dropna(axis=0, inplace=True) #Drop empty cells
berlin.shape

(96, 2)

In [14]:
#berlin.to_csv('Ausgabe.csv', sep=';', decimal=',', index=True)

## Getting latidude and longitude from Geocoders and Nominatim

In [15]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

lat=[]
lon=[]

geolocator = Nominatim(user_agent='markus.gorges')

for line, boroughs in berlin.iterrows():
    
    try:
        adress= boroughs[0],' Berlin ',boroughs[1]
        location = geolocator.geocode(adress)
        #print(location)
        lat.append(location.latitude)
        lon.append(location.longitude)
    except:
        lat.append(np.nan)
        lon.append(np.nan)
    
berlin['latitude']=lat
berlin['longitude']=lon


In [16]:
berlin.dropna(axis=0, inplace=True) #Drop empty cells
#berlin.to_csv('Ausgabe.csv', sep=';', decimal=',', index=True)
berlin.shape

(95, 4)

In [17]:
address = 'Berlin'

geolocator = Nominatim(user_agent="markus.gorges")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Berlin are {}, {}.'.format(latitude, longitude))



# Creating Folium Map
map_berlin = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough in zip(berlin['latitude'], berlin['longitude'], berlin['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_berlin)  
    
map_berlin

The geograpical coordinate of Berlin are 52.5015217, 13.4025498.


## Import Data from Foursquare

In [18]:
CLIENT_ID = 'YWIM44EX3FU5EP2NV2GRMEJDINAVC3HA3MMSYKL1YX5GTUWK' # your Foursquare ID
CLIENT_SECRET = '1ZKMG52DQNOPTM44BUFYZGY3YGPFEFXJWKGSD33E2FWVJENR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
radius=500

In [19]:
# Def for getting all Venues from Foursquare

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
berlin_venues = getNearbyVenues(names=berlin['Borough'],
                                   latitudes=berlin['latitude'],
                                   longitudes=berlin['longitude']
                                  )

Mitte
Prenzlauer Berg
Gesundbrunnen
Friedrichshain-Kreuzberg
Friedrichshain
Tiergarten
Wedding
Moabit
Kreuzberg
Hansaviertel
Fennpfuhl
Alt-Treptow
Weißensee
Pankow
Heinersdorf
Neukölln
Lichtenberg
Schöneberg
Lichtenberg
Rummelsburg
Niederschönhausen
Tempelhof
Stadtrandsiedlung Malchow
Reinickendorf
Plänterwald
Charlottenburg
Wilhelmsruh
Alt-Hohenschönhausen
Wilmersdorf
Charlottenburg-Nord
Friedenau
Neu-Hohenschönhausen
Charlottenburg-Wilmersdorf
Friedrichsfelde
Malchow
Rosenthal
Baumschulenweg
Tempelhof-Schöneberg
Halensee
Blankenburg
Grunewald
Märkisches Viertel
Britz
Steglitz
Westend
Karlshorst
Wartenberg
Schmargendorf
Wittenau
Siemensstadt
Französisch Buchholz
Marzahn
Mariendorf
Biesdorf
Falkenberg
Tegel
Niederschöneweide
Lübars
Oberschöneweide
Waidmannslust
Lankwitz
Blankenfelde
Dahlem
Karow
Johannisthal
Haselhorst
Gropiusstadt
Buckow
Lichterfelde
Hermsdorf
Kaulsdorf
Marzahn-Hellersdorf
Schildow
Marienfelde
Hellersdorf
Ahrensfelde
Ahrensfelde bei
Eiche bei
Glienicke /
Rudow
Adlersh

In [21]:
berlin_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Mitte,52.51769,13.402376,Lustgarten,52.518469,13.399454,Garden
1,Mitte,52.51769,13.402376,Kuppelumgang Berliner Dom,52.518966,13.400981,Scenic Lookout
2,Mitte,52.51769,13.402376,Radisson Blu,52.519561,13.402857,Hotel
3,Mitte,52.51769,13.402376,"Bronzestatue ""Heiliger St. Georg im Kampf mit ...",52.51629,13.405558,Outdoor Sculpture
4,Mitte,52.51769,13.402376,Designpanoptikum - surreales Museum für indust...,52.516941,13.406072,Museum


### Looking for diversity of Venues

In [22]:
berlin_venues['Venue Category'].unique() ##### Groupby erzeugen und nur interessante Venues herausfiltern

array(['Garden', 'Scenic Lookout', 'Hotel', 'Outdoor Sculpture', 'Museum',
       'History Museum', 'Neighborhood', 'Bookstore', 'Cupcake Shop',
       'Plaza', 'Art Museum', 'Art Gallery', 'Concert Hall',
       'Historic Site', 'German Restaurant', 'Café', 'Fountain',
       'Ice Cream Shop', 'Board Shop', 'Restaurant', 'Gourmet Shop',
       'Vietnamese Restaurant', 'Brewery', 'Italian Restaurant',
       'Costume Shop', 'Nightclub', 'Gym / Fitness Center', 'Boutique',
       'Movie Theater', 'Roof Deck', 'Cooking School', 'Deli / Bodega',
       'Bistro', 'Park', 'Spa', 'Modern European Restaurant',
       'Coffee Shop', 'Yoga Studio', 'Tea Room', 'Israeli Restaurant',
       'Supermarket', 'Thai Restaurant', 'Cocktail Bar', 'Dive Bar',
       'Hostel', 'Breakfast Spot', 'Currywurst Joint',
       'Syrian Restaurant', 'Grocery Store', 'Climbing Gym',
       'Halal Restaurant', 'Bar', 'Turkish Restaurant',
       'Pakistani Restaurant', 'Trail', 'Pide Place', 'Drugstore',
       'Or

## Looking only for relevant Venues
So my dear friend is only interested in certain venues. According to his interests we generated following list of Interests:
This Dataset is used at the end of the chapter of exporing Berlin. Clustering will be done with all venues

In [23]:
friends_venues=['Ice Cream', 'Restaurant', 'Café','Nightclub', 'Gym','Movie Theater','Cocktail Bar', 'Grocery Store', 'Bar', 'Drugstore', 'Record Shop', 'Trattoria/Osteria','Music Venue', 'Food Court', 'pub', 'Pub',
                'Bakery', 'Club', 'Pizza','Bank','Rock Climbing Spot','Liquor Store','Cafeteria','Indoor Play Area','Music Store','Steak']

And also a List of non interests

In [24]:
friends_non_interests=['Sea', #especially Sea Food Restaurants
                      'Vegan', # He doesn't like it
                       'Vegetarian',
                       'Vietnamese','Thai','Sushi','Indian', 'Korean', 'Asian', #Food he isn't interested in
                       'Jazz', 'HipHop', 'RnB','Gay' # Specialized Clubs their music he doesn't like
                      ]

Filtering the given Venues from Foursquare by interests and non interests of my friend

In [25]:
friends_venues_berlin=berlin_venues[berlin_venues['Venue Category'].str.contains('|'.join(friends_venues))] #Only keeping the requested Venues from my friend

In [26]:
friends_venues_berlin=friends_venues_berlin[~friends_venues_berlin['Venue Category'].str.contains('|'.join(friends_non_interests))] #Erasing all non interested venues

In [27]:
friends_venues_berlin['Venue Category'].unique() #Here a list of all Venues that are now left in the dataframe

array(['German Restaurant', 'Café', 'Ice Cream Shop', 'Restaurant',
       'Italian Restaurant', 'Nightclub', 'Gym / Fitness Center',
       'Movie Theater', 'Modern European Restaurant',
       'Israeli Restaurant', 'Cocktail Bar', 'Dive Bar',
       'Syrian Restaurant', 'Grocery Store', 'Climbing Gym',
       'Halal Restaurant', 'Bar', 'Turkish Restaurant',
       'Pakistani Restaurant', 'Drugstore', 'Doner Restaurant', 'Bakery',
       'Record Shop', 'Trattoria/Osteria', 'Karaoke Bar', 'Hotel Bar',
       'Austrian Restaurant', 'Middle Eastern Restaurant', 'Rock Club',
       'Music Venue', 'Mexican Restaurant', 'Pub', 'Beer Bar',
       'Falafel Restaurant', 'Food Court', 'Gastropub',
       'Greek Restaurant', 'Ethiopian Restaurant', 'Pizza Place',
       'Steakhouse', 'Scandinavian Restaurant', 'Peruvian Restaurant',
       'Fast Food Restaurant', 'Gym', 'Eastern European Restaurant',
       'Salon / Barbershop', 'Irish Pub', 'Mediterranean Restaurant',
       'Tapas Restaurant',

## Exploring the Dataset
Getting the count of ech category for all Boroughs

In [28]:
berlin_pivot = pd.get_dummies(berlin_venues[['Venue Category']], prefix="", prefix_sep="")
berlin_pivot['Neighborhood'] = berlin_venues['Neighborhood'] 
#Fix Column Order
fixed_columns = [berlin_pivot.columns[-1]] + list(berlin_pivot.columns[:-1])
berlin_pivot = berlin_pivot[fixed_columns]


Grouping after Neighborhoods and aggregate mean of the frequency of occurrence of each category

In [29]:
berlin_pivot = berlin_pivot.groupby('Neighborhood').mean().reset_index()
berlin_pivot.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Adult Boutique,African Restaurant,American Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,...,Used Bookstore,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Waterfall,Waterfront,Windmill,Wine Bar,Wine Shop,Yoga Studio
0,Adlershof,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ahrensfelde,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ahrensfelde bei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alt-Hohenschönhausen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alt-Treptow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Sorting a bit the Venues

In [30]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [31]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
berlin_venues_sorted = pd.DataFrame(columns=columns)
berlin_venues_sorted['Neighborhood'] = berlin_pivot['Neighborhood']

for ind in np.arange(berlin_pivot.shape[0]):
    berlin_venues_sorted.iloc[ind, 1:] = return_most_common_venues(berlin_pivot.iloc[ind, :], num_top_venues)

berlin_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adlershof,Greek Restaurant,Drugstore,Steakhouse,Supermarket,Italian Restaurant,Tram Station,Trattoria/Osteria,Yoga Studio,Fast Food Restaurant,Falafel Restaurant
1,Ahrensfelde,Supermarket,Train Station,Yoga Studio,Ethiopian Restaurant,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
2,Ahrensfelde bei,Supermarket,Train Station,Yoga Studio,Ethiopian Restaurant,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
3,Alt-Hohenschönhausen,Post Office,Tram Station,Greek Restaurant,Drugstore,Discount Store,Coffee Shop,Big Box Store,Supermarket,Indian Restaurant,Asian Restaurant
4,Alt-Treptow,Italian Restaurant,Platform,Bakery,Electronics Store,Newsstand,Tapas Restaurant,Nightclub,Garden Center,Big Box Store,Outdoor Sculpture


## And Finaly: Clustering Berlin

In [32]:
# set number of clusters
kclusters = 5

berlin_clustering = berlin_pivot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(berlin_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 2, 0, 1, 4, 2, 1, 0, 1], dtype=int32)

In [33]:
berlin_clustering

Unnamed: 0,Zoo Exhibit,ATM,Adult Boutique,African Restaurant,American Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,Austrian Restaurant,...,Used Bookstore,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Waterfall,Waterfront,Windmill,Wine Bar,Wine Shop,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.040000,0.0,...,0.0,0.0,0.040000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.100000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00
90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.022727,0.0,...,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.00
91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00


In [34]:
# add clustering labels
berlin_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

berlin.rename(columns={'Borough':'Neighborhood'}, inplace=True)

berlin_merged=pd.merge(berlin, berlin_venues_sorted, on='Neighborhood')


berlin_merged.head()

Unnamed: 0,Zipcode,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,10115,Mitte,52.51769,13.402376,1,German Restaurant,Museum,History Museum,Art Gallery,Hotel,Café,Fountain,Concert Hall,Italian Restaurant,Cupcake Shop
1,10119,Prenzlauer Berg,52.528634,13.420105,1,Hotel,Café,Gym / Fitness Center,German Restaurant,Coffee Shop,Yoga Studio,Dive Bar,Spa,Boutique,Roof Deck
2,13347,Gesundbrunnen,52.55092,13.384846,1,Turkish Restaurant,Drugstore,Hotel,Supermarket,Bar,Trail,Bookstore,History Museum,Big Box Store,Historic Site
3,10243,Friedrichshain-Kreuzberg,52.506862,13.450642,1,Coffee Shop,Hostel,Bar,Vegetarian / Vegan Restaurant,Middle Eastern Restaurant,Nightclub,Italian Restaurant,Vietnamese Restaurant,Pub,Hotel
4,10243,Friedrichshain,52.512215,13.45029,1,Coffee Shop,Café,Middle Eastern Restaurant,Bar,Vegetarian / Vegan Restaurant,Pub,Ice Cream Shop,Pizza Place,Bagel Shop,Doner Restaurant


In [35]:

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(berlin_merged['latitude'], berlin_merged['longitude'], berlin_merged['Neighborhood'], berlin_merged['Cluster Labels']):
    label = folium.Popup(poi, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [36]:
#Exploring the clusters by adjusting the value cluster_number

cluster_number= 2

berlin_merged.loc[berlin_merged['Cluster Labels'] == cluster_number, berlin_merged.columns[[1] + list(range(5, berlin_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,Neu-Hohenschönhausen,Supermarket,Shoe Store,Movie Theater,Yoga Studio,Event Space,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop
36,Baumschulenweg,Supermarket,Ice Cream Shop,Asian Restaurant,Drugstore,Event Space,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop
63,Karow,Supermarket,Restaurant,Bus Stop,Yoga Studio,Ethiopian Restaurant,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
67,Buckow,Supermarket,Pizza Place,Miscellaneous Shop,Yoga Studio,Ethiopian Restaurant,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
75,Ahrensfelde,Supermarket,Train Station,Yoga Studio,Ethiopian Restaurant,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
76,Ahrensfelde bei,Supermarket,Train Station,Yoga Studio,Ethiopian Restaurant,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop
78,Glienicke /,Supermarket,Used Bookstore,Yoga Studio,Ethiopian Restaurant,Fried Chicken Joint,French Restaurant,Fountain,Food Court,Food & Drink Shop,Flower Shop


Let's calculate how much a Neighborhood fits the interests of my friend

In [37]:
berlin_fit = pd.get_dummies(friends_venues_berlin[['Venue Category']], prefix="", prefix_sep="")
berlin_fit['Neighborhood'] = friends_venues_berlin['Neighborhood'] 
#Fix Column Order
fixed_columns = [berlin_fit.columns[-1]] + list(berlin_fit.columns[:-1])
berlin_fit = berlin_fit[fixed_columns]

In [38]:
berlin_fit['Sum']=berlin_fit.sum(axis=1) # Summe aller Venues berechnen und sortieren, um ein Ranking zu bekommen.

In [39]:
berlin_fit2 = berlin_fit.groupby('Neighborhood').sum().reset_index()

In [40]:
berlin_fit2

Unnamed: 0,Neighborhood,African Restaurant,American Restaurant,Argentinian Restaurant,Austrian Restaurant,Bakery,Bank,Bar,Bavarian Restaurant,Beer Bar,...,Scandinavian Restaurant,Spanish Restaurant,Sports Club,Steakhouse,Syrian Restaurant,Tapas Restaurant,Trattoria/Osteria,Turkish Restaurant,Wine Bar,Sum
0,Adlershof,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,5
1,Alt-Hohenschönhausen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,Alt-Treptow,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,11
3,Baumschulenweg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,Biesdorf,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,Westend,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,7
79,Wilhelmsruh,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
80,Wilmersdorf,0,0,0,0,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,19
81,Wittenau,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [41]:
berlin_fit2= berlin_fit2[['Neighborhood', 'Sum']] # Keeping only Neighborhood and Sum

In [42]:
berlin_merged=pd.merge(berlin_merged, berlin_fit2, on='Neighborhood')

In [43]:
berlin_merged

Unnamed: 0,Zipcode,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Sum
0,10115,Mitte,52.517690,13.402376,1,German Restaurant,Museum,History Museum,Art Gallery,Hotel,Café,Fountain,Concert Hall,Italian Restaurant,Cupcake Shop,13
1,10119,Prenzlauer Berg,52.528634,13.420105,1,Hotel,Café,Gym / Fitness Center,German Restaurant,Coffee Shop,Yoga Studio,Dive Bar,Spa,Boutique,Roof Deck,16
2,13347,Gesundbrunnen,52.550920,13.384846,1,Turkish Restaurant,Drugstore,Hotel,Supermarket,Bar,Trail,Bookstore,History Museum,Big Box Store,Historic Site,21
3,10243,Friedrichshain-Kreuzberg,52.506862,13.450642,1,Coffee Shop,Hostel,Bar,Vegetarian / Vegan Restaurant,Middle Eastern Restaurant,Nightclub,Italian Restaurant,Vietnamese Restaurant,Pub,Hotel,33
4,10243,Friedrichshain,52.512215,13.450290,1,Coffee Shop,Café,Middle Eastern Restaurant,Bar,Vegetarian / Vegan Restaurant,Pub,Ice Cream Shop,Pizza Place,Bagel Shop,Doner Restaurant,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,12623,Mahlsdorf,52.508699,13.613162,1,Platform,Automotive Shop,Tram Station,Italian Restaurant,Motorcycle Shop,Supermarket,Greek Restaurant,Light Rail Station,Bakery,Gym / Fitness Center,4
80,12529,Großziethen bei,52.389947,13.410746,1,Bakery,Mobile Phone Shop,Asian Restaurant,Supermarket,Gas Station,French Restaurant,Fried Chicken Joint,Fountain,Falafel Restaurant,Furniture / Home Store,2
81,12459,Köpenick,52.453910,13.576413,1,Clothing Store,Drugstore,German Restaurant,Bakery,Gym / Fitness Center,Tram Station,Electronics Store,Bookstore,Burger Joint,Indian Restaurant,6
82,12305,Lichtenrade,52.393456,13.402040,0,Soccer Field,Supermarket,Doner Restaurant,Bus Stop,Yoga Studio,Event Space,French Restaurant,Fountain,Food Court,Food & Drink Shop,1


## Sorting the DataFrame and presenting top 5 Rows
Now we see the most fitting Neighborhoods regarding the interests of my friend
And they are all in the same cluster, looks good

In [44]:
berlin_merged.sort_values(by=['Sum'], ascending=False, inplace=True)
berlin_merged.head(5)

Unnamed: 0,Zipcode,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Sum
15,12043,Neukölln,52.48115,13.43535,1,Bar,Café,Coffee Shop,Middle Eastern Restaurant,Dive Bar,Bistro,Cocktail Bar,Italian Restaurant,Supermarket,Nightclub,42
4,10243,Friedrichshain,52.512215,13.45029,1,Coffee Shop,Café,Middle Eastern Restaurant,Bar,Vegetarian / Vegan Restaurant,Pub,Ice Cream Shop,Pizza Place,Bagel Shop,Doner Restaurant,34
7,10551,Moabit,52.530102,13.342542,1,Café,German Restaurant,Bar,Burger Joint,Gym / Fitness Center,Hostel,Doner Restaurant,Hotel,Vegetarian / Vegan Restaurant,Cocktail Bar,34
18,10777,Schöneberg,52.482157,13.35519,1,Café,Bus Stop,Grocery Store,Restaurant,Supermarket,Park,Vietnamese Restaurant,Italian Restaurant,Doner Restaurant,Drugstore,33
3,10243,Friedrichshain-Kreuzberg,52.506862,13.450642,1,Coffee Shop,Hostel,Bar,Vegetarian / Vegan Restaurant,Middle Eastern Restaurant,Nightclub,Italian Restaurant,Vietnamese Restaurant,Pub,Hotel,33


# Now: London
We start with gathering the Data for the Boroughs and Neighborhoods

In [45]:
url='https://data.london.gov.uk/download/london-borough-profiles/80647ce7-14f3-4e31-b1cd-d5f7ea3553be/london-borough-profiles.xlsx' #Source for all Borough names of London

df_London=pd.read_excel(url,index_col=None, header=0, sheet_name='Data') #Names are in one colum of sheet Data in Excel File

In [46]:
df_London=df_London[['New code','Area name']] #only keep code and Area name

In [47]:
df_London.dropna(axis=0, inplace=True) #Drop empty cells

In [48]:
df_London=df_London[df_London['New code'].str.contains('E090')] #Filtering only the relevant Areas

In [49]:
df_London.head()

Unnamed: 0,New code,Area name
1,E09000001,City of London
2,E09000002,Barking and Dagenham
3,E09000003,Barnet
4,E09000004,Bexley
5,E09000005,Brent


## Getting Geodata

In [50]:

london_lat=[]
london_lon=[]

geolocator = Nominatim(user_agent='markus.gorges')

for line, boroughs in df_London.iterrows():
    
    try:
        adress= ' London ',boroughs[1]
        location = geolocator.geocode(adress)
        #print(location)
        london_lat.append(location.latitude)
        london_lon.append(location.longitude)
    except:
        london_lat.append(np.nan)
        london_lon.append(np.nan)
    
df_London['latitude']=london_lat
df_London['longitude']=london_lon

In [51]:
df_London.rename(columns={'Area name':'Neighborhood'}, inplace=True)
df_London.head()


Unnamed: 0,New code,Neighborhood,latitude,longitude
1,E09000001,City of London,51.515618,-0.091998
2,E09000002,Barking and Dagenham,51.554117,0.150504
3,E09000003,Barnet,51.65309,-0.200226
4,E09000004,Bexley,51.441679,0.150488
5,E09000005,Brent,51.563826,-0.27576


In [52]:
address = 'London'

geolocator = Nominatim(user_agent="markus.gorges")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London are {}, {}.'.format(latitude, longitude))



# Creating Folium Map
map_london = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough in zip(df_London['latitude'], df_London['longitude'], df_London['Neighborhood']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_london)  
    
map_london

The geograpical coordinate of London are 51.5073219, -0.1276474.


## Import Data From Foursquare

In [53]:
london_venues = getNearbyVenues(names=df_London['Neighborhood'],
                                   latitudes=df_London['latitude'],
                                   longitudes=df_London['longitude']
                                  )

City of London
Barking and Dagenham
Barnet
Bexley
Brent
Bromley
Camden
Croydon
Ealing
Enfield
Greenwich
Hackney
Hammersmith and Fulham
Haringey
Harrow
Havering
Hillingdon
Hounslow
Islington
Kensington and Chelsea
Kingston upon Thames
Lambeth
Lewisham
Merton
Newham
Redbridge
Richmond upon Thames
Southwark
Sutton
Tower Hamlets
Waltham Forest
Wandsworth
Westminster


In [54]:
london_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,City of London,51.515618,-0.091998,Goodman Steak House Restaurant,51.514398,-0.090745,Steakhouse
1,City of London,51.515618,-0.091998,Hawksmoor Guildhall,51.515498,-0.090849,Steakhouse
2,City of London,51.515618,-0.091998,The Ned Hotel,51.513755,-0.090067,Hotel
3,City of London,51.515618,-0.091998,Daunt Books,51.513982,-0.092995,Bookstore
4,City of London,51.515618,-0.091998,The Merchant House,51.513264,-0.093039,Cocktail Bar


Sorting for my friends interests

In [55]:
friends_venues_london=london_venues[london_venues['Venue Category'].str.contains('|'.join(friends_venues))] #Only keeping the requested Venues from my friend

In [56]:
friends_venues_london=friends_venues_london[~friends_venues_london['Venue Category'].str.contains('|'.join(friends_non_interests))] #Erasing all non interested venues

In [57]:
friends_venues_london['Venue Category'].unique() #Here a list of all Venues that are now left in the dataframe

array(['Steakhouse', 'Cocktail Bar', 'Grocery Store', 'French Restaurant',
       'Italian Restaurant', 'Gym / Fitness Center',
       'Latin American Restaurant', 'Modern European Restaurant',
       'New American Restaurant', 'Bakery', 'Falafel Restaurant',
       'Scandinavian Restaurant', 'Wine Bar', 'Udon Restaurant', 'Café',
       'Restaurant', 'Boxing Gym', 'Indie Movie Theater', 'Pizza Place',
       'Juice Bar', 'Pub', 'Hotel Bar', 'Gym', 'Liquor Store',
       'Fast Food Restaurant', 'Chinese Restaurant', 'Greek Restaurant',
       'Food Court', 'Sports Bar', 'Ice Cream Shop', 'Beer Bar',
       'Caribbean Restaurant', 'Rock Club', 'Middle Eastern Restaurant',
       'Bar', 'Record Shop', 'Ramen Restaurant', 'Mamak Restaurant',
       'Music Venue', 'Kebab Restaurant', 'Malay Restaurant',
       'African Restaurant', 'Irish Pub', 'Portuguese Restaurant',
       'Spanish Restaurant', 'Nightclub', 'Mediterranean Restaurant',
       'Southern / Soul Food Restaurant', 'Polish Re

## Exploring the Dataset

In [58]:
london_pivot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")
london_pivot['Neighborhood'] = london_venues['Neighborhood'] 
#Fix Column Order
fixed_columns = [london_pivot.columns[-1]] + list(london_pivot.columns[:-1])
london_pivot = london_pivot[fixed_columns]

In [59]:
london_pivot = london_pivot.groupby('Neighborhood').mean().reset_index()
london_pivot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Barking and Dagenham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Barnet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bexley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bromley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Sorting the Venues

In [60]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
london_venues_sorted = pd.DataFrame(columns=columns)
london_venues_sorted['Neighborhood'] = london_pivot['Neighborhood']

for ind in np.arange(london_pivot.shape[0]):
    london_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_pivot.iloc[ind, :], num_top_venues)

london_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Barking and Dagenham,Bus Stop,Fishing Store,Liquor Store,Grocery Store,Yoga Studio,English Restaurant,Food,Flea Market,Fish Market,Fish & Chips Shop
1,Barnet,Coffee Shop,Pharmacy,Pizza Place,Convenience Store,Pub,Park,Restaurant,Bookstore,Grocery Store,Modern European Restaurant
2,Bexley,Pub,Italian Restaurant,Fast Food Restaurant,Greek Restaurant,Toy / Game Store,Train Station,Breakfast Spot,Indian Restaurant,Health & Beauty Service,Donut Shop
3,Brent,Coffee Shop,Hotel,Supermarket,Burger Joint,Sandwich Place,Park,Pedestrian Plaza,Café,Electronics Store,Food Court
4,Bromley,Supermarket,Pub,Coffee Shop,Eastern European Restaurant,Flea Market,Fishing Store,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


## Clustering of London

In [61]:
# set number of clusters
kclusters = 5

london_clustering = london_pivot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(london_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int32)

### Merge everything together

In [62]:
# add clustering labels
london_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


london_merged=pd.merge(df_London, london_venues_sorted, on='Neighborhood')


london_merged.head()

Unnamed: 0,New code,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,E09000001,City of London,51.515618,-0.091998,1,Coffee Shop,Gym / Fitness Center,Art Gallery,Seafood Restaurant,Scenic Lookout,Café,Restaurant,Clothing Store,Steakhouse,Italian Restaurant
1,E09000002,Barking and Dagenham,51.554117,0.150504,1,Bus Stop,Fishing Store,Liquor Store,Grocery Store,Yoga Studio,English Restaurant,Food,Flea Market,Fish Market,Fish & Chips Shop
2,E09000003,Barnet,51.65309,-0.200226,1,Coffee Shop,Pharmacy,Pizza Place,Convenience Store,Pub,Park,Restaurant,Bookstore,Grocery Store,Modern European Restaurant
3,E09000004,Bexley,51.441679,0.150488,1,Pub,Italian Restaurant,Fast Food Restaurant,Greek Restaurant,Toy / Game Store,Train Station,Breakfast Spot,Indian Restaurant,Health & Beauty Service,Donut Shop
4,E09000005,Brent,51.563826,-0.27576,1,Coffee Shop,Hotel,Supermarket,Burger Joint,Sandwich Place,Park,Pedestrian Plaza,Café,Electronics Store,Food Court


Generating the Map with Clusters

In [63]:
# create map
london_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(london_merged['latitude'], london_merged['longitude'], london_merged['Neighborhood'], london_merged['Cluster Labels']):
    label = folium.Popup(poi, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(london_map_clusters)
       
london_map_clusters

In [64]:
#Exploring the clusters by adjusting the value cluster_number

cluster_number= 1

london_merged.loc[london_merged['Cluster Labels'] == cluster_number, london_merged.columns[[1] + list(range(5, london_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,City of London,Coffee Shop,Gym / Fitness Center,Art Gallery,Seafood Restaurant,Scenic Lookout,Café,Restaurant,Clothing Store,Steakhouse,Italian Restaurant
1,Barking and Dagenham,Bus Stop,Fishing Store,Liquor Store,Grocery Store,Yoga Studio,English Restaurant,Food,Flea Market,Fish Market,Fish & Chips Shop
2,Barnet,Coffee Shop,Pharmacy,Pizza Place,Convenience Store,Pub,Park,Restaurant,Bookstore,Grocery Store,Modern European Restaurant
3,Bexley,Pub,Italian Restaurant,Fast Food Restaurant,Greek Restaurant,Toy / Game Store,Train Station,Breakfast Spot,Indian Restaurant,Health & Beauty Service,Donut Shop
4,Brent,Coffee Shop,Hotel,Supermarket,Burger Joint,Sandwich Place,Park,Pedestrian Plaza,Café,Electronics Store,Food Court
6,Camden,Pub,Coffee Shop,Café,Burger Joint,Market,Italian Restaurant,Pizza Place,Clothing Store,Ice Cream Shop,Supermarket
7,Croydon,Pub,Coffee Shop,Café,Malay Restaurant,Sushi Restaurant,Gaming Cafe,Nightclub,Museum,Clothing Store,Caribbean Restaurant
8,Ealing,Coffee Shop,Pub,Café,Platform,Clothing Store,Park,Burger Joint,Italian Restaurant,Bakery,Pizza Place
9,Enfield,Clothing Store,Coffee Shop,Café,Supermarket,Pub,Optical Shop,Shopping Mall,Fish & Chips Shop,Bookstore,Department Store
10,Greenwich,Pub,Boat or Ferry,Pizza Place,Bakery,History Museum,Garden,Market,Pier,Café,Burger Joint


### Sum of Interests
Again let's calculate, wich Neighborhood fits most the interests od m friend

In [65]:
london_fit = pd.get_dummies(friends_venues_london[['Venue Category']], prefix="", prefix_sep="")
london_fit['Neighborhood'] = friends_venues_london['Neighborhood'] 
#Fix Column Order
fixed_columns = [london_fit.columns[-1]] + list(london_fit.columns[:-1])
london_fit = london_fit[fixed_columns]

In [66]:
london_fit['Sum']=london_fit.sum(axis=1) # Summe aller Venues berechnen und sortieren, um ein Ranking zu bekommen.

In [67]:
london_fit2 = london_fit.groupby('Neighborhood').sum().reset_index()
london_fit2= london_fit2[['Neighborhood', 'Sum']] # Keeping only Neighborhood and Sum

In [68]:
london_merged=pd.merge(london_merged, london_fit2, on='Neighborhood')
london_merged.sort_values(by=['Sum'], ascending=False, inplace=True)
london_merged.head(5)

Unnamed: 0,New code,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Sum
27,E09000028,Southwark,51.502922,-0.103458,1,Coffee Shop,Hotel,Pub,Gym / Fitness Center,Bar,Sandwich Place,Burger Joint,Cocktail Bar,Italian Restaurant,Café,57
18,E09000019,Islington,51.538429,-0.099905,1,Pub,Mediterranean Restaurant,Burger Joint,Bakery,French Restaurant,Café,Mexican Restaurant,Coffee Shop,Cocktail Bar,Japanese Restaurant,46
20,E09000021,Kingston upon Thames,51.409627,-0.306262,1,Coffee Shop,Italian Restaurant,Café,Pub,Clothing Store,Department Store,Bakery,Sandwich Place,Hotel,Sushi Restaurant,41
8,E09000009,Ealing,51.512655,-0.305195,1,Coffee Shop,Pub,Café,Platform,Clothing Store,Park,Burger Joint,Italian Restaurant,Bakery,Pizza Place,41
0,E09000001,City of London,51.515618,-0.091998,1,Coffee Shop,Gym / Fitness Center,Art Gallery,Seafood Restaurant,Scenic Lookout,Café,Restaurant,Clothing Store,Steakhouse,Italian Restaurant,40


# Finally: New York

Also we start with gathering the Data about the Boroughs and Neighborhoods

In [80]:
# I will go with the data from the course
import json # library to handle JSON files

!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json
print('Data downloaded!')


    
    

Data downloaded!


In [81]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [89]:
ny_neighborhoods_data = newyork_data['features']

Gettin things into a Pandas DataFrame

In [90]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
ny_neighborhoods = pd.DataFrame(columns=column_names)

In [91]:
for data in ny_neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    ny_neighborhoods = ny_neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [92]:
ny_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [93]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(ny_neighborhoods['Borough'].unique()),
        ny_neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [94]:
address = 'New York'

geolocator = Nominatim(user_agent="markus.gorges")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York are {}, {}.'.format(latitude, longitude))



# Creating Folium Map
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough in zip(ny_neighborhoods['Latitude'], ny_neighborhoods['Longitude'], ny_neighborhoods['Neighborhood']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

The geograpical coordinate of New York are 40.7127281, -74.0060152.


In [96]:
ny_venues = getNearbyVenues(names=ny_neighborhoods['Neighborhood'],
                                   latitudes=ny_neighborhoods['Latitude'],
                                   longitudes=ny_neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [97]:
ny_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


In [98]:
friends_venues_ny=ny_venues[ny_venues['Venue Category'].str.contains('|'.join(friends_venues))] #Only keeping the requested Venues from my friend

In [99]:
friends_venues_ny=friends_venues_ny[~friends_venues_ny['Venue Category'].str.contains('|'.join(friends_non_interests))] #Erasing all non interested venues

In [100]:
friends_venues_ny['Venue Category'].unique() #Here a list of all Venues that are now left in the dataframe

array(['Ice Cream Shop', 'Pizza Place', 'Grocery Store',
       'Fast Food Restaurant', 'Restaurant', 'Chinese Restaurant', 'Bar',
       'Caribbean Restaurant', 'Juice Bar', 'Bank', 'Gym',
       'Latin American Restaurant', 'Pub', 'Beer Bar',
       'Mexican Restaurant', 'Spanish Restaurant', 'Bakery', 'Café',
       'Sports Bar', 'Liquor Store', 'Steakhouse', 'Italian Restaurant',
       'American Restaurant', 'Nightclub', 'Gym / Fitness Center',
       'French Restaurant', 'Music Venue', 'African Restaurant',
       'Greek Restaurant', 'Puerto Rican Restaurant', 'Cuban Restaurant',
       'Sports Club', 'Peruvian Restaurant', 'South American Restaurant',
       'Southern / Soul Food Restaurant', 'Middle Eastern Restaurant',
       'Hookah Bar', 'Arepa Restaurant', 'Eastern European Restaurant',
       'Japanese Restaurant', 'Piano Bar', 'Social Club', 'Dive Bar',
       'Salon / Barbershop', 'Comfort Food Restaurant',
       'Caucasian Restaurant', 'New American Restaurant',
      

## Exploring the Dataset

In [101]:
ny_pivot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")
ny_pivot['Neighborhood'] = ny_venues['Neighborhood'] 
#Fix Column Order
fixed_columns = [ny_pivot.columns[-1]] + list(ny_pivot.columns[:-1])
ny_pivot = ny_pivot[fixed_columns]

In [117]:
ny_pivot = ny_pivot.groupby('Neighborhood').mean().reset_index()
ny_pivot.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Arcade,...,Volleyball Court,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Sorting the Venues

In [103]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
ny_venues_sorted = pd.DataFrame(columns=columns)
ny_venues_sorted['Neighborhood'] = ny_pivot['Neighborhood']

for ind in np.arange(ny_pivot.shape[0]):
    ny_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_pivot.iloc[ind, :], num_top_venues)

ny_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allerton,Pizza Place,Deli / Bodega,Spa,Supermarket,Chinese Restaurant,Spanish Restaurant,Gas Station,Breakfast Spot,Fast Food Restaurant,Check Cashing Service
1,Annadale,American Restaurant,Pizza Place,Dance Studio,Train Station,Deli / Bodega,Park,Restaurant,Food,Diner,Women's Store
2,Arden Heights,Deli / Bodega,Pharmacy,Coffee Shop,Bus Stop,Pizza Place,Women's Store,Fish & Chips Shop,Eye Doctor,Factory,Falafel Restaurant
3,Arlington,Grocery Store,Deli / Bodega,Coffee Shop,Bus Stop,Home Service,Boat or Ferry,Women's Store,Factory,Falafel Restaurant,Farm
4,Arrochar,Bus Stop,Italian Restaurant,Deli / Bodega,Pizza Place,Pharmacy,Bagel Shop,Sandwich Place,Athletics & Sports,Supermarket,Polish Restaurant


## Clustering New York

In [120]:
# set number of clusters
kclusters = 10

ny_clustering = ny_pivot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([5, 5, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int32)

In [121]:
# add clustering labels
#ny_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


ny_merged=pd.merge(ny_neighborhoods, ny_venues_sorted, on='Neighborhood')


ny_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,0,Pharmacy,Sandwich Place,Donut Shop,Deli / Bodega,Gas Station,Dessert Shop,Pizza Place,Ice Cream Shop,Laundromat,Fast Food Restaurant
1,Bronx,Co-op City,40.874294,-73.829939,0,Bus Station,Bar,Restaurant,Baseball Field,Chinese Restaurant,Bagel Shop,Pharmacy,Trail,Grocery Store,Fast Food Restaurant
2,Bronx,Eastchester,40.887556,-73.827806,0,Bus Station,Caribbean Restaurant,Deli / Bodega,Diner,Bowling Alley,Donut Shop,Fast Food Restaurant,Chinese Restaurant,Seafood Restaurant,Automotive Shop
3,Bronx,Fieldston,40.895437,-73.905643,0,Medical Supply Store,River,Plaza,Fish & Chips Shop,Event Space,Exhibit,Eye Doctor,Factory,Falafel Restaurant,Farm
4,Bronx,Riverdale,40.890834,-73.912585,0,Park,Playground,Home Service,Bank,Plaza,Bus Station,Food Truck,Moving Target,Gym,Fast Food Restaurant


In [122]:
# create map
ny_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(poi, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(ny_map_clusters)
       
ny_map_clusters

### Sum of interests

In [123]:
ny_fit = pd.get_dummies(friends_venues_ny[['Venue Category']], prefix="", prefix_sep="")
ny_fit['Neighborhood'] = friends_venues_ny['Neighborhood'] 
#Fix Column Order
fixed_columns = [ny_fit.columns[-1]] + list(ny_fit.columns[:-1])
ny_fit = ny_fit[fixed_columns]

In [124]:
ny_fit['Sum']=ny_fit.sum(axis=1) # Summe aller Venues berechnen und sortieren, um ein Ranking zu bekommen.

In [125]:
ny_fit2 = ny_fit.groupby('Neighborhood').sum().reset_index()
ny_fit2= ny_fit2[['Neighborhood', 'Sum']] # Keeping only Neighborhood and Sum

In [126]:
ny_merged=pd.merge(ny_merged, ny_fit2, on='Neighborhood')
ny_merged.sort_values(by=['Sum'], ascending=False, inplace=True)
ny_merged.head(5)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Sum
112,Queens,Murray Hill,40.764126,-73.812763,0,Korean Restaurant,Coffee Shop,Japanese Restaurant,Bar,Sandwich Place,American Restaurant,Gym / Fitness Center,Hotel,Restaurant,Pub,64
111,Manhattan,Murray Hill,40.748303,-73.978332,0,Korean Restaurant,Coffee Shop,Japanese Restaurant,Bar,Sandwich Place,American Restaurant,Gym / Fitness Center,Hotel,Restaurant,Pub,64
116,Manhattan,East Village,40.727847,-73.982226,0,Bar,Pizza Place,Ice Cream Shop,Mexican Restaurant,Wine Bar,Coffee Shop,Italian Restaurant,Korean Restaurant,Speakeasy,Cocktail Bar,60
127,Queens,Astoria,40.768509,-73.915654,0,Middle Eastern Restaurant,Bar,Hookah Bar,Pizza Place,Seafood Restaurant,Mediterranean Restaurant,Bakery,Greek Restaurant,Indian Restaurant,Café,59
93,Brooklyn,South Side,40.710861,-73.958001,0,Bar,Coffee Shop,American Restaurant,Wine Bar,Pizza Place,Yoga Studio,Mexican Restaurant,Burger Joint,Pub,Japanese Restaurant,57
