
<h1 align=center><font size = 5>Battle of Neighbourhoods - Final Assignment</font></h1>

## Initial Setup of libraries

In [53]:
!pip install geopy
!pip install folium
!pip install shapely
!pip install pyproj



In [54]:
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

import re # for regular expressions

# for transforming geocoordinates
import shapely.geometry
import pyproj
import math

import requests # library to handle requests

from sklearn.cluster import KMeans # for clustering

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from collections import Counter # to count repeated items in list

import warnings
warnings.simplefilter('ignore')

print('All Libraries imported :)')

All Libraries imported :)


## Generation of Neighbourhoods

In [55]:
address = 'Marienplatz, munich, Germany'

# Nominatim requires an identifying user agent string.
geolocator = Nominatim(user_agent="Battle of Berlin munich")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
munich_lat = location.latitude
munich_lon = location.longitude
print('The geograpical coordinates for munich are {}, {}.'.format(munich_lat, munich_lon))

The geograpical coordinates for munich are 48.137031750000006, 11.575924590567384.


In [56]:
address = 'Bundestag, Berlin, Germany'

# Nominatim requires an identifying user agent string.
geolocator = Nominatim(user_agent="Battle of Berlin munich")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
berlin_lat = location.latitude
berlin_lon = location.longitude
print('The geograpical coordinates for Berlin are {}, {}.'.format(berlin_lat, berlin_lon))

The geograpical coordinates for Berlin are 52.5185918, 13.3766658.


In [57]:
def lonlat_to_xy(lon, lat):
    """Project WGS84 longitude/latitude to UTM zone 33 planar coordinates.

    Args:
        lon: Longitude in decimal degrees.
        lat: Latitude in decimal degrees.

    Returns:
        Tuple ``(x, y)`` with the UTM zone 33 easting and northing.
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    # NOTE(review): zone 33 is hard-coded for every input; Munich (~11.6°E)
    # nominally lies in zone 32 -- confirm the extra distortion is acceptable.
    proj_xy = pyproj.Proj(proj="utm", zone=33, datum='WGS84')
    # pyproj.transform() is deprecated; Transformer is the supported API.
    # always_xy=True pins the (lon, lat) -> (x, y) argument order.
    transformer = pyproj.Transformer.from_proj(proj_latlon, proj_xy, always_xy=True)
    x, y = transformer.transform(lon, lat)
    return x, y

def xy_to_lonlat(x, y):
    """Convert UTM zone 33 planar coordinates back to WGS84 longitude/latitude.

    Args:
        x: UTM zone 33 easting.
        y: UTM zone 33 northing.

    Returns:
        Tuple ``(lon, lat)`` in decimal degrees.
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=33, datum='WGS84')
    # pyproj.transform() is deprecated; Transformer is the supported API.
    # always_xy=True pins the (x, y) -> (lon, lat) argument order.
    transformer = pyproj.Transformer.from_proj(proj_xy, proj_latlon, always_xy=True)
    lon, lat = transformer.transform(x, y)
    return lon, lat

def calc_xy_distance(x1, y1, x2, y2):
    """Return the straight-line distance between two points in a plane.

    Args:
        x1, y1: Coordinates of the first point.
        x2, y2: Coordinates of the second point.

    Returns:
        The Euclidean distance, in the same unit as the inputs.
    """
    delta_x, delta_y = x2 - x1, y2 - y1
    squared_length = delta_x*delta_x + delta_y*delta_y
    return math.sqrt(squared_length)

# Round-trip sanity check: project the Munich centre to planar coordinates and
# back again; the last line printed should reproduce the first one.
print('Coordinate transformation check')
print('-------------------------------')
print('munich center longitude={}, latitude={}'.format(munich_lon, munich_lat))
x, y = lonlat_to_xy(munich_lon, munich_lat)
print('munich center UTM X={}, Y={}'.format(x, y))
lo, la = xy_to_lonlat(x, y)
print('munich center longitude={}, latitude={}'.format(lo, la))

Coordinate transformation check
-------------------------------
munich center longitude=11.575924590567384, latitude=48.137031750000006
munich center UTM X=245272.96923992495, Y=5337202.67661501
munich center longitude=11.575924590567386, latitude=48.13703174999999


In [58]:
berlin_center_x, berlin_center_y = lonlat_to_xy(berlin_lon, berlin_lat) # City center in Cartesian coordinates
munich_center_x, munich_center_y = lonlat_to_xy(munich_lon, munich_lat)

k = math.sqrt(3) / 2 # Vertical offset for hexagonal grid cells
square_width = 10000
neigborhood_radius = 1500
x_step = neigborhood_radius
y_step = neigborhood_radius * k


def _hex_grid_centers(center_x, center_y):
    """Generate hexagonally packed grid points around one city centre.

    Candidate points are laid out in rows spaced ``y_step`` apart, with every
    even row shifted by half a cell in x. Only candidates whose planar
    distance to the centre is at most ``square_width/2 + 1`` are kept.

    Args:
        center_x, center_y: Planar coordinates of the city centre.

    Returns:
        Tuple ``(latitudes, longitudes, distances, xs, ys)`` of equally long
        lists describing the kept points.
    """
    n_cols = 21
    n_rows = int(21/k)
    x_min = center_x - square_width/2
    y_min = center_y - square_width/2 - (n_rows*k*neigborhood_radius - square_width)/2
    latitudes, longitudes, distances, xs, ys = [], [], [], [], []
    for i in range(0, n_rows):
        y = y_min + i * y_step
        # Shift every other row by half a cell to obtain the hexagonal packing.
        x_offset = neigborhood_radius/2 if i%2==0 else 0
        for j in range(0, n_cols):
            x = x_min + j * x_step + x_offset
            distance_from_center = calc_xy_distance(center_x, center_y, x, y)
            if (distance_from_center <= square_width/2+1):
                lon, lat = xy_to_lonlat(x, y)
                latitudes.append(lat)
                longitudes.append(lon)
                distances.append(distance_from_center)
                xs.append(x)
                ys.append(y)
    return latitudes, longitudes, distances, xs, ys


(berlin_latitudes, berlin_longitudes, berlin_distances_from_center,
 berlin_xs, berlin_ys) = _hex_grid_centers(berlin_center_x, berlin_center_y)
(munich_latitudes, munich_longitudes, munich_distances_from_center,
 munich_xs, munich_ys) = _hex_grid_centers(munich_center_x, munich_center_y)

# The original cell left the Munich planar coordinates in `xs` / `ys`;
# keep those names bound for backward compatibility.
xs, ys = munich_xs, munich_ys

print(len(berlin_latitudes), 'Berlin neighborhood centers generated')
print(len(munich_latitudes), 'munich neighborhood centers generated')

39 Berlin neighborhood centers generated
39 munich neighborhood centers generated


In [59]:
# Base map centred on the geocoded Berlin reference point.
map_berlin = folium.Map(zoom_start=12, location=[berlin_lat, berlin_lon])

# Draw one circle per generated grid centre; its popup shows "lat, lng".
marker_style = dict(
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.5,
    parse_html=False,
)
for lat, lng in zip(berlin_latitudes, berlin_longitudes):
    label = folium.Popup('{}, {}'.format(lat, lng), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=neigborhood_radius/40,
        popup=label,
        **marker_style)
    marker.add_to(map_berlin)

map_berlin

In [60]:
# Base map centred on the geocoded Munich reference point.
map_munich = folium.Map(zoom_start=12, location=[munich_lat, munich_lon])

# Draw one circle per generated grid centre; its popup shows "lat, lng".
marker_style = dict(
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.5,
    parse_html=False,
)
for lat, lng in zip(munich_latitudes, munich_longitudes):
    label = folium.Popup('{}, {}'.format(lat, lng), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=neigborhood_radius/40,
        popup=label,
        **marker_style)
    marker.add_to(map_munich)

map_munich

In [61]:
munich_neighborhoods = []

# Reverse-geocode every generated Munich grid centre into a postal address.
for i in range(len(munich_latitudes)):
    grid_point = (munich_latitudes[i], munich_longitudes[i])
    reverse = geolocator.reverse(grid_point)
    address = reverse[0]
    # The capture group is the sixth-from-last comma-separated component of
    # the address; it is stored as the "Borough" below.
    address_n = re.findall(".*, (.*), .*, .*,.*,.*,.*", address)[0]
    # Coordinates of the matched address (not of the requested grid point).
    matched_coords = reverse[1]
    geo_lat = matched_coords[0]
    geo_lon = matched_coords[1]
    city = "munich"
    munich_neighborhoods.append([address, geo_lat, geo_lon, address_n, city])

munich_neighborhoods = pd.DataFrame(munich_neighborhoods).rename(
    columns={0:"Neighborhood",1:"Latitude",2:"Longitude",3:"Borough",4:"City"})

In [62]:
berlin_neighborhoods = []

# Reverse-geocode every generated Berlin grid centre into a postal address.
for i in range(0,len(berlin_latitudes)):
    reverse = geolocator.reverse((berlin_latitudes[i],berlin_longitudes[i]))
    address = reverse[0]
    try:
        # Preferred pattern: the fourth-from-last comma-separated component.
        address_n = re.findall(".*, (.*),.*,.*,.*", address)[0]
    except IndexError:
        # findall() found no match, so [0] raised IndexError: fall back to
        # everything in front of the last three commas. Catching only
        # IndexError keeps unrelated errors visible.
        address_n = re.findall("(.*),.*,.*,.*", address)[0]

    # Coordinates of the matched address (not of the requested grid point).
    geo_lat = reverse[1][0]
    geo_lon = reverse[1][1]
    city = "Berlin"
    berlin_neighborhoods.append([address, geo_lat, geo_lon, address_n, city])

berlin_neighborhoods = pd.DataFrame(berlin_neighborhoods)
berlin_neighborhoods = berlin_neighborhoods.rename(
    columns={0:"Neighborhood",1:"Latitude",2:"Longitude",3:"Borough",4:"City"})

In [63]:
munich_neighborhoods.head(20)

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Tierparkstraße (Isar-Eingang), Tierparkstraße,...",48.10062,11.551424,Bezirksteil Siebenbrunn,munich
1,"90, Säbener Straße, Bezirksteil Giesing, Unter...",48.10193,11.571492,Bezirksteil Giesing,munich
2,"22, Hohenschwangaustraße, Bezirksteil Obergies...",48.102484,11.591668,Bezirksteil Obergiesing,munich
3,"107, Görzer Straße, Balanstraße-West, Bezirkst...",48.103028,11.611556,Bezirksteil Balanstraße-West,munich
4,"11, Johann-Houis-Straße, Bezirksteil Mittersen...",48.11196,11.520543,Bezirksteil Mittersendling,munich
5,"Brudermühlstraße, Bezirksteil Sendlinger Feld,...",48.112593,11.540597,Bezirksteil Sendlinger Feld,munich
6,"Brudermühlbrücke, Brudermühlstraße, Bezirkstei...",48.112617,11.560151,Bezirksteil Sendlinger Feld,munich
7,"Spielplatz an der Herzogstandstraße ""Louisoder...",48.113854,11.581082,Bezirksteil Obergiesing,munich
8,"15, Ruppertsberger Straße, Bezirksteil Balanst...",48.114605,11.600868,Bezirksteil Balanstraße-West,munich
9,"6, Raiffeisenplatz, Ramersdorf, Bezirksteil Ra...",48.114875,11.620952,Bezirksteil Ramersdorf,munich


In [64]:
berlin_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin
1,"Informationsort Schwerbelastungskörper, 100, G...",52.484143,13.371623,Tempelhof-Schöneberg,Berlin
2,"P 3, Columbiadamm, Tempelhof, Tempelhof-Schöne...",52.48324,13.391364,Tempelhof-Schöneberg,Berlin
3,"Freiluftkino Hasenheide, Columbiadamm, Neuköll...",52.48378,13.416432,Neukölln,Berlin
4,"6, Wittelsbacherstraße, Wilmersdorf, Charlotte...",52.494681,13.314535,Charlottenburg-Wilmersdorf,Berlin


In [65]:
# Stack both cities into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each input's row labels.
neighborhoods = pd.concat([munich_neighborhoods, berlin_neighborhoods], ignore_index=True)

In [66]:
munich_neighborhoods.to_csv("munich_neighborhoods.csv", index=False)
berlin_neighborhoods.to_csv("berlin_neighborhoods.csv", index=False)
neighborhoods.to_csv("neighborhoods.csv", index=False)

## Creation of venue data

In [67]:
munich_neighborhoods = pd.read_csv("munich_neighborhoods.csv")
munich_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Tierparkstraße (Isar-Eingang), Tierparkstraße,...",48.10062,11.551424,Bezirksteil Siebenbrunn,munich
1,"90, Säbener Straße, Bezirksteil Giesing, Unter...",48.10193,11.571492,Bezirksteil Giesing,munich
2,"22, Hohenschwangaustraße, Bezirksteil Obergies...",48.102484,11.591668,Bezirksteil Obergiesing,munich
3,"107, Görzer Straße, Balanstraße-West, Bezirkst...",48.103028,11.611556,Bezirksteil Balanstraße-West,munich
4,"11, Johann-Houis-Straße, Bezirksteil Mittersen...",48.11196,11.520543,Bezirksteil Mittersendling,munich


In [68]:
berlin_neighborhoods = pd.read_csv("berlin_neighborhoods.csv")
berlin_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin
1,"Informationsort Schwerbelastungskörper, 100, G...",52.484143,13.371623,Tempelhof-Schöneberg,Berlin
2,"P 3, Columbiadamm, Tempelhof, Tempelhof-Schöne...",52.48324,13.391364,Tempelhof-Schöneberg,Berlin
3,"Freiluftkino Hasenheide, Columbiadamm, Neuköll...",52.48378,13.416432,Neukölln,Berlin
4,"6, Wittelsbacherstraße, Wilmersdorf, Charlotte...",52.494681,13.314535,Charlottenburg-Wilmersdorf,Berlin


In [69]:
# Headline numbers: neighborhoods per city, then distinct boroughs per city.
berlin_borough_count = len(berlin_neighborhoods.Borough.unique())
munich_borough_count = len(munich_neighborhoods.Borough.unique())

print("There are", len(berlin_neighborhoods), "neighborhoods in Berlin and",
      len(munich_neighborhoods), "in munich")
print("They belong to", berlin_borough_count, "and", munich_borough_count,
      "boroughs respectively.")

There are 39 neighborhoods in Berlin and 39 in munich
They belong to 7 and 34 boroughs respectively.


In [111]:
import os

# Foursquare credentials are read from environment variables so that no secret
# is stored in the notebook itself.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')  # your FourSquare Access Token
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

VERSION = '20180604' # Foursquare API version
limit = 100 # maximum number of venues requested per neighborhood
radius = 1500 # see neighbourhood radius above

In [71]:
#run credentials.py # client_id and client_secret for Foursquare

In [72]:
def getNearbyVenues(neighborhoods, latitudes, longitudes, boroughs, cities, radius):
    """Query the Foursquare ``venues/explore`` endpoint around each neighborhood.

    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``limit`` settings.

    Args:
        neighborhoods: Iterable of neighborhood labels.
        latitudes: Iterable of latitudes, parallel to ``neighborhoods``.
        longitudes: Iterable of longitudes, parallel to ``neighborhoods``.
        boroughs: Iterable of borough names, parallel to ``neighborhoods``.
        cities: Iterable of city names, parallel to ``neighborhoods``.
        radius: Search radius passed to the API as ``radius``.

    Returns:
        DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Address Latitude', 'Address Longitude', 'Borough',
        'City', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame keeps these columns even when no venue was returned.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Address Latitude',
               'Address Longitude',
               'Borough',
               'City',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for neigh, lat, lng, bor, city in zip(neighborhoods, latitudes, longitudes, boroughs, cities):
        # Let requests build and escape the query string; the timeout keeps
        # one unresponsive call from hanging the whole notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Surface HTTP errors here instead of as a KeyError on the JSON below.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                neigh,
                lat,
                lng,
                bor,
                city,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets None instead of raising.
                categories[0]['name'] if categories else None))

    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [73]:
# Fetch the venues around every Berlin neighborhood centre (one API request
# per neighborhood row).
berlin_venues = getNearbyVenues(
    neighborhoods = berlin_neighborhoods['Neighborhood'],
    latitudes = berlin_neighborhoods['Latitude'],
    longitudes = berlin_neighborhoods['Longitude'],
    boroughs = berlin_neighborhoods['Borough'],
    cities = berlin_neighborhoods['City'],
    radius = radius
)

In [74]:
# Fetch the venues around every Munich neighborhood centre (one API request
# per neighborhood row).
munich_venues = getNearbyVenues(
    neighborhoods = munich_neighborhoods['Neighborhood'],
    latitudes = munich_neighborhoods['Latitude'],
    longitudes = munich_neighborhoods['Longitude'],
    boroughs = munich_neighborhoods['Borough'],
    cities = munich_neighborhoods['City'],
    radius = radius
)

In [75]:
all_venues = pd.concat([berlin_venues, munich_venues], ignore_index=True)

In [76]:
all_venues.shape

(7302, 9)

In [77]:
# Keep only the neighborhoods for which at least 100 venues were returned;
# every neighborhood with fewer venues is dropped from the analysis.
venues_per_neighborhood = all_venues.groupby("Neighborhood")["Venue"].transform("count")
all_venues = all_venues[venues_per_neighborhood >= 100]

all_venues.shape

(5700, 9)

In [78]:
all_venues.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Brunnen Goldener Hirsch,52.483355,13.344001,Fountain
1,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Café de Enrico,52.481014,13.349788,Café
2,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Rüyam Gemüse Kebab,52.484807,13.353681,Doner Restaurant
3,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,L’Antica Pizzeria da Michele,52.480267,13.347365,Pizza Place
4,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Dolce Pizza,52.484779,13.345833,Pizza Place


In [79]:
all_venues.to_csv("all_venues.csv", index=False)

## Find Top 10 venue types for each neighborhood and borough

In [80]:
all_venues = pd.read_csv("all_venues.csv")
all_venues.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Brunnen Goldener Hirsch,52.483355,13.344001,Fountain
1,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Café de Enrico,52.481014,13.349788,Café
2,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Rüyam Gemüse Kebab,52.484807,13.353681,Doner Restaurant
3,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,L’Antica Pizzeria da Michele,52.480267,13.347365,Pizza Place
4,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,Dolce Pizza,52.484779,13.345833,Pizza Place


In [81]:
# One indicator column per venue category, followed by the two grouping keys
# (added last, so they end up as the right-most columns).
category_dummies = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")
venue_cat_onehot = category_dummies.assign(
    Neighborhood=all_venues['Neighborhood'],
    Borough=all_venues['Borough'],
)

In [82]:
# Share of each venue category per borough. The string-valued 'Neighborhood'
# column is dropped first: recent pandas versions raise a TypeError when
# .mean() meets a non-numeric column instead of silently discarding it.
boroughs_grouped = (
    venue_cat_onehot
    .drop(columns='Neighborhood')
    .groupby('Borough')
    .mean()
    .reset_index()
)
boroughs_grouped.head()

Unnamed: 0,Borough,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Bezirksteil Altbogenhausen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bezirksteil Am Luitpoldpark,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bezirksteil Am Schlachthof,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
3,Bezirksteil Echarding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,Bezirksteil Englischer Garten Süd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# Share of each venue category per neighborhood. The string-valued 'Borough'
# column is dropped first: recent pandas versions raise a TypeError when
# .mean() meets a non-numeric column instead of silently discarding it.
neighborhoods_grouped = (
    venue_cat_onehot
    .drop(columns='Borough')
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
neighborhoods_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,"0, 21, Arcisstraße, Bezirksteil Königsplatz, M...",0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"1, Burgunderstraße, Bezirksteil Am Luitpoldpar...",0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"101, Gartenstraße, Spandauer Vorstadt, Mitte, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,"13, Hans-Otto-Straße, Bötzowviertel, Prenzlaue...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.01,0.0,0.0
4,"13, Löwestraße, Hausburgviertel, Friedrichshai...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0


In [84]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Args:
        row: pandas Series whose first entry is skipped; all remaining
            entries are ranked by value.
        num_top_venues: Maximum number of labels to return.

    Returns:
        Array of index labels, ordered from highest to lowest value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [85]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # The first three ranks take 'st' / 'nd' / 'rd', every later rank 'th'.
    # An explicit bounds check replaces the former bare `except:`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = neighborhoods_grouped['Neighborhood']

# fill each row with the most common venue categories of that neighborhood
for ind in np.arange(neighborhoods_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(neighborhoods_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"0, 21, Arcisstraße, Bezirksteil Königsplatz, M...",Café,Plaza,Art Museum,Italian Restaurant,Ice Cream Shop,Steakhouse,Bar,Vietnamese Restaurant,Bakery,Coffee Shop
1,"1, Burgunderstraße, Bezirksteil Am Luitpoldpar...",Park,Greek Restaurant,Drugstore,Italian Restaurant,Pizza Place,Bar,Bakery,Pool,Restaurant,Museum
2,"101, Gartenstraße, Spandauer Vorstadt, Mitte, ...",Coffee Shop,Café,Hotel,Ice Cream Shop,Bookstore,Bakery,Park,Italian Restaurant,Playground,Theater
3,"13, Hans-Otto-Straße, Bötzowviertel, Prenzlaue...",Café,Park,Italian Restaurant,Playground,Bakery,Coffee Shop,Vietnamese Restaurant,Indie Movie Theater,Ice Cream Shop,Pizza Place
4,"13, Löwestraße, Hausburgviertel, Friedrichshai...",Café,Vegetarian / Vegan Restaurant,Coffee Shop,Pizza Place,Ice Cream Shop,Park,Middle Eastern Restaurant,Italian Restaurant,Cocktail Bar,Nightclub


In [86]:
columns = ['Borough']
for ind in np.arange(num_top_venues):
    # The first three ranks take 'st' / 'nd' / 'rd', every later rank 'th'.
    # An explicit bounds check replaces the former bare `except:`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = boroughs_grouped['Borough']

# fill each row with the most common venue categories of that borough
for ind in np.arange(boroughs_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(boroughs_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bezirksteil Altbogenhausen,Italian Restaurant,Restaurant,Hotel,Plaza,Beer Garden,Supermarket,Bakery,Café,Indian Restaurant,Gourmet Shop
1,Bezirksteil Am Luitpoldpark,Park,Greek Restaurant,Drugstore,Italian Restaurant,Pizza Place,Bar,Bakery,Pool,Restaurant,Museum
2,Bezirksteil Am Schlachthof,Café,German Restaurant,Italian Restaurant,Ice Cream Shop,Bar,Vietnamese Restaurant,Asian Restaurant,Burger Joint,Drugstore,Bistro
3,Bezirksteil Echarding,Italian Restaurant,Café,Plaza,German Restaurant,Thai Restaurant,Bar,Gym / Fitness Center,French Restaurant,Supermarket,Hotel
4,Bezirksteil Englischer Garten Süd,Café,Italian Restaurant,Plaza,Ice Cream Shop,German Restaurant,Trattoria/Osteria,Pizza Place,Bar,Cocktail Bar,River


In [87]:
# Distinct (Borough, City) pairs with a fresh 0..n-1 index. Chaining avoids
# calling drop_duplicates(inplace=True) on a slice of all_venues.
boroughs = (
    all_venues[["Borough", "City"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
boroughs.head()

Unnamed: 0,Borough,City
0,Tempelhof-Schöneberg,Berlin
1,Neukölln,Berlin
2,Charlottenburg-Wilmersdorf,Berlin
3,Schöneberg,Berlin
4,Friedrichshain-Kreuzberg,Berlin


In [88]:
boroughs_loc = []
geolocator = Nominatim(user_agent="munich_explorer")

# Geocode every borough by the query "<borough>, <city>". Iterating over the
# column values avoids depending on the frame having a 0..n-1 index.
for borough, city in zip(boroughs["Borough"], boroughs["City"]):
    address = '{}, {}'.format(borough, city)
    location = geolocator.geocode(address)
    # geocode() returns None when nothing matches; report which query failed
    # instead of raising an AttributeError on the attribute access below.
    if location is None:
        raise ValueError('Could not geocode borough: {}'.format(address))
    lat = location.latitude
    lon = location.longitude
    boroughs_loc.append([borough, city, lat, lon])

boroughs_loc = pd.DataFrame(boroughs_loc, columns=["Borough", "City", "Latitude", "Longitude"])
boroughs_loc.head()

Unnamed: 0,Borough,City,Latitude,Longitude
0,Tempelhof-Schöneberg,Berlin,52.440603,13.373703
1,Neukölln,Berlin,52.48115,13.43535
2,Charlottenburg-Wilmersdorf,Berlin,52.507856,13.263952
3,Schöneberg,Berlin,52.482157,13.35519
4,Friedrichshain-Kreuzberg,Berlin,52.501115,13.444285


In [89]:
# One row per neighborhood with its location and parent borough / city,
# re-indexed from 0.
location_columns = ["Neighborhood","Address Latitude","Address Longitude","Borough","City"]
venues_loc = all_venues[location_columns].drop_duplicates().reset_index(drop=True)

In [90]:
neighborhoods_grouped.to_csv("neighborhoods_grouped.csv", index=False)
boroughs_grouped.to_csv("boroughs_grouped.csv", index=False)
neighborhoods_venues_sorted.to_csv("neighborhoods_venues_sorted.csv", index=False)
boroughs_venues_sorted.to_csv("boroughs_venues_sorted.csv", index=False)
boroughs_loc.to_csv("boroughs_loc.csv", index=False)
venues_loc.to_csv("venues_loc.csv", index=False)

## Cluster neighborhoods and boroughs
I want to have relatively small clusters, with an average of 4 neighborhoods or boroughs per cluster.

In [91]:
neighborhoods_grouped = pd.read_csv("neighborhoods_grouped.csv")
boroughs_grouped = pd.read_csv("boroughs_grouped.csv")
neighborhoods_venues_sorted = pd.read_csv("neighborhoods_venues_sorted.csv")
boroughs_venues_sorted = pd.read_csv("boroughs_venues_sorted.csv")
all_venues = pd.read_csv("all_venues.csv")
boroughs_loc = pd.read_csv("boroughs_loc.csv")
venues_loc = pd.read_csv("venues_loc.csv")

In [92]:
# Aim for an average of four members per cluster.
kclusters_n = round(neighborhoods_grouped.shape[0]/4)
kclusters_b = round(boroughs_grouped.shape[0]/4)

# Feature matrices without the label column. `columns=` replaces the
# positional axis argument (`drop(label, 1)`), which pandas 2.0 removed.
neighborhood_clustering = neighborhoods_grouped.drop(columns='Neighborhood')
borough_clustering = boroughs_grouped.drop(columns='Borough')

In [93]:
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', which would otherwise alter the clusters between library versions.
kmeans_n = KMeans(n_clusters=kclusters_n, random_state=0, n_init=10).fit(neighborhood_clustering)
kmeans_b = KMeans(n_clusters=kclusters_b, random_state=0, n_init=10).fit(borough_clustering)

In [94]:
# Remove labels left over from a previous run so this cell can be re-executed:
# DataFrame.insert raises if the column already exists.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
boroughs_venues_sorted = boroughs_venues_sorted.drop(columns='Cluster Labels', errors='ignore')

# The labels are attached by position, in the row order of the frames the
# models were fitted on.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans_n.labels_)
boroughs_venues_sorted.insert(0, 'Cluster Labels', kmeans_b.labels_)

In [95]:
# Attach the cluster label and ranked venue categories to each neighborhood's
# location row (inner join on the neighborhood name).
neighborhoods_merged = venues_loc.merge(neighborhoods_venues_sorted, on='Neighborhood')
neighborhoods_merged.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"47, Hauptstraße, Rote Insel, Schöneberg, Tempe...",52.483193,13.34842,Tempelhof-Schöneberg,Berlin,4,Café,Bakery,Pizza Place,Cocktail Bar,Italian Restaurant,French Restaurant,Ice Cream Shop,Vietnamese Restaurant,Plaza,Bistro
1,"Informationsort Schwerbelastungskörper, 100, G...",52.484143,13.371623,Tempelhof-Schöneberg,Berlin,5,Italian Restaurant,Café,Ice Cream Shop,Pizza Place,Gym / Fitness Center,Cocktail Bar,French Restaurant,Bistro,Bar,Organic Grocery
2,"P 3, Columbiadamm, Tempelhof, Tempelhof-Schöne...",52.48324,13.391364,Tempelhof-Schöneberg,Berlin,5,Italian Restaurant,Café,Bar,Ice Cream Shop,Historic Site,Music Venue,Cocktail Bar,Coffee Shop,Park,Gym
3,"Freiluftkino Hasenheide, Columbiadamm, Neuköll...",52.48378,13.416432,Neukölln,Berlin,0,Bar,Café,Coffee Shop,Pizza Place,Indie Movie Theater,Italian Restaurant,Vegetarian / Vegan Restaurant,Turkish Restaurant,Farmers Market,Garden
4,"6, Wittelsbacherstraße, Wilmersdorf, Charlotte...",52.494681,13.314535,Charlottenburg-Wilmersdorf,Berlin,3,Bakery,Italian Restaurant,Hotel,Drugstore,Plaza,Supermarket,Coffee Shop,Wine Shop,Gourmet Shop,Noodle House


In [107]:
# Attach the cluster label and ranked venue categories to each borough's
# location row (inner join on the borough name).
boroughs_merged = boroughs_loc.merge(boroughs_venues_sorted, on='Borough')
boroughs_merged.head(100)

Unnamed: 0,Borough,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Tempelhof-Schöneberg,Berlin,52.440603,13.373703,3,Café,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Bakery,Bar,Pizza Place,Park,Vietnamese Restaurant,French Restaurant
1,Neukölln,Berlin,52.48115,13.43535,3,Bar,Café,Coffee Shop,Pizza Place,Indie Movie Theater,Italian Restaurant,Vegetarian / Vegan Restaurant,Turkish Restaurant,Farmers Market,Garden
2,Charlottenburg-Wilmersdorf,Berlin,52.507856,13.263952,3,Hotel,Italian Restaurant,Bakery,Gourmet Shop,Coffee Shop,Plaza,Café,Supermarket,Dessert Shop,Drugstore
3,Schöneberg,Berlin,52.482157,13.35519,3,Café,Coffee Shop,Bar,Hotel,Pizza Place,Plaza,Ice Cream Shop,Cocktail Bar,Park,Organic Grocery
4,Friedrichshain-Kreuzberg,Berlin,52.501115,13.444285,3,Coffee Shop,Bar,Café,Italian Restaurant,Hotel,Ice Cream Shop,Cocktail Bar,Vegetarian / Vegan Restaurant,Park,Bakery
5,Mitte,Berlin,52.517885,13.40406,3,Hotel,Café,Coffee Shop,Bar,Park,Plaza,Bakery,Italian Restaurant,Ice Cream Shop,Art Gallery
6,Pankow,Berlin,52.597917,13.435316,3,Café,Bakery,Coffee Shop,Ice Cream Shop,Vietnamese Restaurant,Italian Restaurant,Park,Bar,Wine Bar,Beer Bar
7,Bezirksteil Sendlinger Feld,munich,48.114144,11.550177,5,Italian Restaurant,Café,German Restaurant,Bakery,Supermarket,Hotel,Greek Restaurant,Bar,Ice Cream Shop,Gastropub
8,Bezirksteil Am Schlachthof,munich,48.124568,11.555181,5,Café,German Restaurant,Italian Restaurant,Ice Cream Shop,Bar,Vietnamese Restaurant,Asian Restaurant,Burger Joint,Drugstore,Bistro
9,Bezirksteil Glockenbach,munich,48.127864,11.570614,6,Café,German Restaurant,Ice Cream Shop,Italian Restaurant,Coffee Shop,Plaza,Bar,Vietnamese Restaurant,Cocktail Bar,Bavarian Restaurant


In [97]:
neighborhoods_merged.to_csv("neighborhoods_merged.csv", index=False)
boroughs_merged.to_csv("boroughs_merged.csv", index=False)

## Map neighborhood clusters

In [109]:
# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters_n))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# create map
map_clusters = folium.Map(location=[munich_lat, munich_lon], zoom_start=12)

# add markers to the map
for lat, lon, poi, cluster in zip(neighborhoods_merged['Address Latitude'],
                                  neighborhoods_merged['Address Longitude'],
                                  neighborhoods_merged['Neighborhood'],
                                  neighborhoods_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly
    # (the former `cluster-1` wrapped label 0 around to the last colour).
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=35, # neighborhood_radius/40
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.5,
        parse_html=False
    ).add_to(map_clusters)

map_clusters

## Map borough clusters

In [106]:
# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters_b))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# create map
map_clusters_b = folium.Map(location=[berlin_lat, berlin_lon], zoom_start=12)

# add markers to the map
for lat, lon, poi, cluster in zip(boroughs_merged['Latitude'],
                                  boroughs_merged['Longitude'],
                                  boroughs_merged['Borough'],
                                  boroughs_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly
    # (the former `cluster-1` wrapped label 0 around to the last colour).
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=40, # neighborhood_radius/40
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.5,
        parse_html=False
    ).add_to(map_clusters_b)

map_clusters_b

## Explore neighborhood clusters
The boroughs are clearly too large to be in any way distinct. I will therefore continue exploring the neighborhoods.

In [101]:
cluster_neigh_num = []
for i in range(0,kclusters_n):
    # Neighborhoods per city inside cluster i. A city that does not occur in
    # the cluster has no entry, hence the .get(..., 0) defaults; this replaces
    # the former nested bare `except:` blocks around positional lookups.
    in_cluster = neighborhoods_merged["Cluster Labels"] == i
    city_counts = neighborhoods_merged.loc[in_cluster, "City"].value_counts()
    munich = int(city_counts.get("munich", 0))
    berlin = int(city_counts.get("Berlin", 0))

    cluster_neigh_num.append({"Cluster Labels": i, "munich": munich, "Berlin": berlin})
    print("Cluster", i, "has", munich, "munich neighborhoods and", berlin, "Berlin neighborhoods.")

cluster_neigh_num = pd.DataFrame(cluster_neigh_num)

Cluster 0 has 0 munich neighborhoods and 3 Berlin neighborhoods.
Cluster 1 has 5 munich neighborhoods and 0 Berlin neighborhoods.
Cluster 2 has 0 munich neighborhoods and 7 Berlin neighborhoods.
Cluster 3 has 4 munich neighborhoods and 2 Berlin neighborhoods.
Cluster 4 has 0 munich neighborhoods and 5 Berlin neighborhoods.
Cluster 5 has 1 munich neighborhoods and 4 Berlin neighborhoods.
Cluster 6 has 0 munich neighborhoods and 4 Berlin neighborhoods.
Cluster 7 has 4 munich neighborhoods and 0 Berlin neighborhoods.
Cluster 8 has 0 munich neighborhoods and 3 Berlin neighborhoods.
Cluster 9 has 0 munich neighborhoods and 3 Berlin neighborhoods.
Cluster 10 has 0 munich neighborhoods and 3 Berlin neighborhoods.
Cluster 11 has 0 munich neighborhoods and 2 Berlin neighborhoods.
Cluster 12 has 2 munich neighborhoods and 0 Berlin neighborhoods.
Cluster 13 has 5 munich neighborhoods and 0 Berlin neighborhoods.


In [102]:
cluster_neigh_num[(cluster_neigh_num.munich > 0) & (cluster_neigh_num.Berlin > 0)]

Unnamed: 0,Cluster Labels,munich,Berlin
3,3,4,2
5,5,1,4


In [103]:
# Clusters that contain neighborhoods from both cities.
cluster_list = cluster_neigh_num[(cluster_neigh_num.munich > 0) & (cluster_neigh_num.Berlin > 0)]["Cluster Labels"].tolist()

# Select the ranked-venue columns by name instead of by a fixed positional
# slice, so the selection does not depend on the number of leading columns.
venue_columns = [c for c in neighborhoods_merged.columns if str(c).endswith('Most Common Venue')]

# One flat list per mixed cluster holding every ranked venue category of its
# neighborhoods, row by row. Building the lists with append avoids the
# `[[]]*n` idiom, which repeats one shared list object.
venues_list = []
for j in cluster_list:
    cluster_rows = neighborhoods_merged.loc[neighborhoods_merged["Cluster Labels"] == j, venue_columns]
    venues_list.append(cluster_rows.values.ravel().tolist())

In [104]:
# For every mixed-city cluster: show the five venue categories that occur most
# often among its neighborhoods' ranked lists, then list its neighborhoods.
for i in range(0,len(cluster_list)):
    # Count how often each category appears in the flattened venue list.
    venues_count = Counter(venues_list[i])
    # Turn the counter into a frame: column 'index' holds the category,
    # column 0 holds its count.
    venues_count = pd.DataFrame.from_dict(venues_count, orient='index').reset_index()
    j = cluster_list[i]
    print("Top 5 venues types in Cluster", j)
    display(venues_count.sort_values(0, ascending=False).head())
    display(neighborhoods_merged[neighborhoods_merged["Cluster Labels"] == j].Neighborhood.values.tolist())
    print("\n")

Top 5 venues types in Cluster 3


Unnamed: 0,index,0
2,Hotel,6
1,Italian Restaurant,6
0,Bakery,5
5,Supermarket,5
10,Café,4


['6, Wittelsbacherstraße, Wilmersdorf, Charlottenburg-Wilmersdorf, Berlin, 10707, Deutschland',
 '1A, Guerickestraße, Charlottenburg, Charlottenburg-Wilmersdorf, Berlin, 10587, Deutschland',
 'Brudermühlstraße, Bezirksteil Sendlinger Feld, Sendling, München, Bayern, 81371, Deutschland',
 'Zschokkestraße, Neu-Friedenheim, Bezirksteil Friedenheim, Laim, München, Bayern, 80686, Deutschland',
 '155, Prinzregentenstraße, Parkstadt Bogenhausen, Bezirksteil Parkstadt, Bogenhausen, München, Bayern, 81677, Deutschland',
 '18, Scheinerstraße, Altbogenhausen, Bezirksteil Altbogenhausen, Bogenhausen, München, Bayern, 81679, Deutschland']



Top 5 venues types in Cluster 5


Unnamed: 0,index,0
0,Italian Restaurant,5
1,Café,5
8,Bar,5
5,Cocktail Bar,4
7,Bistro,3


['Informationsort Schwerbelastungskörper, 100, General-Pape-Straße, Tempelhof, Tempelhof-Schöneberg, Berlin, 12101, Deutschland',
 'P 3, Columbiadamm, Tempelhof, Tempelhof-Schöneberg, Berlin, 12101, Deutschland',
 '14, Wartenburgstraße, Kreuzberg, Friedrichshain-Kreuzberg, Berlin, 10963, Deutschland',
 'Freiwillige Feuerwehr Urban, Wilmsstraße, Kreuzberg, Friedrichshain-Kreuzberg, Berlin, 10961, Deutschland',
 'Brudermühlbrücke, Brudermühlstraße, Bezirksteil Sendlinger Feld, Sendling, München, Bayern, 81379, Deutschland']



