# Warsaw Neighborhoods - getting data and clustering

In [477]:
import json
import os
import time

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

Ideas:
- getting the coordinates for each postcode
- getting the venues around each postcode
- clustering similar areas of the city
- getting the centroids of those similar areas
- calculating the distance and the travel time between those centroids
- identifying the work areas in Warsaw (open question)
- getting the flat prices in Warsaw (price per m²)
- finding the best, most likable places in Warsaw
- getting the rent costs for each area? It may not be possible; perhaps only purchase prices are available.

This can be useful for finding the most appropriate areas for a given person: the areas they like most, the best areas overall, and the areas that are similar to those. It also covers calculating the distance and travel time between the liked places and work, and visualising the results.

The problem it solves is how and where to sell a flat to each person: which place is the most appropriate for them, with the least travel time and the best price range. It is based purely on the ergonomics of life in the city and on geospatial location, rather than on the actual state of the flats. It is really an optimisation over everything: not simply looking for the best place, but for the best place according to many criteria.

In [235]:
# Notebook-wide pandas display options: show every column, keep wide frames on
# a single row block, and never truncate long cell values.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` is the supported "no limit" value (pandas >= 1.0). The old `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

## Getting the addresses and postcodes of Warsaw City

In [236]:
# Load the semicolon-separated Warsaw address file into `df` and display it.
addresses_csv = 'adresy-Warszawa.csv'
df = pd.read_csv(addresses_csv, delimiter=';')
df

Unnamed: 0,dzielnica,nazwa_peln,nazwa_skr,numer,x,y,id_instancji,kod
0,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,101,7.494187e+06,5.789198e+06,1272121,01-355
1,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,102,7.494238e+06,5.789209e+06,1272123,01-355
2,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,103,7.494222e+06,5.789175e+06,1272107,01-355
3,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,104,7.494244e+06,5.789179e+06,1272108,01-355
4,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,105,7.494265e+06,5.789184e+06,1272110,01-355
...,...,...,...,...,...,...,...,...
112615,Żoliborz,ulica Zygmunta Krasińskiego,ul. Z. Krasińskiego,67,7.497536e+06,5.791730e+06,1435331,01-755
112616,Żoliborz,ulica Zygmunta Krasińskiego,ul. Z. Krasińskiego,69,7.497481e+06,5.791741e+06,1435334,01-755
112617,Żoliborz,ulica Zygmunta Krasińskiego,ul. Z. Krasińskiego,7,7.499482e+06,5.792954e+06,1434278,01-530
112618,Żoliborz,ulica Zygmunta Krasińskiego,ul. Z. Krasińskiego,8,7.499286e+06,5.792946e+06,1434266,01-601


In [237]:
# Keep only the first address row seen for every postcode ('kod'),
# then renumber the remaining rows from 0.
first_row_per_postcode = df.drop_duplicates(subset=['kod'])
data = first_row_per_postcode.reset_index(drop=True)
data.head()

Unnamed: 0,dzielnica,nazwa_peln,nazwa_skr,numer,x,y,id_instancji,kod
0,Bemowo,Osiedle Przyjaźń,os. Przyjaźń,101,7494187.0,5789198.0,1272121,01-355
1,Bemowo,Plac Kasztelański,pl. Kasztelański,1,7494292.0,5788650.0,1271876,01-362
2,Bemowo,ulica Antoniego Bolesława Dobrowolskiego,ul. A. B. Dobrowolskiego,1,7492704.0,5791632.0,1273498,01-483
3,Bemowo,ulica Arki Bożka,ul. A. Bożka,1,7494352.0,5789961.0,1275040,01-464
4,Bemowo,ulica Alberta Einsteina,ul. A. Einsteina,1,7492734.0,5791192.0,1273347,01-480


In [238]:
# Discard the address-level columns; only postcode, borough and short street
# name are kept from here on.
unused_columns = ['nazwa_peln', 'numer', 'x', 'y', 'id_instancji']
data = data.drop(unused_columns, axis=1)
data.head()

Unnamed: 0,dzielnica,nazwa_skr,kod
0,Bemowo,os. Przyjaźń,01-355
1,Bemowo,pl. Kasztelański,01-362
2,Bemowo,ul. A. B. Dobrowolskiego,01-483
3,Bemowo,ul. A. Bożka,01-464
4,Bemowo,ul. A. Einsteina,01-480


In [239]:
# Put the postcode first, then replace the Polish column names with English ones.
column_order = ['kod', 'dzielnica', 'nazwa_skr']
english_names = {'kod': 'Postcode', 'dzielnica': 'Borough', 'nazwa_skr': 'Street'}
data = data.reindex(columns=column_order).rename(columns=english_names)
data.head()

Unnamed: 0,Postcode,Borough,Street
0,01-355,Bemowo,os. Przyjaźń
1,01-362,Bemowo,pl. Kasztelański
2,01-483,Bemowo,ul. A. B. Dobrowolskiego
3,01-464,Bemowo,ul. A. Bożka
4,01-480,Bemowo,ul. A. Einsteina


In [240]:
# Number of rows (one per distinct postcode) in each borough.
postcodes_per_borough = data.groupby('Borough').size()
postcodes_per_borough

Borough
Bemowo            106
Białołęka         184
Bielany           194
Mokotów           355
Ochota            192
Praga-Południe    378
Praga-Północ      144
Rembertów         114
Targówek          267
Ursus             5  
Ursynów           96 
Wawer             444
Wesoła            2  
Wilanów           39 
Wola              313
Włochy            206
Śródmieście       551
Żoliborz          150
dtype: int64

In [241]:
# Add empty (NaN) coordinate columns as placeholders for each postcode's position.
data = data.assign(Latitude=np.nan, Longitude=np.nan)
data.head(10)

Unnamed: 0,Postcode,Borough,Street,Latitude,Longitude
0,01-355,Bemowo,os. Przyjaźń,,
1,01-362,Bemowo,pl. Kasztelański,,
2,01-483,Bemowo,ul. A. B. Dobrowolskiego,,
3,01-464,Bemowo,ul. A. Bożka,,
4,01-480,Bemowo,ul. A. Einsteina,,
5,01-473,Bemowo,ul. A. Kocjana,,
6,01-391,Bemowo,ul. A. Krzywoń,,
7,01-494,Bemowo,ul. A. Sołtana,,
8,01-318,Bemowo,ul. A. Świętochowskiego,,
9,01-497,Bemowo,ul. Afrodyty,,


In [280]:
from geopy.geocoders import Nominatim

# The address looked up here is used as the reference point for the centre of Warsaw.
address = 'Pałac Kultury, Warszawa'

geolocator = Nominatim(user_agent="warsaw")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here
# instead of an AttributeError on `location.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of center of Warsaw are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of center of Warsaw are 52.2317641, 21.005799675616117.


### Missing data - 1248, 1309, 1310, 1311, 1901, 1902, 1903, 2200, 2201, 2202, 2499, 2729, 2730, 2731, 2792, 2793, 2794, 3092, 3391, 3685, 3686, 3687,

In [347]:
# Checkpoint the postcode table to disk (the row index is written as the first CSV column).
data.to_csv('data2.csv')

In [357]:
# NOTE(review): `data_fill` is not defined in any visible cell (execution counts
# jump from 347 to 357), so this fails on Restart & Run All. Presumably it is
# `data` with Latitude/Longitude filled in — TODO restore the cell that builds it.
data_fill.to_csv('data_fill.csv')

In [362]:
# Show the last rows of `data_fill` to check that Latitude/Longitude are populated.
data_fill.tail()

Unnamed: 0,Postcode,Borough,Street,Latitude,Longitude
3735,01-580,Żoliborz,ul. Z. Krasińskiego,52.267,20.9801
3736,01-769,Żoliborz,ul. Z. Krasińskiego,52.2658,20.9733
3737,01-784,Żoliborz,ul. Z. Krasińskiego,52.2651,20.9743
3738,01-779,Żoliborz,ul. Z. Krasińskiego,52.2628,20.9689
3739,01-755,Żoliborz,ul. Z. Krasińskiego,52.2608,20.9667


In [408]:
# NOTE(review): `data_fill_droped` is not defined in any visible cell. Presumably it is
# `data_fill` without the rows that have no coordinates — TODO confirm and restore that step.
data_fill_droped.to_csv('data_fill_drop.csv')

In [409]:
# Create a map of Warsaw centred on the previously geocoded latitude/longitude.
map_warsaw = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one small circle per postcode; clicking a circle shows the postcode.
for lat, lng, postcode in zip(data_fill_droped['Latitude'], data_fill_droped['Longitude'], data_fill_droped['Postcode']):
    # parse_html=True makes folium treat the postcode as plain text, not HTML markup.
    postcode_popup = folium.Popup(postcode, parse_html=True)
    # `parse_html` is a Popup option, so it is no longer passed to CircleMarker,
    # where it was mixed in with the circle's styling options.
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=postcode_popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_warsaw)


In [410]:
# Write the map to an HTML file that can be opened in a browser.
map_warsaw.save('warsaw.html')

# Downloading the venue data via Foursquare API

In [441]:
# Get Foursquare credentials

# Foursquare credentials are read from the environment so that the secret never
# ends up in the notebook source or in its saved output. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be regenerated.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Missing environment variable {}; set FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET before running this cell.'.format(missing)
    ) from None
VERSION = '20191231'  # Foursquare API version
LIMIT = 100  # sent as the `limit` query parameter of every request

radius = 150  # sent as the `radius` query parameter of the probe request

# Only confirm that credentials are loaded; never echo the secret itself.
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID[:4] + '...')
print('CLIENT_SECRET: ' + '*' * 8)

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET: <redacted>


In [442]:
# Probe request: explore the venues around `latitude`/`longitude` to inspect
# the shape of the response.
# Query values go through `params` so that requests URL-encodes them and the
# secret is not baked into the `url` string.
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(latitude, longitude),
    'radius': radius,
    'limit': LIMIT,
}

# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, params=params, timeout=30)
# Fail here on an HTTP error rather than with a KeyError while indexing the payload.
response.raise_for_status()
results = response.json()["response"]['groups'][0]['items']
results

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '4b8c1c46f964a52085be32e3',
   'name': 'Pałac Kultury i Nauki',
   'location': {'address': 'pl. Defilad 1',
    'lat': 52.231906208533545,
    'lng': 21.007063693321104,
    'labeledLatLngs': [{'label': 'display',
      'lat': 52.231906208533545,
      'lng': 21.007063693321104}],
    'distance': 87,
    'postalCode': '00-901',
    'cc': 'PL',
    'neighborhood': 'Śródmieście Północne',
    'city': 'Warszawa',
    'state': 'Województwo mazowieckie',
    'country': 'Polska',
    'formattedAddress': ['pl. Defilad 1', '00-901 Warszawa', 'Polska']},
   'categories': [{'id': '4bf58dd8d48988d130941735',
     'name': 'Building',
     'pluralName': 'Buildings',
     'shortName': 'Building',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/building/default_',
      'suffix': '.png'},
     'primary': True}],
   'photo

In [443]:
# Flatten the probe response into one tuple per venue:
# (query latitude, query longitude, venue name, venue latitude, venue longitude, first category name).
# The tuples are wrapped in an outer list, so `venues_list` holds a single inner list.
venues_list = []

centre_venues = []
for item in results:
    venue = item['venue']
    centre_venues.append((
        latitude,
        longitude,
        venue['name'],
        venue['location']['lat'],
        venue['location']['lng'],
        venue['categories'][0]['name'],
    ))

venues_list.append(centre_venues)

In [444]:
# Display the nested list of venue tuples collected for the probe location.
venues_list

[[(52.2317641,
   21.005799675616117,
   'Pałac Kultury i Nauki',
   52.231906208533545,
   21.007063693321104,
   'Building'),
  (52.2317641,
   21.005799675616117,
   'Bar Studio',
   52.23245210691395,
   21.006704775325204,
   'Cocktail Bar'),
  (52.2317641,
   21.005799675616117,
   'Kinoteka',
   52.23108684369295,
   21.006662986402535,
   'Movie Theater'),
  (52.2317641,
   21.005799675616117,
   'Muzeum Domków Dla Lalek',
   52.23115169570596,
   21.005695120397654,
   'Museum'),
  (52.2317641,
   21.005799675616117,
   'Teatr Dramatyczny',
   52.231435085260294,
   21.00755139352701,
   'Theater'),
  (52.2317641,
   21.005799675616117,
   'Cafe Kulturalna',
   52.23118087367174,
   21.007068363608944,
   'Cocktail Bar'),
  (52.2317641,
   21.005799675616117,
   'Teatr Studio im. St. I. Witkiewicza',
   52.23227027366112,
   21.007110904074768,
   'Theater'),
  (52.2317641,
   21.005799675616117,
   'Titanic The Exhibition',
   52.23183685971087,
   21.007330299094573,
   'Exh

In [446]:
def getNearbyVenues(names, boroughs, latitudes, longitudes, radius=150):
    """Return the Foursquare venues found around each of the given locations.

    One ``venues/explore`` request is sent per location, using the notebook-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` values. Progress is
    printed as ``name|``. A location whose request fails or whose response does
    not have the expected structure is reported as ``____name____`` and skipped,
    so one bad location does not discard the venues already collected.

    Parameters
    ----------
    names : iterable of str
        Label of each location (stored in the ``Postcode`` column).
    boroughs : iterable
        Borough of each location.
    latitudes, longitudes : iterable
        Coordinates of each location, sent as the ``ll`` query parameter.
    radius : int, optional
        Value of the ``radius`` query parameter (default 150).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Postcode``, ``Borough``,
        ``Postcode Latitude``, ``Postcode Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty (but has these columns) when nothing was found.
    """
    columns = ['Postcode',
               'Borough',
               'Postcode Latitude',
               'Postcode Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, borough, lat, lng in zip(names, boroughs, latitudes, longitudes):
        print(name + '|', end='')

        try:
            # make the GET request; `params` lets requests URL-encode the values,
            # and the timeout keeps one stalled call from hanging the whole run
            response = requests.get(
                'https://api.foursquare.com/v2/venues/explore',
                params={
                    'client_id': CLIENT_ID,
                    'client_secret': CLIENT_SECRET,
                    'v': VERSION,
                    'll': '{},{}'.format(lat, lng),
                    'radius': radius,
                    'limit': LIMIT,
                },
                timeout=30,
            )
            items = response.json()["response"]['groups'][0]['items']

            # keep only the relevant information for each nearby venue; a venue
            # without any category is skipped because it has no category name
            location_rows = [
                (name,
                 borough,
                 lat,
                 lng,
                 item['venue']['name'],
                 item['venue']['location']['lat'],
                 item['venue']['location']['lng'],
                 item['venue']['categories'][0]['name'])
                for item in items
                if item['venue'].get('categories')
            ]
        except (KeyError, IndexError, ValueError, requests.RequestException):
            # unexpected payload, non-JSON body or network failure: flag and move on
            print('____' + name + '____', end='')
            continue

        rows.extend(location_rows)

    # Build the frame once, after the loop; passing `columns` also covers the
    # case where no venue was collected at all.
    return pd.DataFrame(rows, columns=columns)

In [447]:
# Collect the venues around every postcode in `data_fill_droped`
# (the default search radius of getNearbyVenues is used).
warsaw_venues = getNearbyVenues(
    data_fill_droped['Postcode'],
    data_fill_droped['Borough'],
    data_fill_droped['Latitude'],
    data_fill_droped['Longitude'],
)


01-355|01-362|01-483|01-464|01-480|01-473|01-391|01-494|01-318|01-497|01-491|01-493|01-481|01-466|01-305|01-308|01-307|01-366|01-487|01-461|01-354|01-384|01-319|01-472|01-360|01-482|01-490|01-361|01-341|01-385|01-470|01-376|01-496|01-339|01-347|01-478|01-315|01-476|01-471|01-364|01-485|01-460|01-459|01-488|02-422|01-368|01-345|01-373|01-317|01-313|01-462|01-321|01-365|01-356|01-452|01-934|01-314|01-479|01-474|01-343|01-342|01-329|01-492|01-469|01-352|01-390|01-330|01-357|01-328|01-338|01-359|01-309|01-475|01-351|01-498|01-327|01-378|01-302|01-303|01-304|01-377|01-337|01-336|01-301|01-728|01-381|03-381|01-386|01-388|01-340|01-310|01-312|01-353|01-350|01-383|01-367|01-348|01-382|01-349|01-320|01-389|01-486|01-346|01-468|01-418|01-495|03-144|03-173|03-134|03-176|03-130|03-193|03-289|03-149|03-013|03-106|03-153|03-236|03-009|03-276|03-126|03-253|03-251|03-156|03-044|03-279|03-026|03-007|03-257|03-034|03-102|03-090|03-046|03-256|03-075|03-082|03-135|03-029|03-127|03-136|03-061|03-051|03-037

04-377|04-391|04-396|03-928|04-145|03-801|02-801|04-171|04-156|04-204|03-805|03-812|04-046|04-112|04-042|04-167|04-152|04-154|04-019|04-017|04-324|04-334|04-072|04-359|03-835|04-371|04-389|03-371|04-370|04-372|04-191|04-161|04-149|04-048|04-285|04-085|04-109|04-206|03-934|04-082|04-065|04-105|03-949|04-090|04-294|04-373|04-397|04-390|04-369|04-361|04-386|04-376|04-379|04-387|04-050|04-049|04-284|03-921|03-974|03-970|04-192|04-326|03-802|04-002|04-210|04-316|03-918|04-123|04-113|04-133|04-288|04-088|04-110|04-319|04-208|04-341|04-344|04-280|04-307|04-202|03-948|04-215|04-069|04-080|04-041|04-010|04-013|04-038|04-040|04-027|04-024|04-025|03-939|03-922|03-806|03-808|03-828|04-136|04-004|04-224|04-035|04-203|04-205|04-343|04-336|04-342|03-924|03-915|03-911|04-362|03-821|03-931|04-322|04-337|03-933|03-927|04-131|04-281|04-128|04-039|04-211|04-173|04-302|04-312|04-325|04-332|04-351|04-118|04-026|04-193|04-175|04-174|04-163|04-102|03-825|04-219|03-954|03-935|03-945|04-164|04-180|04-124|04-501

03-320|03-392|03-878|03-665|03-683|03-318|03-322|03-345|03-357|03-542|03-395|03-286|03-617|03-886|03-667|03-625|03-655|03-690|03-660|03-585|03-595|03-654|03-553|03-374|03-779|03-284|03-349|03-795|03-365|03-382|03-867|03-692|03-532|03-605|03-633|03-316|03-334|03-571|03-503|03-888|03-388|03-369|03-202|03-321|03-502|03-501|03-368|03-613|03-638|03-630|03-686|03-309|03-510|03-511|03-672|03-879|03-691|03-574|03-548|03-549|03-560|03-576|03-575|03-671|03-611|03-674|03-643|03-694|03-512|03-639|03-343|03-362|03-616|03-521|03-623|03-628|03-698|03-786|03-794|03-246|03-533|03-340|03-592|03-588|03-656|03-636|03-317|03-331|03-209|03-622|03-241|03-528|03-621|03-558|03-649|03-252|03-790|03-632|03-243|03-291|03-505|03-526|03-531|03-530|03-509|03-551|03-547|03-679|03-563|03-254|03-545|03-603|03-586|03-594|03-648|03-877|03-335|03-538|03-507|03-634|03-645|03-590|03-587|03-693|03-206|03-337|03-584|03-593|03-787|03-380|03-624|03-294|03-695|03-567|03-631|03-796|03-360|02-495|02-497|02-484|02-236|02-496|02-786

01-048|01-009|01-136|01-142|01-044|00-810|01-040|01-259|01-132|01-145|01-150|01-155|01-159|01-168|01-212|01-181|01-178|00-826|01-162|01-137|00-811|00-839|00-869|01-221|00-853|00-831|00-818|01-241|01-457|00-851|00-865|01-240|01-239|01-018|01-235|01-126|01-201|01-187|01-134|01-196|01-229|01-141|00-846|00-870|00-874|01-167|00-821|00-819|01-219|01-017|00-802|00-806|00-832|00-817|00-836|00-841|00-852|00-866|00-871|00-894|00-875|00-879|01-004|01-014|01-191|01-198|01-179|01-156|01-149|01-524|01-515|01-554|01-503|01-748|01-578|01-552|01-514|01-553|01-624|01-528|01-615|01-626|01-636|01-603|01-513|01-555|01-569|____01-569____01-563|01-794|01-540|01-517|01-551|01-562|01-616|01-625|01-650|01-529|01-585|01-654|01-632|01-506|01-773|01-612|01-611|01-736|01-647|01-796|01-519|01-605|01-622|01-646|01-737|01-747|01-613|01-609|01-617|01-607|01-644|01-582|01-518|01-510|01-505|01-522|01-743|01-588|01-623|01-630|01-572|01-738|01-640|01-645|01-521|01-576|01-571|01-556|01-627|01-592|01-634|01-629|01-602|01-610

In [450]:
# Total number of venue rows collected.
len(warsaw_venues)

9605

In [451]:
# Preview the first venue rows.
warsaw_venues.head()

Unnamed: 0,Postcode,Borough,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,01-355,Bemowo,52.2374,20.9164,Kolorado,52.236842,20.917444,Playground
1,01-355,Bemowo,52.2374,20.9164,Травка возле общаги,52.238032,20.916197,Park
2,01-362,Bemowo,52.2317,20.9159,Motylem Jestem,52.232568,20.914433,Spa
3,01-483,Bemowo,52.2577,20.8922,Poligon Wat,52.25668,20.890971,Playground
4,01-391,Bemowo,52.2279,20.9179,Beerhub,52.228087,20.915867,Beer Bar


In [452]:
# Save the collected venues to CSV (the row index is written as the first column).
warsaw_venues.to_csv('warsaw_venues.csv')

In [466]:
# Count how many venue names are missing (True) versus present (False).
warsaw_venues['Venue'].isnull().value_counts()

False    9605
Name: Venue, dtype: int64

In [495]:
# Distribution of venue counts: for each number of venues, how many postcodes
# have exactly that many venues.
venues_per_postcode = warsaw_venues.groupby('Postcode')['Venue'].count().reset_index()
count = venues_per_postcode.groupby('Venue').count()
count


Unnamed: 0_level_0,Postcode
Venue,Unnamed: 1_level_1
1,776
2,512
3,301
4,292
5,103
6,63
7,55
8,42
9,35
10,40


In [471]:
# Count the distinct venue categories present in the collected venues.
category_count = len(warsaw_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 348 uniques categories.


In [474]:
# Number of distinct postcodes that appear in the venue table. The message
# previously said "categories" although the Postcode column is being counted.
print('There are {} unique postcodes.'.format(len(warsaw_venues['Postcode'].unique())))

There are 2434 uniques categories.


## Clustering the points