# Importing required packages

In [None]:
# Importing packages
import numpy as np
import pandas as pd
import urllib.request
import time
import random
import string
import urllib.parse
import hashlib
import hmac
import base64
import requests
import webbrowser
import sys
from requests.auth import HTTPBasicAuth
from datetime import datetime
from datetime import timedelta
import json

# Importing the dataset and making some small changes

In [None]:
# Load the tourist dataset (including themes) from CSV.
Tourists = pd.read_csv(
    'tourists_total_themes.csv',
    sep=',',
    lineterminator='\n',
    low_memory=False,
)

In [None]:
# Making sure the dataset corresponds with the one used in R.

# Coordinate pairs to drop, each written as str(geo_lon) + str(geo_lat) with
# no separator. The original note reads "drop everything that has more than
# 200" - presumably locations with more than 200 tweets; TODO confirm.
ExcludedLocations = [
    "4.551.9167",
    "4.475251.9235",
    "4.4694936351.92508148",
    "4.4784751.92286",
]

# Build the same concatenated key for every row as a stand-alone Series, so
# no helper column has to be added to (and later dropped from) the frame.
locationKey = Tourists['geo_lon'].map(str) + Tourists['geo_lat'].map(str)

# Keep rows with a longitude above 4 whose location is not excluded. The
# explicit .copy() makes the result an independent frame, so columns added to
# it in later cells do not trigger pandas' SettingWithCopyWarning.
Tourists = Tourists[(Tourists['geo_lon'] > 4) & ~locationKey.isin(ExcludedLocations)].copy()

In [None]:
# Preview the first five rows of the filtered dataset.
Tourists.head(n=5)

# Mapping the tweets to the identified clusters in the heatmap

I identified 6 clusters in R, which are the following:
1: Rotterdam airport
2: Rotterdam city center
3: Rotterdam Blaak
4: Erasmus Bridge / Katendrecht
5: Ahoy
6: Feyenoord stadium

The code below additionally defines regions for Blijdorp Zoo and Euromast, giving eight regions in total.

Using the shiny, leaflet and leaflet.extras packages in R, I was able to obtain the coordinates of each of the clusters.


In [None]:
# Each cluster is a list of four [lon, lat] corner points.
# For the axis-aligned rectangles the corners are listed as
# [lowerLeft, upperLeft, upperRight, lowerRight].
# KopVanZuid is not axis-aligned; its corners are listed starting at the
# westernmost point: [west, north, east, south].
Airport = [
    [4.430966, 51.94635],
    [4.430966, 51.95143],
    [4.438562, 51.95143],
    [4.438562, 51.94635]
]

Blijdorp = [
    [4.43676, 51.9222],
    [4.43676, 51.93204],
    [4.457188, 51.93204],
    [4.457188, 51.9222]
]

CityCenter = [
    [4.464054, 51.91664],
    [4.464054, 51.92855],
    [4.482336, 51.92855],
    [4.482336, 51.91664]
]

Blaak = [
    [4.482937, 51.91579],
    [4.482937, 51.92474],
    [4.495983, 51.92474],
    [4.495983, 51.91579]
]

Euromast = [
    [4.462423, 51.90192],
    [4.462423, 51.90949],
    [4.470921, 51.90949],
    [4.470921, 51.90192]
]

# Rotated (non axis-aligned) quadrilateral: [west, north, east, south].
KopVanZuid = [
    [4.468431, 51.90033],
    [4.488688, 51.91219],
    [4.498129, 51.90658],
    [4.474182, 51.8936]
]

Ahoy = [
    [4.481564, 51.87898],
    [4.481564, 51.88693],
    [4.494824, 51.88693],
    [4.494824, 51.87898]
]

Feyenoord = [
    [4.518728, 51.89021],
    [4.518728, 51.89702],
    [4.529586, 51.89702],
    [4.529586, 51.89021]
]

#### Now, each tweet can be assigned to a cluster based on the cluster coordinates defined above. 

In [None]:
# Bounding box of every axis-aligned cluster, stored as
# [minLon, maxLon, minLat, maxLat].
AirportMinMax = [
    4.430966, 4.438562,   # minLon, maxLon
    51.94635, 51.95143,   # minLat, maxLat
]
BlijdorpMinMax = [
    4.43676, 4.457188,
    51.9222, 51.93204,
]
CityMinMax = [
    4.464054, 4.482336,
    51.91664, 51.92855,
]
BlaakMinMax = [
    4.482937, 4.495983,
    51.91579, 51.92474,
]
EuromastMinMax = [
    4.462423, 4.470921,
    51.90192, 51.90949,
]
AhoyMinMax = [
    4.481564, 4.494824,
    51.87898, 51.88693,
]
FeyenoordMinMax = [
    4.518728, 4.529586,
    51.89021, 51.89702,
]

#### Kop van Zuid requires a different approach

In [None]:
# Construct the two straight lines (lat = slope * lon + intercept) that bound
# the Kop van Zuid cluster from above and from below.
def _lineThrough(start, end):
    """Return (slope, intercept) of the line through two [lon, lat] points."""
    slope = (end[1] - start[1]) / (end[0] - start[0])
    intercept = start[1] - (slope * start[0])
    return slope, intercept


# Upper line: from corner 0 to corner 1 of the polygon.
slopeUpperLine, interceptUpperline = _lineThrough(KopVanZuid[0], KopVanZuid[1])
# Lower line: from corner 3 to corner 2 of the polygon.
slopeLowerLine, interceptLowerLine = _lineThrough(KopVanZuid[3], KopVanZuid[2])

#### Define two functions: one for the upper line and one for the lower line of the Kop van Zuid (Erasmus Bridge) cluster

In [None]:
# Latitude of the upper boundary line at a given longitude
def getUpperLine(lon):
    """Return the latitude of the upper line at longitude `lon`.

    Evaluates lat = lon * slopeUpperLine + interceptUpperline using the
    module-level slope and intercept.
    """
    return lon * slopeUpperLine + interceptUpperline

In [None]:
# Latitude of the lower boundary line at a given longitude
def getLowerLine(lon):
    """Return the latitude of the lower line at longitude `lon`.

    Evaluates lat = lon * slopeLowerLine + interceptLowerLine using the
    module-level slope and intercept.
    """
    return lon * slopeLowerLine + interceptLowerLine

#### A function to assign each tweet to a cluster based on the cluster boundaries defined above

In [None]:
def assignCluster(lon, lat):
    """Return the name of the cluster that contains the point (lon, lat).

    The rectangular clusters are tested against their [minLon, maxLon,
    minLat, maxLat] bounding boxes (boundaries inclusive). Kop van Zuid is
    tested against a longitude range and the two boundary lines.

    Every region is checked in a fixed order and a later match overrides an
    earlier one. The string 'None' is returned when no region matches.
    """

    def insideBox(bounds):
        # Inclusive containment test for an axis-aligned bounding box.
        minLon, maxLon, minLat, maxLat = bounds
        return (minLon <= lon <= maxLon) and (minLat <= lat <= maxLat)

    def insideKopVanZuid():
        # Only longitudes between corner 0 and corner 1 of the polygon are
        # accepted.
        # NOTE(review): corner 2 of KopVanZuid lies further east than corner 1,
        # so the easternmost part of the polygon is never matched - confirm
        # that this cut-off is intended.
        if not (KopVanZuid[0][0] <= lon <= KopVanZuid[1][0]):
            return False
        # The point has to lie on or between the lower and the upper line.
        return getLowerLine(lon) <= lat <= getUpperLine(lon)

    # (label, membership test) pairs in the order in which they are applied.
    regionTests = (
        ('Rotterdam The Hague Airport', lambda: insideBox(AirportMinMax)),
        ('Blijdorp Zoo', lambda: insideBox(BlijdorpMinMax)),
        ('City Center', lambda: insideBox(CityMinMax)),
        ('Blaak', lambda: insideBox(BlaakMinMax)),
        ('Euromast', lambda: insideBox(EuromastMinMax)),
        ('Kop van Zuid', insideKopVanZuid),
        ('Rotterdam Ahoy', lambda: insideBox(AhoyMinMax)),
        ('Feyenoord Stadium', lambda: insideBox(FeyenoordMinMax)),
    )

    cluster = 'None'
    for label, contains in regionTests:
        # No early exit: the last matching region wins.
        if contains():
            cluster = label

    return cluster

In [None]:
# Label every tweet with its cluster by applying assignCluster row by row.
Tourists['cluster'] = Tourists.apply(
    lambda tweet: assignCluster(tweet['geo_lon'], tweet['geo_lat']),
    axis=1,
)

In [None]:
# Number of tweets assigned to each cluster label.
Tourists.loc[:, 'cluster'].value_counts()

In [None]:
# Save the clustered tweets as CSV for later usage (the row index is not written).
Tourists.to_csv(path_or_buf='CSV/tourists_clustered.csv', index=False)

In [None]:
# Build the combined photo dataset from the Twitter attachments and the
# Flickr photos.
Tourists_Attach = pd.read_csv(
    'CSV/tourists_attachments.csv',
    sep=',',
    lineterminator='\n',
    low_memory=False,
)

# The Flickr file names its coordinates 'longitude'/'latitude'; rename them so
# that both sources share the 'geo_lon'/'geo_lat' column names.
TouristsFlickr = pd.read_csv(
    'CSV/Flickr_Tourists.csv',
    sep=',',
    lineterminator='\n',
    low_memory=False,
).rename(columns={'longitude': 'geo_lon', 'latitude': 'geo_lat'})

# Keep only the identifying columns and the coordinates of each source.
Photoset_Twitter = Tourists_Attach[['item_number', 'geo_lon', 'geo_lat']]
Photoset_Flickr = TouristsFlickr[['photoID', 'photoSecret', 'geo_lon', 'geo_lat']]

# Stack the Flickr rows on top of the Twitter rows. Each source keeps its own
# row labels, so a label can occur once per source in the combined frame.
frames = [Photoset_Flickr, Photoset_Twitter]
Photoset = pd.concat(frames)

In [None]:
# Label every photo with its cluster by applying assignCluster row by row.
Photoset['cluster'] = Photoset.apply(
    lambda photo: assignCluster(photo['geo_lon'], photo['geo_lat']),
    axis=1,
)

In [None]:
# Number of photos assigned to each cluster label.
Photoset.loc[:, 'cluster'].value_counts()

In [None]:
# Save the clustered photo dataset as CSV for later usage (the row index is not written).
Photoset.to_csv(path_or_buf='CSV/photo_dataset_clustered.csv', index=False)

# Descriptive statistics on the themes

In [None]:
# Split the clustered tweets into one dataframe per cluster label.
Tourists_Airport = Tourists.loc[Tourists['cluster'] == 'Rotterdam The Hague Airport']
Tourists_Blijdorp = Tourists.loc[Tourists['cluster'] == 'Blijdorp Zoo']
Tourists_City = Tourists.loc[Tourists['cluster'] == 'City Center']
Tourists_Blaak = Tourists.loc[Tourists['cluster'] == 'Blaak']
Tourists_Euromast = Tourists.loc[Tourists['cluster'] == 'Euromast']
Tourists_KvZ = Tourists.loc[Tourists['cluster'] == 'Kop van Zuid']
Tourists_Ahoy = Tourists.loc[Tourists['cluster'] == 'Rotterdam Ahoy']
Tourists_Feyenoord = Tourists.loc[Tourists['cluster'] == 'Feyenoord Stadium']

In [None]:
# Total number of theme entries in the Blijdorp cluster, i.e. the sum of the
# per-theme counts (value_counts leaves out missing themes by default).
total = sum(Tourists_Blijdorp['theme'].value_counts())
total

In [None]:
# Number of Blijdorp tweets per theme.
Tourists_Blijdorp.loc[:, 'theme'].value_counts()

# Randomly selecting photos per cluster

In [None]:
# Split the clustered photos into one dataframe per cluster label.
Photo_Airport = Photoset.loc[Photoset['cluster'] == 'Rotterdam The Hague Airport']
Photo_Blijdorp = Photoset.loc[Photoset['cluster'] == 'Blijdorp Zoo']
Photo_City = Photoset.loc[Photoset['cluster'] == 'City Center']
Photo_Blaak = Photoset.loc[Photoset['cluster'] == 'Blaak']
Photo_Euromast = Photoset.loc[Photoset['cluster'] == 'Euromast']
Photo_KvZ = Photoset.loc[Photoset['cluster'] == 'Kop van Zuid']
Photo_Ahoy = Photoset.loc[Photoset['cluster'] == 'Rotterdam Ahoy']
Photo_Feyenoord = Photoset.loc[Photoset['cluster'] == 'Feyenoord Stadium']

In [None]:
# Draw five random Feyenoord photos that have a photoID (rows without one are
# left out). The fixed random_state makes the draw reproducible across kernel
# restarts; change the seed to obtain a different selection.
Photo_Feyenoord[Photo_Feyenoord['photoID'].notna()].sample(n=5, random_state=42)

In [None]:
# Load the dataset that links Twitter items to their photo URLs.
PhotoID_Twitter = pd.read_csv(
    'CSV/TwitterPhotoIDs.csv',
    sep=',',
    lineterminator='\n',
    low_memory=False,
)

In [None]:
# Look up the media URL(s) stored for Twitter item 580943.
PhotoID_Twitter.loc[PhotoID_Twitter['item_number'] == 580943.0, 'media_url']

In [None]:
# Print the Flickr photoID and secret of the Feyenoord photo with row label
# 13355 (per the original note, these are used to retrieve the photo through
# the Flickr API).
selectedPhoto = Photo_Feyenoord.loc[13355]
print('photoID: {}'.format(selectedPhoto['photoID']))
print('photo secret: {}'.format(selectedPhoto['photoSecret']))