# Importing packages

In [None]:
import numpy as np
import pandas as pd

# Importing tourist files

In [None]:
# Load the Twitter and the Flickr tourist posts from their CSV exports.
TouristsTwitter = pd.read_csv(
    'CSV/tourists_clustered.csv',
    sep=',',
    low_memory=False,
    lineterminator='\n',
)
TouristsFlickr = pd.read_csv(
    'CSV/Flickr_Tourists.csv',
    sep=',',
    low_memory=False,
    lineterminator='\n',
)

In [None]:
# Parse the timestamp columns so both frames carry a datetime 'post_date'.
# Twitter: convert 'tweet_date' in place, then expose it under the shared name.
TouristsTwitter['tweet_date'] = pd.to_datetime(TouristsTwitter['tweet_date'])
TouristsTwitter['post_date'] = TouristsTwitter['tweet_date']
# Flickr: the timestamp is stored in 'postedTime'.
TouristsFlickr['post_date'] = pd.to_datetime(TouristsFlickr['postedTime'])

In [None]:
# Corner points of the bounding box as [lon, lat] pairs.
boundingBox = [
    [4.441395, 51.89715],
    [4.441395, 51.92849],
    [4.503365, 51.92849],
    [4.503365, 51.89715],
]

# Extent of the same box as [lon min, lon max, lat min, lat max].
boundingBoxMinMax = [
    min(lon for lon, _ in boundingBox),
    max(lon for lon, _ in boundingBox),
    min(lat for _, lat in boundingBox),
    max(lat for _, lat in boundingBox),
]

In [None]:
# Preview the first rows of the Twitter data.
TouristsTwitter.head()

# Combining the datasets into one with only necessary information

In [None]:
# Keep only the columns needed downstream and rename the coordinate columns to
# geo_lat / geo_lon, the names the Twitter data uses, so both sources line up
# when they are combined.
# rename() is used in its returning form: renaming the column selection with
# inplace=True mutates a frame derived from TouristsFlickr, which pandas
# reports with a SettingWithCopyWarning.
FlickrDateInfo = TouristsFlickr[
    ['photoID', 'post_date', 'userID', 'latitude', 'longitude']
].rename(columns={'latitude': 'geo_lat', 'longitude': 'geo_lon'})

In [None]:
# Column selection for the Twitter posts: id, timestamp, user and coordinates.
TwitterDateInfo = TouristsTwitter.loc[
    :, ['item_number', 'post_date', 'user_id', 'geo_lat', 'geo_lon']
]

In [None]:
# Stack the Twitter rows on top of the Flickr rows. Columns that exist in only
# one source are filled with NaN for the rows of the other source, and each
# row keeps the index label it had in its source frame.
frames = [TwitterDateInfo, FlickrDateInfo]
Tourists = pd.concat(objs=frames, axis=0)

In [None]:
# One identifier column for both platforms.
# Inside Tourists each row has only one of user_id / userID filled; the other
# is NaN. Stringifying and concatenating the two columns therefore glues a
# literal "nan" onto every id, and a numeric user_id has already been upcast
# to float by the NaN padding. The ids are taken from the source frames
# instead, where they still have their original dtype.
# The order must match the order in which the frames were concatenated into
# Tourists (Twitter first, then Flickr); to_numpy() assigns by position.
# NOTE(review): assumes a Twitter id and a Flickr id never coincide - confirm.
Tourists["user"] = (
    pd.concat([TwitterDateInfo["user_id"], FlickrDateInfo["userID"]])
    .astype(str)
    .to_numpy()
)

In [None]:
# Remove the two per-platform id columns from Tourists in place.
Tourists.drop(['userID', 'user_id'], axis='columns', inplace=True)

In [None]:
# Preview the first rows of the combined dataset.
Tourists.head()

# Filtering the dataset on the bounding box

In [None]:
def assignToBox(lon, lat):
    """Label a coordinate as 'Center' when it lies inside the bounding box.

    The limits are read from the module-level boundingBoxMinMax
    ([lon min, lon max, lat min, lat max]); all edges are inclusive.
    Points outside the box get the string 'None' (not the None object).
    """
    lonMin, lonMax, latMin, latMax = boundingBoxMinMax[:4]

    if lonMin <= lon <= lonMax and latMin <= lat <= latMax:
        return 'Center'
    return 'None'

In [None]:
# Label every post 'Center' or 'None' depending on whether it lies inside the
# bounding box. The two coordinate columns are walked in lockstep, which avoids
# building a Series for every row as apply(axis=1) does and also yields a
# valid (empty) column when Tourists has no rows.
Tourists['cluster'] = [
    assignToBox(lon, lat)
    for lon, lat in zip(Tourists['geo_lon'], Tourists['geo_lat'])
]

In [None]:
# Keep only the posts inside the bounding box.
# The explicit copy makes TouristsCenter an independent frame: its 'cluster'
# column is overwritten later on, and assigning into a filtered selection of
# Tourists triggers pandas' SettingWithCopyWarning.
TouristsCenter = Tourists[Tourists['cluster'] == 'Center'].copy()

In [None]:
# Display the posts that fall inside the bounding box.
TouristsCenter

# Assigning posts to a certain area in the center (based on the bounding box)

In [None]:
def assignCluster(lon, lat, currentCluster, box, newCluster):
    """Return newCluster if (lon, lat) lies inside box, else currentCluster.

    box is [lon min, lon max, lat min, lat max]; all four edges are inclusive.
    """
    # Guard clause: anything outside the box keeps the label it already has.
    if not (box[0] <= lon <= box[1] and box[2] <= lat <= box[3]):
        return currentCluster
    return newCluster

In [None]:
# Grid resolution: how many columns and rows the bounding box is split into.
# Change these if you want more granularity.
horizontalMax = 20
verticalMax = 20

# Column and row counters, both starting at zero.
numberHorizontal = 0
numberVertical = 0

In [None]:
# Split the bounding box into verticalMax x horizontalMax cells and label every
# post in TouristsCenter with the cell it falls in, as 'row,column' (1-based,
# rows counted from the minimum latitude, columns from the minimum longitude).

# Width and height of a single cell.
horizontalInterval = (boundingBoxMinMax[1] - boundingBoxMinMax[0]) / horizontalMax
verticalInterval = (boundingBoxMinMax[3] - boundingBoxMinMax[2]) / verticalMax

# Cell edges. linspace pins the first and last edge to the exact box limits;
# repeatedly adding the interval accumulates rounding error, so the outermost
# edge could end up just short of the limit and leave posts on the border
# labelled 'Center'.
lonEdges = np.linspace(boundingBoxMinMax[0], boundingBoxMinMax[1], horizontalMax + 1)
latEdges = np.linspace(boundingBoxMinMax[2], boundingBoxMinMax[3], verticalMax + 1)

# for-loops (rather than while-loops on module-level counters) keep this cell
# re-runnable without resetting the counters first.
for numberVertical in range(1, verticalMax + 1):

    # Posts inside the current row; both edges are inclusive.
    inRow = TouristsCenter['geo_lat'].between(
        latEdges[numberVertical - 1], latEdges[numberVertical]
    ).to_numpy()

    for numberHorizontal in range(1, horizontalMax + 1):

        # Posts inside the current column; both edges are inclusive.
        inColumn = TouristsCenter['geo_lon'].between(
            lonEdges[numberHorizontal - 1], lonEdges[numberHorizontal]
        ).to_numpy()

        # Create new cluster name.
        newCluster = '{},{}'.format(numberVertical, numberHorizontal)

        # One vectorised assignment per cell instead of a row-wise apply over
        # the whole frame. Cells are visited in increasing row/column order, so
        # a post on a shared edge ends up in the cell visited last.
        TouristsCenter.loc[inRow & inColumn, 'cluster'] = newCluster

    # Progress output: the row that was just finished.
    print(numberVertical)

In [None]:
# Number of posts per cluster label.
TouristsCenter['cluster'].value_counts()

# Create raw trajectories

In [None]:
# Array of the distinct users, in order of first appearance.
UniqueUsers = pd.unique(TouristsCenter['user'])

In [None]:
# Start with an empty dictionary that will hold the trajectories.
Trajectories = dict()

In [None]:
# For each user, build the time-ordered sequence of clusters of their posts
# and store it in Trajectories under the user.
# groupby splits the frame once instead of re-filtering all rows for every
# user; sort=False keeps the users in order of first appearance, i.e. the
# order TouristsCenter['user'].unique() yields.
for user, Posts in TouristsCenter.groupby('user', sort=False):

    # Order the user's posts by time and keep only the cluster labels.
    Trajectories[user] = Posts.sort_values(by='post_date')['cluster'].values

In [None]:
# Tabulate the dictionary: one row per entry, the key in "user" and the
# stored cluster sequence in "trajectory".
Trajectory = (
    pd.DataFrame(list(Trajectories.items()))
    .rename(columns={0: "user", 1: "trajectory"})
)

In [None]:
# Preview the first trajectories.
Trajectory.head()

# Creating pairs

In [None]:
# Empty dictionary that will collect all the pairs.
AreaPairs = dict()

In [None]:
def filterOnNumberOfPosts(x):
    """Return True when x holds more than one element, otherwise False."""
    return len(x) > 1

In [None]:
# Flag the trajectories that contain more than one post.
Trajectory['isCandidate'] = Trajectory['trajectory'].map(filterOnNumberOfPosts)

In [None]:
# Keep only the rows flagged as candidates.
Areas = Trajectory.loc[Trajectory['isCandidate']]

In [None]:
# Get an idea of how many candidates there are: (number of rows, number of columns).
Areas.shape

In [None]:
def createAreaPairs(dataframeWithAreas, areaPairs=None):
    """Collect, per start area, the areas that were visited directly after it.

    For every row, consecutive elements of row['trajectory'] form
    (from, to) pairs. Each distinct pair is used only once per row; its
    destination is appended to the list stored under its start area.

    Args:
        dataframeWithAreas: DataFrame with a 'trajectory' column holding
            sequences of area labels.
        areaPairs: dictionary to fill in place. Defaults to the module-level
            AreaPairs, so existing one-argument calls behave as before.

    Returns:
        None. The result is written into the target dictionary.
    """
    target = AreaPairs if areaPairs is None else areaPairs

    # Loop over the given dataframe.
    for _, row in dataframeWithAreas.iterrows():

        # Look the trajectory up once per row instead of on every access.
        trajectory = row['trajectory']

        # Pairs already handled for this row; a set gives constant-time
        # membership tests where a list would be rescanned for every pair.
        seen = set()

        for pair in zip(trajectory, trajectory[1:]):

            # Only unique pairs are wanted for each row.
            if pair in seen:
                continue
            seen.add(pair)

            start, destination = pair
            target.setdefault(start, []).append(destination)

In [None]:
# Fill AreaPairs from the candidate trajectories.
# The function appends to the existing dictionary, so running this cell again
# without resetting AreaPairs first adds the same pairs a second time.
createAreaPairs(Areas)

In [None]:
# Print every start area, one per line, in sorted order.
for startArea in sorted(AreaPairs.keys()):
    print(startArea)

# Create a transition matrix

In [None]:
# The transition matrix starts out as an empty list of rows.
TransitionMatrix = list()

In [None]:
# Empty list that will record the order of the keys.
OrderOfKeys = list()

In [None]:
# Fill the transition matrix: one row per start area, holding for every start
# area (in dictionary order) the share of this row's recorded transitions that
# lead to it.
# NOTE(review): only keys of AreaPairs become columns, so an area that occurs
# solely as a destination is left out and a row may sum to less than 1 -
# confirm this is intended.
for key, destinations in AreaPairs.items():

    # Total number of observed transitions leaving this start area.
    totalObs = len(destinations)

    # [area, probability] for every start area.
    valuesArray = [
        [keyInner, destinations.count(keyInner) / totalObs]
        for keyInner in AreaPairs
    ]

    TransitionMatrix.append(valuesArray)

    # Remember which start area this row belongs to.
    OrderOfKeys.append(key)

In [None]:
# Empty list that will receive the thresholded rows of the transition matrix.
TransitionMatrixCleaned = list()

In [None]:
# Reduce every row of the transition matrix to the transitions whose
# probability exceeds the threshold.
# Every row and every entry is visited: looping over range(len(...) - 1)
# silently skips the last row and the last entry of each row.
for row in TransitionMatrix:

    transitions = [
        [area, probability]
        for area, probability in row
        if probability > 0.10  # Change this value to change threshold.
    ]

    # Placeholder that keeps the row order intact when nothing passes the
    # threshold. It is added only after the whole row has been checked, so it
    # never ends up next to real transitions.
    if not transitions:
        transitions.append(['Nothing', 'here'])

    TransitionMatrixCleaned.append(transitions)

In [None]:
# Create a table with one [start area, transitions] row per start point.
# zip pairs every key with its cleaned row and stops at the shorter of the two
# lists, so the last start area is no longer dropped by an off-by-one range
# and no index can run past the end of TransitionMatrixCleaned.
TransitionTable = [
    [key, transitions]
    for key, transitions in zip(OrderOfKeys, TransitionMatrixCleaned)
]
    
