# Importing packages

In [None]:
import numpy as np
import pandas as pd

# Import clustered tourist file

In [None]:
# Load the clustered tourist tweets from disk.
Tourists = pd.read_csv(
    'tourists_clustered.csv',
    sep=',',
    low_memory=False,
    lineterminator='\n',
)

In [None]:
# Get an overview of the clusters: count how many rows fall into each cluster.
Tourists['cluster'].value_counts()

In [None]:
# Parse tweet_date into datetimes so tweets can later be sorted chronologically.
Tourists['tweet_date'] = pd.to_datetime(Tourists['tweet_date'])

In [None]:
# Create a dataset with only clustered tweets.
# Unclustered rows are dropped whether they carry the literal string 'None' or a
# missing value: depending on the pandas version, read_csv may parse the text
# "None" as NaN, and NaN would otherwise pass the `!= 'None'` test.
Tourists_InCluster = Tourists[
    Tourists['cluster'].notna() & (Tourists['cluster'] != 'None')
]

In [None]:
# Remove the tweets that belong to the Blijdorp Zoo and Euromast clusters.
Tourists_InCluster = Tourists_InCluster[
    ~Tourists_InCluster['cluster'].isin(['Blijdorp Zoo', 'Euromast'])
]

In [None]:
# Count the distinct users that remain after filtering.
Tourists_InCluster['user_id'].nunique()

In [None]:
# Get an overview of the clusters that remain after filtering (rows per cluster).
Tourists_InCluster['cluster'].value_counts()

#### Next, I want to generate a path for each individual user

This path partly follows the approach used by De Choudhury et al. For now, however, the time elapsed between tweets is ignored: timestamps are used only to put each user's tweets in chronological order.

In [None]:
# Collect the distinct user ids (an array, not a Python set).
UniqueUsers = Tourists_InCluster['user_id'].unique()

In [None]:
# Create an empty trajectory dictionary.
# It will map each user_id to the array of clusters the user tweeted from, in time order.
Trajectories = {}

In [None]:
# For each user, create a timed path, get the cluster of each tweet and add it to the dictionary.
# groupby splits the frame once instead of re-scanning every row for every user.
# sort=False keeps the users in order of first appearance; rows with a missing
# user_id are skipped because groupby drops NaN keys by default.
for i, Tweets in Tourists_InCluster.groupby('user_id', sort=False):

    # Sort the user's tweets by time. mergesort is stable, so tweets sharing a
    # timestamp keep their original row order and the path is reproducible.
    Tweets = Tweets.sort_values(by='tweet_date', kind='mergesort')

    # Get the clusters of the ordered tweets as an array.
    ClustersOfTweets = Tweets['cluster'].values

    # Add the user and the timed path to the dictionary.
    Trajectories[i] = ClustersOfTweets

In [None]:
# Trajectories in string format.
# It will map each user_id to the user's path rendered as 'A -> B -> C'.
TrajectoryString = {}

In [None]:
# Render every user's trajectory as a single ' -> '-separated string.
for key, value in Trajectories.items():

    # str() is applied to every element, including the first, so a non-string
    # cluster value cannot raise a TypeError; join replaces the manual concatenation.
    TrajectoryString[key] = ' -> '.join(map(str, value))
        

In [None]:
# Trajectory in array format: one row per user with the user's id and cluster sequence.
Trajectory = (
    pd.DataFrame(list(Trajectories.items()))
    .rename(columns={0: "user_id", 1: "trajectory"})
)

In [None]:
# Trajectory in string format: one row per user with the user's id and path string.
TrajectoryStr = (
    pd.DataFrame(list(TrajectoryString.items()))
    .rename(columns={0: "user_id", 1: "trajectory"})
)

In [None]:
# Preview the first rows of the string-format trajectories.
TrajectoryStr.head()

#### Create pairs in a dictionary

In [None]:
# A dictionary that will hold all the pairs.
# It maps a POI to the list of POIs that were visited directly after it.
POIPairs = {}

In [None]:
# Display the current contents of the pair dictionary.
POIPairs

In [None]:
# Predicate used to keep only trajectories that consist of more than one tweet.
def filterOnNumberOfTweets(x):
    """Return True when x holds more than one element, otherwise False."""
    return len(x) > 1

In [None]:
# Flag the trajectories that contain more than one tweet.
Trajectory['isCandidate'] = Trajectory['trajectory'].map(filterOnNumberOfTweets)

In [None]:
# Keep only the rows flagged as candidates.
POI = Trajectory.loc[Trajectory['isCandidate']]

In [None]:
# Check how many candidate trajectories remain (rows, columns).
POI.shape

In [None]:
# A function that creates pairs of POIs.
def createPOIPairs(dataframeWithPOI, poiPairs=None):
    """Record the consecutive POI transitions found in each trajectory.

    For every row, each pair of neighbouring entries (trajectory[i],
    trajectory[i + 1]) is treated as a transition. A transition is counted at
    most once per row, so a user who repeats the same step does not inflate
    its weight. The destination is appended to the list stored under the
    origin POI.

    Parameters
    ----------
    dataframeWithPOI : DataFrame with a 'trajectory' column whose values are
        sequences of POI labels.
    poiPairs : dict, optional
        Dictionary to fill (origin POI -> list of destination POIs). When
        omitted, the module-level POIPairs dictionary is filled, as before.

    Returns
    -------
    None. The dictionary is updated in place.
    """
    if poiPairs is None:
        # Backward-compatible default: write into the notebook-level dictionary.
        poiPairs = POIPairs

    # Only the trajectory column is needed, so iterate over it directly.
    for trajectory in dataframeWithPOI['trajectory']:

        # Transitions already recorded for this row (unique pairs per user).
        seenPairs = set()

        # Walk over neighbouring entries: (t[0], t[1]), (t[1], t[2]), ...
        for origin, destination in zip(trajectory[:-1], trajectory[1:]):

            pair = (origin, destination)

            # If the pair already exists for this row, skip it.
            if pair in seenPairs:
                continue
            seenPairs.add(pair)

            # Append to the origin's list, creating the list on first use.
            poiPairs.setdefault(origin, []).append(destination)
    
    

In [None]:
# Fill the pair dictionary from the candidate trajectories.
createPOIPairs(POI)

#### Building a Markov prediction model

In [None]:
# Draw the start point at random from the POIs that have at least one recorded successor.
startPoint = np.random.choice(list(POIPairs))

In [None]:
# Begin the Markov chain with the start point.
MarkovChain = [startPoint]

In [None]:
# Number of steps to generate after the start point.
numberOfVisitedLocations = 6

In [None]:
# Extend the chain step by step: the next location is drawn from the POIs that
# were observed directly after the current one (duplicates act as weights).
for _ in range(numberOfVisitedLocations):

    # A POI that only ever occurs as the end of a trajectory is not a key of
    # POIPairs; stop there instead of raising a KeyError.
    nextCandidates = POIPairs.get(MarkovChain[-1])
    if not nextCandidates:
        break

    MarkovChain.append(np.random.choice(nextCandidates))

In [None]:
# Show the generated sequence of locations.
print(MarkovChain)

#### Generating a transition matrix

In [None]:
# Empty transition matrix; one row of transition shares is appended per origin POI.
TransitionMatrix = []

In [None]:
# Iterate over the keys to fill the transition matrix: one row per origin POI.
# destinationOrder fixes the column order of every row in one place; it must
# match the labels that are assigned to the columns of TransitionMatrixDF.
destinationOrder = ['City Center', 'Rotterdam Ahoy', 'Feyenoord Stadium',
                    'Kop van Zuid', 'Blaak', 'Rotterdam The Hague Airport']

for key in POIPairs:

    # All destinations observed directly after this POI.
    destinations = POIPairs[key]

    # Count the total number of observations.
    totalObs = len(destinations)

    # Share of the observations that lead to each destination, in column order.
    TransitionMatrix.append(
        [destinations.count(poi) / totalObs for poi in destinationOrder]
    )

    # Print the key so the row order of the matrix is visible.
    print(key)

In [None]:
# Create a dataframe for better visualization (rows and columns are still numbered).
TransitionMatrixDF = pd.DataFrame(TransitionMatrix)

In [None]:
# Inspect the matrix.
TransitionMatrixDF

In [None]:
# Rename columns to match the order in which the values of each row were appended.
# The "Rotterdam Ahoy" label no longer carries a leading space, so it matches
# the cluster name used everywhere else.
TransitionMatrixDF.rename(
    columns={
        0: "City Center",
        1: "Rotterdam Ahoy",
        2: "Feyenoord Stadium",
        3: "Kop van Zuid",
        4: "Blaak",
        5: "Rotterdam The Hague Airport",
    },
    inplace=True,
)

In [None]:
# Label each row with the POI it describes. The rows were appended while
# iterating over POIPairs, so row position n belongs to the n-th key of
# POIPairs; deriving the labels from the keys avoids relying on a hand-written
# order that silently mislabels rows when the key order differs.
TransitionMatrixDF.rename(index=dict(enumerate(POIPairs)), inplace=True)

In [None]:
# Inspect the matrix again after the renaming steps.
TransitionMatrixDF