# Importing packages

In [None]:
import numpy as np
import pandas as pd

# Import clustered tourist file

In [None]:
# Import the dataset
Tourists = pd.read_csv('CSV/photo_dataset_clustered.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
# Import the other files
Flickr = pd.read_csv('CSV/Flickr_Tourists.csv', sep= ',', low_memory = False, lineterminator='\n')
TwitterPhoto = pd.read_csv('CSV/tourists_attachments.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
Flickr.head()

In [None]:
Flickr['post_date'] = pd.to_datetime(Flickr['postedTime'])

In [None]:
FlickrDateInfo = Flickr[['photoID', 'post_date', 'userID']]

In [None]:
TwitterPhoto['post_date'] = pd.to_datetime(TwitterPhoto['tweet_date'])

In [None]:
TwitterDateInfo = TwitterPhoto[['item_number','post_date','user_id']]

In [None]:
# Get an overview of the clusters.
Tourists['cluster'].value_counts()

In [None]:
Tourists.dtypes

In [None]:
Tourists = pd.merge(Tourists, TwitterDateInfo, on='item_number', how='left')

In [None]:
# Attach the Flickr date/user columns; rows without a matching photoID keep NaN (left join).
# NOTE(review): FlickrDateInfo and TwitterDateInfo both carry a 'post_date' column, so this
# second merge presumably produces the suffixed columns post_date_x (Twitter) and
# post_date_y (Flickr) that are used further down — confirm with Tourists.dtypes.
Tourists = pd.merge(Tourists, FlickrDateInfo, on='photoID', how='left')

In [None]:
Tourists.shape

In [None]:
Tourists

In [None]:
# Create a dataset with only clustered tweets.
Tourists_InCluster = Tourists[Tourists['cluster'] != 'None']

In [None]:
# Drop the airport cluster. The explicit copy makes Tourists_InCluster an
# independent frame, so the column assignments performed on it in later cells
# do not act on a slice of Tourists (which triggers SettingWithCopyWarning).
Tourists_InCluster = Tourists_InCluster[Tourists_InCluster['cluster'] != 'Rotterdam The Hague Airport'].copy()

In [None]:
Tourists_InCluster['user_id'].nunique() + Tourists_InCluster['userID'].nunique()

In [None]:
# Blaak and City Center are treated as one combined point of interest.
Tourists_InCluster.loc[
    Tourists_InCluster['cluster'].isin(['Blaak', 'City Center']), 'cluster'
] = 'City Center/Blaak'


In [None]:
# Get an overview of the clusters.
Tourists_InCluster['cluster'].value_counts()

In [None]:
Tourists_InCluster.tail()

In [None]:
# Build a single user key per row by concatenating the string forms of
# user_id (Twitter) and userID (Flickr).
# NOTE(review): astype(str) renders a missing value as the text "nan", so a row
# that carries only one of the two IDs presumably gets a key containing "nan" —
# confirm this is acceptable for grouping and for the exported user_id values.
Tourists_InCluster["user"] = Tourists_InCluster["user_id"].astype(str) + Tourists_InCluster["userID"].astype(str)


In [None]:
Tourists_InCluster['post_date'] = Tourists_InCluster["post_date_x"].astype(str) + Tourists_InCluster["post_date_y"].astype(str)

In [None]:
Tourists_InCluster.loc[69441]['post_date'][:26]

In [None]:
def removeNaT(x):
    """Strip the "NaT" placeholder from a concatenated pair of date strings.

    ``x`` is expected to be two stringified timestamps glued together, where a
    missing timestamp appears as the text "NaT".

    Parameters
    ----------
    x : str
        The concatenated date string.

    Returns
    -------
    str
        ``x`` without its leading "NaT" when it starts with one; otherwise the
        part of ``x`` before the first "NaT"; ``x`` unchanged when it contains
        no "NaT" at all.
    """
    marker = "NaT"

    # Leading placeholder: the real date is whatever follows it.
    if x.startswith(marker):
        return x[len(marker):]

    position = x.find(marker)

    # No placeholder present: keep the string intact. (Slicing with the -1
    # returned by find() would silently drop the last character.)
    if position == -1:
        return x

    # Trailing placeholder: the real date is whatever precedes it.
    return x[:position]

In [None]:
# Clean every concatenated date string by stripping its "NaT" placeholder.
Tourists_InCluster['tweet_date'] = Tourists_InCluster['post_date'].map(removeNaT)

In [None]:
Tourists_InCluster['tweet_date'] = pd.to_datetime(Tourists_InCluster['tweet_date'])

#### Next, I want to generate a path for each individual user

This path will partly correspond to the one De Choudhury et al. used. However, I will not be looking at time between tweets for now. Time will only be used to construct the path in the right order.

In [None]:
# Create a set of unique users.
UniqueUsers = Tourists_InCluster['user'].unique()

In [None]:
# Create an empty trajectory dictionary
Trajectories = {}

In [None]:
# For each user, order their posts by time and store the resulting sequence of
# clusters in the trajectory dictionary.
# A single groupby pass replaces re-filtering the whole frame once per user;
# sort=False keeps the users in order of first appearance, like unique() does.
for user, userPosts in Tourists_InCluster.groupby('user', sort=False):

    # Sort this user's posts chronologically.
    orderedPosts = userPosts.sort_values(by='tweet_date')

    # Store the clusters of the ordered posts as an array under the user key.
    Trajectories[user] = orderedPosts['cluster'].values

In [None]:
# Trajectories in string format
TrajectoryString = {}

In [None]:
# Render every trajectory as "A -> B -> C".
# str.join replaces the repeated string concatenation, and str() is applied to
# every element (including the first), so a non-string label cannot raise and
# an empty first label is not mistaken for "nothing written yet".
for key, value in Trajectories.items():
    TrajectoryString[key] = ' -> '.join(str(cluster) for cluster in value)
        

In [None]:
# Array-format trajectories: one row per user with the user key and the array
# of visited clusters.
Trajectory = pd.DataFrame(list(Trajectories.items())).rename(
    columns={0: "user_id", 1: "trajectory"}
)

In [None]:
# String-format trajectories: one row per user with the user key and the
# rendered path.
TrajectoryStr = pd.DataFrame(list(TrajectoryString.items())).rename(
    columns={0: "user_id", 1: "trajectory"}
)

In [None]:
TrajectoryStr.tail()

#### Next, I will explore the data to see how many people actually moved in the city

In [None]:
#AllTrajectories = Trajectory['trajectory'].values

In [None]:
#numberOfTrajectories = len(Trajectory)
#numberOfMovingTrajectories = 0

In [None]:
# Get the number of trajectories that actually move between POIs.
#for i in range(len(Trajectory)):
    
    #for j in range(len(Trajectory)):
        
        #if len(set(Trajectory)) != 1 and len(Trajectory) != 1:
            #numberOfMovingTrajectories += 1

In [None]:
#print('Number of trajectories total: {}'.format(numberOfTrajectories))
#print('Number of moving trajectories: {}'.format(numberOfMovingTrajectories))
#print('Percentage of trajectories that actually moves: {}'.format(str(numberOfMovingTrajectories/numberOfTrajectories)))

#### Create pairs in a dictionary

In [None]:
# A dictionary that will hold all the pairs
POIPairs = {}

In [None]:
POIPairs

In [None]:
# A function to filter the dataframe based on number of tweets
def filterOnNumberOfTweets(x):
    """Return True when ``x`` holds more than one element, otherwise False."""
    return len(x) > 1

In [None]:
# Flag the trajectories that contain more than one post.
Trajectory['isCandidate'] = Trajectory['trajectory'].map(filterOnNumberOfTweets)

In [None]:
POI = Trajectory[Trajectory['isCandidate']]

In [None]:
POI.shape

In [None]:
# A function that creates pairs of POIs.
def createPOIPairs(dataframeWithPOI, pairStore=None, verbose=True):
    """Collect the consecutive POI transitions of every trajectory.

    For each row, every pair of neighbouring entries in ``row['trajectory']``
    is treated as a transition (origin, destination). A transition is recorded
    at most once per row, so a move repeated within one trajectory counts once.

    Parameters
    ----------
    dataframeWithPOI : pandas.DataFrame
        Frame with a 'trajectory' column holding a sequence of hashable POI
        labels per row.
    pairStore : dict, optional
        Mapping ``origin -> list of destinations`` that is filled in place.
        When omitted, the module-level ``POIPairs`` dictionary is used.
    verbose : bool, optional
        When True (the default), print every candidate pair as it is visited.

    Returns
    -------
    None
        The result is written into ``pairStore``.
    """
    if pairStore is None:
        pairStore = POIPairs

    # Loop over the given dataframe.
    for _, row in dataframeWithPOI.iterrows():

        trajectory = row['trajectory']

        # Transitions already recorded for this row.
        seenPairs = set()

        # Walk over each entry together with its successor.
        for origin, destination in zip(trajectory, trajectory[1:]):

            if verbose:
                print([origin, destination])

            # Only unique pairs are wanted within one trajectory.
            if (origin, destination) in seenPairs:
                continue
            seenPairs.add((origin, destination))

            # Append to the origin's destination list, creating it on first use.
            pairStore.setdefault(origin, []).append(destination)
    
    

In [None]:
createPOIPairs(POI)

In [None]:
POIPairs

#### Building a markov prediction model

In [None]:
# Pick a random start point.
startPoint = np.random.choice(list(POIPairs.keys()))

In [None]:
# Begin the markov chain with the start point.
MarkovChain = [startPoint]

In [None]:
numberOfVisitedLocations = 6

In [None]:
# Extend the chain one step at a time by sampling a successor of the current
# location. A location that was only ever observed as a destination has no
# entry in POIPairs; stop the chain there instead of raising a KeyError.
for _ in range(numberOfVisitedLocations):
    successors = POIPairs.get(MarkovChain[-1])
    if not successors:
        break
    MarkovChain.append(np.random.choice(successors))

In [None]:
print(MarkovChain)

#### Generating a transition matrix

In [None]:
# Empty transition matrix
TransitionMatrix = []

In [None]:
# Fill the transition matrix row by row.
# Rows are generated in the same fixed POI order as the columns, so row k and
# column k always refer to the same POI, whatever the insertion order of
# POIPairs happens to be.
# NOTE: an origin in POIPairs that is not listed in poiOrder gets no row.
poiOrder = ['City Center/Blaak', 'Rotterdam Ahoy', 'Blijdorp Zoo',
            'Kop van Zuid', 'Euromast', 'Feyenoord Stadium']

for key in poiOrder:

    # Destinations observed after this POI (empty when it was never an origin).
    destinations = POIPairs.get(key, [])

    # Count the total number of observations.
    totalObs = len(destinations)

    # Share of each destination among the observations. An origin without
    # observations gets an all-zero row instead of a division by zero.
    TransitionMatrix.append(
        [destinations.count(poi) / totalObs if totalObs else 0.0 for poi in poiOrder]
    )
    print(key)

In [None]:
# Create a dataframe for better visualization.
TransitionMatrixDF = pd.DataFrame(TransitionMatrix)

In [None]:
TransitionMatrixDF

In [None]:
# Rename the positional columns to the POI they represent. The order matches
# the order in which the percentages are appended to each row.
# ("Rotterdam Ahoy" previously carried a stray leading space, so the column
# label did not match the cluster name used everywhere else.)
TransitionMatrixDF.rename(columns={0: "City Center/Blaak", 1: "Rotterdam Ahoy", 2: "Blijdorp Zoo",
                                   3: "Kop van Zuid", 4: "Euromast", 5: "Feyenoord Stadium"},
                          inplace=True)

In [None]:
# Rename the positional row index to POI names.
# NOTE(review): this relabels rows purely by position, so it is only correct if
# the rows were appended to TransitionMatrix in exactly this POI order —
# confirm against the loop that fills the matrix.
TransitionMatrixDF.rename(index = {0: "City Center/Blaak", 1: "Rotterdam Ahoy", 2: "Blijdorp Zoo", 3: "Kop van Zuid",
                                    4: "Euromast", 5: "Feyenoord Stadium"}, inplace=True)

In [None]:
TransitionMatrixDF