In [3]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from pytz import timezone
from apyori import apriori


In [4]:
# Before running this script do:
# export GIO_EXTRA_MODULES=/usr/lib/x86_64-linux-gnu/gio/modules/
# in order to successfully visualize the plots.

#######################################################################################################################
# INPUT AND TRANSFORM THE DATA
#######################################################################################################################
# Read the data into a DataFrame (the original input file has been slightly edited):
original_df = pd.read_json('Donnees_Louvre_Zones_FORMATTED.json')
# The DataFrame's index allows access to each visit recorded in the log file:
#print(original_df.iloc[0])
#print(original_df.iloc[1])
# ...

# Dictionaries are one of python's default data structures which allow to store "key: value" pairs.
# Panda's Series are one-dimensional ndarrays (numpy built-in structures) with axis-labels, which allow to store array-like, dict, or scalar values.

# From the "original_df" dataframe we create two dataframes: one containing per visit data ("visits_df") and one containing per zone data ("zones_df")
ut = timezone('UTC')
visits_df = pd.DataFrame(columns = ["Id", "User Id", "Begin At", "Duration", "Positions", "Zones"])
zone_list2 = []
new_zone_list = [] # a list holding a lists of zones id of each visit, new_zone_list=[[zoneId1, zoneId2, ...],[zoneId1, zoneId2, ...], ...]
for index, visit in original_df._source.iteritems():
   #print('index:', index, 'Visit Id:', original_df.loc[index,'_id'], ',User Id:', visit['user_id'], ',Visit Begin:', visit['visit_begin'], ',Visit Duration:', visit['visit_duration'], ',Number of Positions:', visit['number_of_positions'])
   #visits_df.set_value(index, 'Id', original_df.loc[index,'_id']) # Deprecated
   visits_df.at[index, 'Id'] = original_df.loc[index,'_id']
   if 'user_id' in visit: # alternatively  we could pre-process the input data to not distinguish between 'user_id' and 'userId' and then do a simple: visits_df.set_value(index, 'User Id', visit['user_id'])
      #visits_df.set_value(index, 'User Id', visit['user_id']) # Deprecated
      visits_df.at[index, 'User Id'] = visit['user_id']
   elif 'userId' in visit:
      #visits_df.set_value(index, 'User Id', visit['userId']) # Deprecated
      visits_df.at[index, 'User Id'] = visit['userId']
   #visits_df.set_value(index, 'Begin At', (pd.to_datetime(visit['visit_begin']))) # Deprecated
   visits_df.at[index, 'Begin At'] = pd.to_datetime(visit['visit_begin'])
   #visits_df.set_value(index, 'Begin At', visit['visit_begin']) # if we alternatively wanted to treat it as text # Deprecated
   #visits_df.at[index, 'Begin At'] = visit['visit_begin']       # if we alternatively wanted to treat it as text
   #visits_df.set_value(index, 'Duration', visit['visit_duration']) # Deprecated
   visits_df.at[index, 'Duration'] = visit['visit_duration']
   #visits_df.set_value(index, 'Positions', visit['number_of_positions']) # Deprecated
   visits_df.at[index, 'Positions'] = visit['number_of_positions']
   #visits_df.set_value(index, 'Zones', []) # Deprecated
   visits_df.at[index, 'Zones'] = []
   zone_list1 = [] # temporary list to help create the 'Zones' column of the "visits_df" dataframe.
   list = [] # temporary list that holds the zones id of one visist
   for zone in visit['visit_times_zones'][:]:
      list.append(zone['zone']['id'])
      #print('Begin At:', zone['begin_at'], ',Zone Id:', zone['zone']['id'], ',Zone Name:', zone['zone']['name'],  ',Zone Level:', zone['zone']['level'], ',Duration:', zone['duration'])
      # Iteratively populate the "zone_list1" and "zone_list2" structures (NOTE: the timestamps - originally timezone unaware - now get localized as UTC):
      if 'nextZone' in zone.keys():
         zone_list1.append({"ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": zone['nextZone']})
         if 'user_id' in visit: # instead of this if condition we could pre-process the input data to include only 'user_id' or 'userId'
            zone_list2.append({"VisitId": original_df.loc[index,'_id'], "UserId": visit['user_id'], "VisitDuration": visit['visit_duration'], "VisitPositions": visit['number_of_positions'], "ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": zone['nextZone']})
         else: # 'userId' in visit
            zone_list2.append({"VisitId": original_df.loc[index,'_id'], "UserId": visit['userId'], "VisitDuration": visit['visit_duration'], "VisitPositions": visit['number_of_positions'], "ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": zone['nextZone']})
      else:
         zone_list1.append({"ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": 'none'})
         if 'user_id' in visit: # instead of this if we could pre-process the input data to have only one of 'user_id' or 'userId'
            zone_list2.append({"VisitId": original_df.loc[index,'_id'], "UserId": visit['user_id'], "VisitDuration": visit['visit_duration'], "VisitPositions": visit['number_of_positions'], "ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": 'none'})
         else: # 'userId' in visit
            zone_list2.append({"VisitId": original_df.loc[index,'_id'], "UserId": visit['userId'], "VisitDuration": visit['visit_duration'], "VisitPositions": visit['number_of_positions'], "ZoneBeginAt": (pd.to_datetime(zone['begin_at'])).astimezone('UTC'), "ZoneId": zone['zone']['id'], "ZoneName": zone['zone']['name'], "ZoneLevel": zone['zone']['level'], "ZoneDuration": zone['duration'], "NextZoneId": 'none'})
   #visits_df.set_value(index, 'Zones', zone_list1) # Deprecated
   visits_df.at[index, 'Zones'] = zone_list1
   new_zone_list.append(list)
zones_df = pd.DataFrame(zone_list2, columns=["UserId", "VisitId", "VisitDuration", "VisitPositions", "ZoneName", "ZoneLevel", "ZoneId", "ZoneBeginAt", "ZoneDuration", "NextZoneId"]) # Use "zone_list2" for "zones_df".

# We add the "ZoneDurationWithGap" column to the "zones_df" dataframe:
# Its values are equal to the duration between the "BeginAt" timestamps of each zone and its subsequent zone
# It represents the sum of "ZoneDuration" + non-detection gap (if any)
for index, row in zones_df.iterrows():
   if row['NextZoneId'] != "none":
      zones_df.at[index,'ZoneDurationWithGap'] = (zones_df.at[index+1,'ZoneBeginAt'] - zones_df.at[index,'ZoneBeginAt']).total_seconds()
   else:
      zones_df.at[index,'ZoneDurationWithGap'] = zones_df.at[index,'ZoneDuration']
zones_df['ZoneDurationWithGap'] = zones_df['ZoneDurationWithGap'].astype(int) # forcing float values to become int values

print("import finished ...")

import finished ...


## Co-occurrence

In [5]:
association_rules = apriori(new_zone_list, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)

nbr_items_print = 5

for item in association_rules:
    if nbr_items_print == 0:
        break
    nbr_items_print -= 1
    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + str(items[0]) + " -> " + str(items[1]))

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")


Rule: 60852 -> 60839
Support: 0.008493427704752275
Confidence: 0.5060240963855422
Lift: 3.074065302980966
Rule: 60853 -> 60839
Support: 0.008493427704752275
Confidence: 0.5060240963855422
Lift: 3.418427809599052
Rule: 60846 -> 60847
Support: 0.028311425682507583
Confidence: 0.440251572327044
Lift: 9.22476281846285
Rule: 60848 -> 60846
Support: 0.013751263902932255
Confidence: 0.21383647798742136
Lift: 3.887578616352201
Rule: 60849 -> 60846
Support: 0.030333670374115267
Confidence: 0.4716981132075471
Lift: 6.425749779094548


## Trasition Matrix
This matrix contains all the transitions stored in  the file.The value stored in Donnees_Louvre_Zones_FORMATTED.json file.<br>
matrix[i, j] represents the probability to transit from zone j to the zone i based on the trajectories of visitors stored in our data. For each visit, we take sequential visited zones two by two, and we add 1 to the value stored in matrix[departure zone, arrival zone].
<br>At the end, we divide each value by the sum of the row to provide a stochastic matrix.

In [6]:
zones_N1 = ['0'] # used to initialize the data frame
nbr_arcs = {} # nbr of transitions from the from_zone, in order to calculate the average
trasition_matrix = pd.DataFrame( index=zones_N1)

# loop through zones, and add a transition each time.
for index, row in zones_df.iterrows():
   from_zone = str(row['ZoneId'])
   if  from_zone not in trasition_matrix.index : # Adding new row/column with zeros if it doesn't exist
       trasition_matrix.loc[:, from_zone] = 0.0
       trasition_matrix.loc[from_zone] = 0.0
       nbr_arcs[from_zone] = 0.0

   if row['NextZoneId'] != "none": # if it's note the last zone visited
      to_zone = str(row['NextZoneId'])
      if  to_zone not in trasition_matrix.index : # Adding new row/column with zeros if it doesn't exist
          trasition_matrix.loc[:, to_zone] = 0.0
          trasition_matrix.loc[to_zone] = 0.0
          nbr_arcs[to_zone] = 0.0

      trasition_matrix.at[ from_zone, to_zone ] += 1 # Add the trasition to the value in matrix[from_zone, to_zone]
      nbr_arcs[from_zone] += 1

trasition_matrix = trasition_matrix.drop('0')

# Convert the matrix to a stochastic matrix (sum of values in a row = 1)
for index, row in trasition_matrix.iterrows():
    trasition_matrix.loc[index] = [round(value/nbr_arcs[index], 6) for value in row]

#print(trasition_matrix.loc[:,:].sum(axis=1)) # to make sure it is a stochastic matrix

print("matrix shape : ",trasition_matrix.shape)
print("Transition matrix smaple :")
trasition_matrix.iloc[:10,:]

#df.to_csv('trasition_matrix.csv', index=True, header=True, sep=' ') # write matrix to a csv file

matrix shape :  (30, 30)
Transition matrix smaple :


Unnamed: 0,60890,60888,60852,60902,60904,60853,60854,60839,60849,60903,...,60850,60889,60907,60893,60851,60846,60855,60847,60894,60911
60890,0.0,0.666667,0.013889,0.0,0.0,0.013889,0.013889,0.0,0.0,0.0,...,0.125,0.041667,0.013889,0.013889,0.013889,0.0,0.0,0.0,0.0,0.0
60888,0.110429,0.0,0.019939,0.044479,0.009202,0.194785,0.058282,0.009202,0.016871,0.003067,...,0.021472,0.131902,0.004601,0.003067,0.064417,0.010736,0.001534,0.018405,0.0,0.0
60852,0.0,0.014156,0.0,0.054601,0.250758,0.185035,0.010111,0.012133,0.004044,0.058645,...,0.006067,0.0,0.003033,0.003033,0.340748,0.002022,0.003033,0.001011,0.0,0.001011
60902,0.000976,0.00878,0.087805,0.0,0.081951,0.310244,0.324878,0.0,0.000976,0.007805,...,0.002927,0.000976,0.0,0.0,0.009756,0.0,0.011707,0.0,0.0,0.124878
60904,0.0,0.008562,0.315068,0.145548,0.0,0.068493,0.005137,0.0,0.0,0.212329,...,0.005137,0.0,0.008562,0.0,0.010274,0.0,0.010274,0.001712,0.0,0.006849
60853,0.003158,0.035789,0.263158,0.351579,0.075789,0.0,0.188421,0.02,0.003158,0.005263,...,0.001053,0.004211,0.0,0.0,0.024211,0.002105,0.001053,0.001053,0.0,0.001053
60854,0.0,0.020472,0.029921,0.581102,0.011024,0.277165,0.0,0.00315,0.001575,0.001575,...,0.001575,0.001575,0.0,0.0,0.007874,0.001575,0.006299,0.0,0.0,0.047244
60839,0.0,0.042553,0.446809,0.0,0.0,0.404255,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.042553,0.0
60849,0.0,0.022936,0.018349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.004587,0.013761,0.0,0.098624,0.21789,0.002294,0.013761,0.0,0.0
60903,0.0,0.0,0.177215,0.075949,0.35865,0.033755,0.012658,0.004219,0.0,0.0,...,0.012658,0.0,0.033755,0.0,0.008439,0.004219,0.016878,0.0,0.0,0.008439


## Weighted matrix
This matrix has the same properties of the Transition matrix, except it stores on top of that the duration with the gap of the arrival zone.<br>
For each cell (i, j) in the matrix, contains the average duration that a visitor stays in zone j and came form zone i, this average is divided by the sum of the row to keep the matrix normalized (Stochastic).<br>
<sub><sup>Ps. We only use duration with gap and not the absolute duration</sub></sup>

In [7]:
zones_N1 = ['0'] # used to initialize the data frame
nbr_arcs = {} # nbr of transitions from the from_zone, in order to calculate the average
weighted_matrix = pd.DataFrame( index=zones_N1)
i = 500 # used for debuging
# loop through zones, and add a transition each time.
for index, row in zones_df.iterrows():
   from_zone = str(row['ZoneId'])
   if  from_zone not in weighted_matrix.index : # Adding new row/column with zeros if it doesn't exist
       weighted_matrix.loc[:, from_zone] = 0.0
       weighted_matrix.loc[from_zone] = 0.0
       nbr_arcs[from_zone] = 0.0

   if row['NextZoneId'] != "none": # if it's note the last zone visited
      to_zone = str(row['NextZoneId'])
      if  to_zone not in weighted_matrix.index : # Adding new row/column with zeros if it doesn't exist
          weighted_matrix.loc[:, to_zone] = 0.0
          weighted_matrix.loc[to_zone] = 0.0
          nbr_arcs[to_zone] = 0.0
      
      # Add the trasition to the value in matrix[from_zone, to_zone]
      weighted_matrix.at[ from_zone, to_zone ] += zones_df.iloc[index+1]['ZoneDurationWithGap']
      
      # weighted_matrix.at[ from_zone, to_zone ] += zones_df.loc[(zones_df['VisitId'] == row['VisitId']) 
      #                                          & (zones_df['ZoneId'] == row['NextZoneId'])]['ZoneDurationWithGap'].iloc[0]
     
      nbr_arcs[from_zone] += zones_df.iloc[index+1]['ZoneDurationWithGap']
    
      if(i == 0):
          print('.', end ="")  
          i = 500
      i -= 1
        
weighted_matrix = weighted_matrix.drop('0')

# Convert the matrix to a stochastic matrix (sum of values in a row = 1)
for index, row in weighted_matrix.iterrows():
    weighted_matrix.loc[index] = [round(value/nbr_arcs[index], 6) for value in row]

print()

print("matrix shape : ",weighted_matrix.shape)
print("Transition matrix sample :")
weighted_matrix.iloc[:10,:10]

#df.to_csv('weighted_matrix.csv', index=True, header=True, sep=' ') # write matrix to a csv file

..............................
matrix shape :  (30, 30)
Transition matrix sample :


Unnamed: 0,60890,60888,60852,60902,60904,60853,60854,60839,60849,60903
60890,0.0,0.415353,0.003672,0.0,0.0,0.019181,0.000232,0.0,0.0,0.0
60888,0.060185,0.0,0.025617,0.041891,0.001805,0.142507,0.113199,0.009336,0.021496,0.001004
60852,0.0,0.010102,0.0,0.081194,0.209551,0.151232,0.003386,0.010759,0.003703,0.041261
60902,5.8e-05,0.026515,0.106638,0.0,0.096084,0.199542,0.244208,0.0,0.0,0.004772
60904,0.0,0.010525,0.250595,0.185274,0.0,0.051511,0.007663,0.0,0.0,0.21232
60853,0.00524,0.034125,0.275687,0.331149,0.07766,0.0,0.206255,0.019737,0.006018,0.001231
60854,0.0,0.025767,0.034842,0.594174,0.006323,0.258773,0.0,0.005451,0.001284,0.000292
60839,0.0,0.05297,0.515362,0.0,0.0,0.330751,0.0,0.0,0.0,0.0
60849,0.0,0.020797,0.041515,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60903,0.0,0.0,0.087834,0.059625,0.342566,0.044277,0.009083,0.00012,0.0,0.0


## Adjacency matrix
This matrix contains 1 if there is an accessibility between two zones, the data is stored in zone_To_zone.csv file.<br>
This matrix is symmetric, so each time we add 1 to (i, j) we add it also to (j, i)We also modify this matrix to be stochastic for later use.

In [8]:
with open('zone_To_zone.csv', newline='') as csv_file:
    
    zone_to_zone = pd.read_csv(csv_file) # read from file
        
    zones_N1 = ['0'] # used to initialize the data frame
    adjacency_matrix = pd.DataFrame( index=zones_N1)
    
    for index, row in zone_to_zone.iterrows():
        zone1 = str(row['Zone1'])
        if  zone1 not in adjacency_matrix.index : # Adding new row/column with zeros if it doesn't exist
            adjacency_matrix.loc[:, zone1] = 0.0
            adjacency_matrix.loc[zone1] = 0.0
            nbr_arcs[zone1] = 0.0
        
        zone2 = str(row['Zone2'])
        if  zone2 not in adjacency_matrix.index : # Adding new row/column with zeros if it doesn't exist
            adjacency_matrix.loc[:, zone2] = 0.0
            adjacency_matrix.loc[zone2] = 0.0
            nbr_arcs[zone2] = 0.0
        
        adjacency_matrix.at[ zone1, zone2 ] += 1
        adjacency_matrix.at[ zone2, zone1 ] += 1
   
    adjacency_matrix = adjacency_matrix.drop('0')
    
    # Convert the matrix to a stochastic matrix (sum of values in a row = 1)
    for index, row in adjacency_matrix.iterrows():
        adjacency_matrix.loc[index,:] /= adjacency_matrix.loc[index,:].sum()
    
print(adjacency_matrix.shape) 
adjacency_matrix.iloc[:10,:]     

(82, 82)


Unnamed: 0,60896,60895,60894,60891,60893,70002,70003,60892,60911,60902,...,65023,60888,60887,65022,60848,65011,65012,70013,65006,60839
60896,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60895,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
60894,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60891,0.0,0.0,0.142857,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60893,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70002,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70003,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60892,0.0,0.0,0.0,0.166667,0.0,0.166667,0.166667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Artificial Matrix
Artificial matrix is the merge the transition matrix and the adjacency matrix.We sum the two matrices unequally.<br>
<b>Alpha</b> is the coefficient of using the transition matrix.

In [9]:
alpha = 0.85 # coeffecient of the probability
    
artificial_matrix = adjacency_matrix * (1 - alpha) # copy the adjacency matrix and multiply it by (1 - alpha)


# add the trasition matrix multiplied by alpha
idx = trasition_matrix.index.values
artificial_matrix.loc[idx, idx] += (trasition_matrix.loc[idx,idx] * alpha)

#print(artificial_matrix.sum(axis=1))
artificial_matrix.iloc[:10,:]

Unnamed: 0,60896,60895,60894,60891,60893,70002,70003,60892,60911,60902,...,65023,60888,60887,65022,60848,65011,65012,70013,65006,60839
60896,0.0,0.233273,0.017347,0.0,0.0,0.0,0.0,0.002478,0.0,0.0,...,0.0,0.007434,0.0,0.0,0.004956,0.0,0.0,0.0,0.0,0.0
60895,0.165183,0.0,0.010945,0.0,0.0,0.0,0.0,0.0,0.0,0.003648,...,0.0,0.014592,0.0,0.0,0.219903,0.0,0.0,0.0,0.0,0.0
60894,0.052307,0.026154,0.0,0.15,0.013077,0.0,0.0,0.326923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.013077,0.0,0.0,0.0,0.0,0.026154
60891,0.0,0.0,0.021429,0.0,0.050804,0.021429,0.021429,0.074187,0.001954,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60893,0.0,0.0,0.0,0.088542,0.0,0.075,0.075,0.305469,0.0,0.004427,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70002,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70003,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60892,0.0,0.003633,0.108974,0.119444,0.188889,0.025,0.025,0.0,0.0,0.0,...,0.0,0.010898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60902,0.0,0.00083,0.0,0.003317,0.0,0.0,0.0,0.001658,0.156146,0.0,...,0.0,0.007463,0.0,0.0,0.00083,0.0,0.0,0.0,0.0,0.0


## Artificial Weighted Matrix
Same matrix as above, except we use weighted matrix instead

In [11]:
alpha = 0.85 # coeffecient of the probability
    
artificial_matrix = adjacency_matrix * (1 - alpha) # copy the adjacency matrix and multiply it by (1 - alpha)

# add the trasition matrix multiplied by alpha
idx = weighted_matrix.index.values
artificial_matrix.loc[idx, idx] += (weighted_matrix.loc[idx,idx] * alpha)

#print(artificial_matrix.sum(axis=1))
artificial_matrix.iloc[:10,:]

Unnamed: 0,60896,60895,60894,60891,60893,70002,70003,60892,60911,60902,...,65023,60888,60887,65022,60848,65011,65012,70013,65006,60839
60896,0.0,0.38339,0.007353,0.0,0.0,0.0,0.0,0.002091,0.0,0.0,...,0.0,0.00419,0.0,0.0,0.001834,0.0,0.0,0.0,0.0,0.0
60895,0.117338,0.0,0.002152,0.0,0.0,0.0,0.0,0.0,0.0,0.032941,...,0.0,0.016452,0.0,0.0,0.095131,0.0,0.0,0.0,0.0,0.0
60894,0.004777,0.045637,0.0,0.15,0.003237,0.0,0.0,0.482579,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00308,0.0,0.0,0.0,0.0,0.007817
60891,0.0,0.0,0.021429,0.0,0.024175,0.021429,0.021429,0.090885,9.7e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60893,0.0,0.0,0.0,0.16938,0.0,0.075,0.075,0.471311,0.0,0.018562,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70002,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70003,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.025,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60892,0.0,0.001212,0.07408,0.370649,0.091671,0.025,0.025,0.0,0.0,0.0,...,0.0,0.005199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60902,0.0,0.000949,0.0,0.002325,0.0,0.0,0.0,0.007793,0.220626,0.0,...,0.0,0.022538,0.0,0.0,3.1e-05,0.0,0.0,0.0,0.0,0.0


## Trajectory generation
In this part we based ourselves on the probabilities of passage between two zones of the artificial matrix.we generate a random value based on the algorithm of Mersenne Twister known for its reliability and we compare this generated value with the probability between two zones of the artificial matrix (the more this probability is big the more we will have chance to make this transition).<br>
For the exit conditions we have 2 : 
<ul>
    <li> If you are on an exit zone, you have a 70% chance of getting out</li>
    <li> If the duration of the generated visit is >= 2 * the longest visit of the original dataset </li>
</ul>
Difficulties:
<ul>
    <li> Too many outliers in the dataset</li>
    <li> And only 30 zones out of 82 have a duration so 52 zones have never been used </li>
</ul>

In [152]:
Zone_df_duration = zones_df[['ZoneId','ZoneDurationWithGap']]
Zone_df_duration_bounded = Zone_df_duration.loc[Zone_df_duration['ZoneDurationWithGap'] > 5]
Zone_df_duration_bounded = Zone_df_duration_bounded.loc[Zone_df_duration_bounded['ZoneDurationWithGap'] < 3600]
Zone_df_duration_bounded_grouped = Zone_df_duration_bounded.groupby("ZoneId")
#This function will take as input a ZoneId and return a random duration
from numpy import random
def generateDuration(zoneID):  
    if zoneID in Zone_df_duration_bounded['ZoneId']:##if the zone exists in our dataframe we'll generate a random duration according to the max duration spent in that zone previously
        m = Zone_df_duration_bounded.loc[Zone_df_duration_bounded['ZoneId'] == zoneID].max()
        z = random.exponential(scale=m['ZoneDurationWithGap']/10, size= 1)[0]
    else: #if not, we generate a random duration based on the max durention spent in all zones together
        m = Zone_df_duration_bounded['ZoneDurationWithGap'].max()
        z = random.exponential(scale=m/10, size= 1)
        
    return int(z)

In [153]:
artificial_weighted_matrix =pd.read_csv('artificial_weighted_matrix.csv', index_col=0)

In [154]:
#a ne pas oublier combien de temps il y reste aussi si zone de sorti sortir forcément ou non je test avec oui ?
exit_zones = ['70020', '65019', '65020', '65021', '60887', '60911', '60910']
enter_area = ['60888', '65023']
max_visite_duration = 27697 * 2 #the duration limit of a generated visit will be 2*max_visit_duration
trajectories = []
duree_visite = 0
trajectories.append('60888')#adding enter area 65023 doesn't work
####################################################################################################################################
def GenerateWithProbability(enter):
    y = 0
    if(len(trajectories) > 100) :
        return
    if(duree_visite > max_visite_duration):
        print('hello')
    for i in range(1000):
        if(random.random() < artificial_weighted_matrix.loc[enter][y]) :
            if(artificial_weighted_matrix.columns[y] in exit_zones): #if exit zone add the zone to finish the trajectory and add exit to  recognize it later
                trajectories.append(artificial_weighted_matrix.columns[y])
                trajectories.append('exit')
                trajectories.append('60888')
                GenerateWithProbability(60888) #recursive call with one of the enter zone
            else :
                trajectories.append(artificial_weighted_matrix.columns[y])
                GenerateWithProbability(int(artificial_weighted_matrix.columns[y]))
        else :
            if(y<81) :
                y = y+1
            else :
                y=0
####################################################################################################################################

In [155]:
GenerateWithProbability(60888)

In [156]:
def treatment_output(trajectories) :
    size = len(trajectories)
    idx_list = [idx + 1 for idx, val in
                enumerate(trajectories) if val == 'exit']
    trajectories = [trajectories[i: j] for i, j in
            zip([0] + idx_list, idx_list + 
            ([size] if idx_list[-1] != size else []))]
    trajectories.remove(trajectories[len(trajectories)-1])
    #for i in range (len(trajectories)-1):
        #trajectories = list(dict.fromkeys(trajectories))
    return trajectories
    

In [157]:
treatment_output(trajectories)

[['60888', '60887', 'exit'],
 ['60888', '60889', '60888', '60895', '60910', 'exit'],
 ['60888', '60895', '60848', '60910', 'exit'],
 ['60888', '60887', 'exit'],
 ['60888', '60910', 'exit'],
 ['60888',
  '60853',
  '60854',
  '60902',
  '60904',
  '70011',
  '60904',
  '70011',
  '60906',
  '60904',
  '60903',
  '60904',
  '60903',
  '60904',
  '60852',
  '60855',
  '60852',
  '60906',
  '60908',
  '60909',
  '60891',
  '60909',
  '60891',
  '60906',
  '60891',
  '60909',
  '60910',
  'exit'],
 ['60888',
  '65017',
  '70026',
  '65013',
  '65014',
  '65015',
  '65014',
  '65013',
  '65014',
  '65015',
  '65015',
  '70025',
  '65013',
  '65015',
  '70026',
  '65013',
  '70026',
  '65017',
  '60888',
  '60887',
  'exit'],
 ['60888', '60854', '60902', '60853', '60902', '60911', 'exit'],
 ['60888', '60887', 'exit'],
 ['60888',
  '60851',
  '60850',
  '60907',
  '70007',
  '60907',
  '60891',
  '60906',
  '60904',
  '60906',
  '60908',
  '60906',
  '60902',
  '70010',
  '60854',
  '60902',
 

In [150]:
generateDuration(60888)

432

In [151]:
def generateDurationFromTrajectory(trajectory):
    dictionnary = dict.fromkeys(trajectory,0)
    for i in trajectory:
        dictionnary[i] = generateDuration(i)
    return dictionnary