# Mobility data analysis of researchers trajectories

*Author: Liubov*; *Collaborators: Marc, Bastian, Vero*

Notebook inspired from Open humans project https://www.openhumans.org

Read mobility files from Open humans
https://www.openhumans.org/api/public-data/?source=direct-sharing-138

Other notebooks 
https://exploratory.openhumans.org/notebooks/?source=Moves%20connection

Openhumans open API 
https://open-humans-api.readthedocs.io/en/latest/cli.html#example-use-cases


# Content 
1. Data preprocessing
2. Data analysis
3. Data visualisation

In [1]:


'''
function to create new dataframe from the json trajectory file
code adapted from openhumans notebook
'''

def dataframe_from_json(moves_data):
    for datapoint in moves_data:
        # we need to have observed segments for that day. If moves wasn't running we ignore the day
        if datapoint['segments'] != None:
            # did we stay in a place that day and did we walk that day?
            has_places = False
            walked = False
            for i in datapoint['segments']:
                if i['type'] == 'place':
                        # yes, we were in one place w/o moving around too much, we can keep this day
                        has_places = True
                        
            # is this day in our date range of interest and has data?
            if datapoint['summary'] != None and has_places and datetime.strptime(datapoint['date'],"%Y%m%d") > datetime.strptime(DATARANGE_START,"%Y-%m-%d"):
                moves_processed_data['date'].append(datapoint['date'])
                for activity in datapoint['summary']:
                    if activity['activity'] == 'walking':
                        moves_processed_data['steps'].append(activity['steps'])
                        moves_processed_data['distance'].append(activity['distance'])
                        walked = True
                        
                # in case of not walking, step count is zero
                if not walked:
                    moves_processed_data['steps'].append(0)
                    moves_processed_data['distance'].append(0)  
                    
                # distribution of stops lengths
                stops_distrib = duration_stop_distribution(datapoint['segments'])
                moves_processed_data['duration'].append(stops_distrib)
                location = longest_daily_location(datapoint['segments'])
                moves_processed_data['lat'].append(location['lat'])
                moves_processed_data['lon'].append(location['lon'])    
    
    
    
    #Now that we have all of the data we can convert it into a single pandas dataframe for easier processing and visualization
    
    moves_dataframe = pd.DataFrame(data={
        'date': moves_processed_data['date'],
        'steps': moves_processed_data['steps'],
        'distance': moves_processed_data['distance'],
        'latitude': moves_processed_data['lat'],
        'longitude': moves_processed_data['lon']
    })   
    
    return moves_dataframe


# 1. Data preprocessing

Json data analysis is given here:
https://www.dataquest.io/blog/python-json-tutorial/ 
# Transforming json file to dataframe

We have json file with google locations and we need to transform it to dataframe.

In [5]:
# main function analyzing moves_data    
# starting date of being researcher - get is from google form CSV file


#DATARANGE_START = "2016-06-01"
#DATARANGE_END = "2018-05-08"

import os
import json
import requests
from datetime import datetime
from collections import defaultdict
import pandas as pd
import numpy as np
import seaborn as sns


# sets the axis label sizes for seaborn
rc={'font.size': 14, 'axes.labelsize': 14, 'legend.fontsize': 14.0, 
    'axes.titlesize': 14, 'xtick.labelsize': 14, 'ytick.labelsize': 14}
sns.set(rc=rc)

#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/moves-storyline-data.json') as f:
#    moves_data = json.load(f)

#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/moves-storyline-data98972.json') as f:
#    moves_data3 = json.load(f)   
    

id_research1 = "05364098" # Bastian ID 05364098
#id_research1 = "32891125"#ID of researcher  # Marc ID 32891125 
print('data loaded for researcher ', str(id_research1))

with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/researcher_mobility/'+id_research1+'/direct-sharing-182/Location History.json') as f:
    moves_data_researcher1 = json.load(f)  
#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/researcher_mobility/'+id_research1+'/direct-sharing-182/Location History.json') as f:
#    moves_data_researcher1 = json.load(f)   



df1 = pd.DataFrame(moves_data_researcher1)
df1.head(10)
#print(type(df1_datapoint ))
#print(df1.columns)


# function to get data from json data
#df1 = dataframe_from_json(moves_data) # works for other geo-files 
    


data loaded for researcher  05364098


Unnamed: 0,locations
0,"{'timestampMs': '1280557864967', 'latitudeE7':..."
1,"{'timestampMs': '1280557901000', 'latitudeE7':..."
2,"{'timestampMs': '1280557967983', 'latitudeE7':..."
3,"{'timestampMs': '1280557969000', 'latitudeE7':..."
4,"{'timestampMs': '1280558011227', 'latitudeE7':..."
5,"{'timestampMs': '1280558133517', 'latitudeE7':..."
6,"{'timestampMs': '1280558311735', 'latitudeE7':..."
7,"{'timestampMs': '1280558591706', 'latitudeE7':..."
8,"{'timestampMs': '1280558732806', 'latitudeE7':..."
9,"{'timestampMs': '1280558893110', 'latitudeE7':..."


Moves data for a researcher.

In [None]:
"'locations': [{'timestampMs': '1546429982000',
   'latitudeE7': 119987508,
   'longitudeE7': 1202035421,
   'accuracy': 5,
   'velocity': 1,
   'heading': 240,
   'altitude': 11,
   'verticalAccuracy': 4},


# Creating new data frame and readable csv file
This file we can then visualise in deck.gl

In [7]:

# create dataframe with lat, lon
time_array = np.zeros(int(df1.shape[0]))
lat_array = np.zeros(int(df1.shape[0]))
lon_array = np.zeros(int(df1.shape[0]))

 
for ind in range(0,int(df1.shape[0])): # make a loop in all rows of dataframe df1 
    #df1_datapoint = df1.iloc[ind].values
    dict_df1_data = df1.iloc[ind].values[0]
    
    time_array[ind] = dict_df1_data['timestampMs']
    lat_array[ind] = dict_df1_data['latitudeE7']
    lon_array[ind] = dict_df1_data['longitudeE7']


df1['time'] = time_array

df1['lat'] = lat_array
df1['lon'] = lon_array


print('done with creating new dataframe')
    
# get ith datapoint in dataframe: df1_datapoint = df1.iloc[i].values
# get value in dictionary for timestamp, lat, lon: dict_df1_data = df1_datapoint[0]
# dict_df1_data['timestampMs'] ... 

done with creating new dataframe


### We need to split column values into different values: timestamps, latitude, longitude etc.

In [8]:

df1.head()

Unnamed: 0,locations,time,lat,lon
0,"{'timestampMs': '1280557864967', 'latitudeE7':...",1280558000000.0,523251200.0,80977320.0
1,"{'timestampMs': '1280557901000', 'latitudeE7':...",1280558000000.0,525368945.0,81131936.0
2,"{'timestampMs': '1280557967983', 'latitudeE7':...",1280558000000.0,525573400.0,81134790.0
3,"{'timestampMs': '1280557969000', 'latitudeE7':...",1280558000000.0,525613248.0,81128391.0
4,"{'timestampMs': '1280558011227', 'latitudeE7':...",1280558000000.0,525723000.0,81104850.0


In [None]:
pd.DataFrame.to_csv('')

# 2. Data visualisation 

Visualise trajectory on the map.

In [37]:
import numpy as np
from matplotlib import pyplot as plt
import numpy as np
import matplotlib.cm as cm
import folium

# function to visualise trajectories on a map 

def visualise_df_traj(df, lat_values, lon_values):
    ''' function to visusalise trajectories on a map
    input:
    lat_values = df.latitude.values
    lon_values = df.longitude.values'''
    
    latitude = df.lat.values
    longitude = df.lon.values

    #then we zip two arrays of lat, lon of datapoints
    latlon1 = list(zip(latitude, longitude))


    #We put map to show first location of Paris 48.8566° North, 2.3522° est
    mapit = folium.Map( location=[48.75 , 2.35], zoom_start=6 )
    for coord in latlon1:
        folium.Marker( location=[ coord[0], coord[1] ], fill_color='#43d9de', radius=8 ).add_to( mapit )
    mapit


    #draw connecting line between locations written in latlon file

    trips = df.tripid # different trips
    array_trip = np.unique(trips) 


    #for ind, c in zip(ind_len, colors):    
    for ind in range(len(array_trip)):
        df_trip = df[df.tripid == array_trip[ind]]
        shape =df_trip.shape 
    
        latitude1 = df_trip.latitudestart.values
        longitude1 = df_trip.longitudestart.values

        #then we zip two arrays of lat, lon of datapoints
        latlon1 = list(zip(latitude1, longitude1))

    
        size = shape[0]
    SAF=folium.PolyLine(locations=latlon1,weight=5,color = 'r')#color_array[ind])
    mapit.add_child(SAF)
    #plt.show()
    


ModuleNotFoundError: No module named 'folium'

In [None]:
lat_values = df1.lat.values
lon_values = df1.lon.values

visualise_df_traj(df1, lat_values, lon_values)

# 3. Data analysis 

Here we do trajectories analysis of researchers and user profiling: 
    0. we analyze distributions of trip durations, trip lengths 
    1. we analyze frequencies of cities and places visited
    2. we calculate the number of amenities of places around the trajectories in places, where researchers stayed the longest
    3. we analyze possible common properties of trajectories of researchers

In [1]:
#functions inserted from OH notebook 

import json
from pprint import pprint


'''
Functions:
 to estimate the distribution of stop duration
 to estimate the distribution of jump length
'''

def duration_stop_distribution(daily_segments):
    """
    takes a daily segment of Moves 
    returns sorted distribution of stops duration
        Can be misleading for days w/ lots of travel etc. 
    """
    places_of_day = []
    duration_seq = [] #sequence of durations of stops
    for i in daily_segments:
        if i['type'] == 'place':
            place_location = i['place']['location']
            start_time = datetime.strptime(i['startTime'],'%Y%m%dT%H%M%S%z')
            end_time = datetime.strptime(i['endTime'],'%Y%m%dT%H%M%S%z')
            duration = end_time - start_time
            duration_seq.append([duration])
    duration_seq.sort() 
    return duration_seq 

def len_jumps_distribution(daily_segments):
    """
    takes a daily segment of Moves 
    returns the distribution of jumps lengths (calculated from lat/long of stops)
    can be misleading for days w/ lots of travel etc. 
    """
    places_of_traj = [] #returns array of all locations during the day
    for i in daily_segments: 
        if i['type'] == 'place':
            place_location = i['place']['location']
            places_of_traj.append([place_location])
    return places_of_traj


def longest_daily_location(daily_segments):
    """
    Function from Bastian (Open Humans)
       
    takes a daily segment log of Moves and returns the 
    lat/long for the location where most time was spent. 
    Can be misleading for days w/ lots of travel etc. 
    But the most quick/dirty solution for now.
    """
    places_of_day = []
    for i in daily_segments:
        if i['type'] == 'place':
            place_location = i['place']['location']
            start_time = datetime.strptime(i['startTime'],'%Y%m%dT%H%M%S%z')
            end_time = datetime.strptime(i['endTime'],'%Y%m%dT%H%M%S%z')
            duration = end_time - start_time
            places_of_day.append([place_location,duration])
    places_of_day.sort(key=lambda tup: tup[-1],reverse=True)
    return places_of_day[0][0]






In [None]:

# 0. analysis of trip durations

array_distance = df1.distance.values 
type(array_distance)
