# Mobility data analysis of researchers trajectories

*Author: Liubov*; *Collaborators: Marc, Bastian, Vero*

Notebook inspired from Open humans project https://www.openhumans.org

Read mobility files from Open humans
https://www.openhumans.org/api/public-data/?source=direct-sharing-138

Other notebooks 
https://exploratory.openhumans.org/notebooks/?source=Moves%20connection

Openhumans open API 
https://open-humans-api.readthedocs.io/en/latest/cli.html#example-use-cases


# Content 
1. Data preprocessing
2. Data analysis
3. Data visualisation

In [1]:


'''
function to create new dataframe from the json trajectory file
code adapted from openhumans notebook
'''

def dataframe_from_json(moves_data):
    for datapoint in moves_data:
        # we need to have observed segments for that day. If moves wasn't running we ignore the day
        if datapoint['segments'] != None:
            # did we stay in a place that day and did we walk that day?
            has_places = False
            walked = False
            for i in datapoint['segments']:
                if i['type'] == 'place':
                        # yes, we were in one place w/o moving around too much, we can keep this day
                        has_places = True
                        
            # is this day in our date range of interest and has data?
            if datapoint['summary'] != None and has_places and datetime.strptime(datapoint['date'],"%Y%m%d") > datetime.strptime(DATARANGE_START,"%Y-%m-%d"):
                moves_processed_data['date'].append(datapoint['date'])
                for activity in datapoint['summary']:
                    if activity['activity'] == 'walking':
                        moves_processed_data['steps'].append(activity['steps'])
                        moves_processed_data['distance'].append(activity['distance'])
                        walked = True
                        
                # in case of not walking, step count is zero
                if not walked:
                    moves_processed_data['steps'].append(0)
                    moves_processed_data['distance'].append(0)  
                    
                # distribution of stops lengths
                stops_distrib = duration_stop_distribution(datapoint['segments'])
                moves_processed_data['duration'].append(stops_distrib)
                location = longest_daily_location(datapoint['segments'])
                moves_processed_data['lat'].append(location['lat'])
                moves_processed_data['lon'].append(location['lon'])    
    
    
    
    #Now that we have all of the data we can convert it into a single pandas dataframe for easier processing and visualization
    
    moves_dataframe = pd.DataFrame(data={
        'date': moves_processed_data['date'],
        'steps': moves_processed_data['steps'],
        'distance': moves_processed_data['distance'],
        'latitude': moves_processed_data['lat'],
        'longitude': moves_processed_data['lon']
    })   
    
    return moves_dataframe




# 1. Data preprocessing

Json data analysis is given here:
https://www.dataquest.io/blog/python-json-tutorial/ 

In [2]:
# main function analyzing moves_data    
# starting date of being researcher - get is from google form CSV file


DATARANGE_START = "2016-06-01"
DATARANGE_END = "2018-05-08"

In [4]:


import os
import json
import requests
from datetime import datetime
from collections import defaultdict
import pandas as pd
import numpy as np
import seaborn as sns


# sets the axis label sizes for seaborn
rc={'font.size': 14, 'axes.labelsize': 14, 'legend.fontsize': 14.0, 
    'axes.titlesize': 14, 'xtick.labelsize': 14, 'ytick.labelsize': 14}
sns.set(rc=rc)

with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/moves-storyline-data.json') as f:
    moves_data = json.load(f)
    
#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/moves-storyline-data98933.json') as f: #uploading different datafiles
#    moves_data2 = json.load(f)

#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/moves-storyline-data98972.json') as f:
#    moves_data3 = json.load(f)   
    

#id_research1 = "32891125"#ID of researcher  # Marc ID 32891125 
id_research2 = "05364098" # Bastian ID 05364098
print('data loaded for researcher ', str(id_research2))

#with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/researcher_mobility/'+id_research1+'/direct-sharing-182/Location History.json') as f:
#    moves_data_researcher1 = json.load(f)   

with open('C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/researcher_mobility/'+id_research2+'/direct-sharing-182/Location History.json') as f:
    moves_data_researcher2 = json.load(f)  
    

#print('first person')    
# print(moves_data)

    


data loaded for researcher  05364098


Moves data example

In [None]:


"[{'date': '20130515',
  'summary': [{'activity': 'walking',
    'group': 'walking',
    'duration': 3331.0,
    'distance': 3458.0,
    'steps': 5070,
    'calories': 202},
   {'activity': 'transport',
    'group': 'transport',
    'duration': 2383.0,
    'distance': 10577.0}],
  'segments': [{'type': 'place',
    'startTime': '20130515T131401-0700',
    'endTime': '20130515T132742-0700',
    'place': {'id': 61807543,
     'name': 'Ramuilar HQ',
     'type': 'home',
     'location': {'lat': 33.99882, 'lon': -118.43657}},
    'activities': [{'activity': 'walking',
      'group': 'walking',
      'manual': False,
      'startTime': '20130515T131801-0700',
      'endTime': '20130515T131851-0700',
      'duration': 50.0,
      'distance': 48.0,
      'steps': 96,
      'calories': 3,
      'trackPoints': []}],
    'lastUpdate': '20151001T012320Z'},
   {'type': 'move',
    'startTime': '20130515T132742-0700',
    'endTime': '20130515T133637-0700',
    'activities': [{'activity': 'transport',
      'group': 'transport',
      'manual': False,
      'startTime': '20130515T132742-0700',
      'endTime': '20130515T133636-0700',
      'duration': 534.0,
      'distance': 3503.0,
      'trackPoints': []}],
    'lastUpdate': '20130515T213146Z'},
   {'type': 'place',
    'startTime': '20130515T133637-0700',
    'endTime': '20130515T142848-0700',
    'place': {'id': 61855165,
     'name': 'Gjelina',
     'type': 'foursquare',
     'foursquareId': '4a6bdc59f964a52020d01fe3',
     'foursquareCategoryIds': ['4bf58dd8d48988d14e941735'],
     'location': {'lat': 33.99050413968406, 'lon': -118.4649780392647}},
    'activities': [{'activity': 'walking',
      'group': 'walking',
      'manual': False,
      'startTime': '20130515T133909-0700',
      'endTime': '20130515T134039-0700',
      'duration': 90.0,
      'distance': 92.0,
      'steps': 184,
      'calories': 5,
      'trackPoints': []}],
    'lastUpdate': '20140301T070136Z'},"

Moves data for a researcher.

In [None]:



"'locations': [{'timestampMs': '1546429982000',
   'latitudeE7': 119987508,
   'longitudeE7': 1202035421,
   'accuracy': 5,
   'velocity': 1,
   'heading': 240,
   'altitude': 11,
   'verticalAccuracy': 4},
  {'timestampMs': '1546436990484',
   'latitudeE7': 119988413,
   'longitudeE7': 1202039089,
   'accuracy': 10,
   'velocity': 2,
   'heading': 194,
   'altitude': 69,
   'verticalAccuracy': 32},
  {'timestampMs': '1546437174494',
   'latitudeE7': 119989723,
   'longitudeE7': 1202050089,
   'accuracy': 10,
   'velocity': 0,
   'heading': 196,
   'altitude': 15,
   'verticalAccuracy': 24},
  {'timestampMs': '1546437361255',
   'latitudeE7': 119987687,
   'longitudeE7': 1202035317,
   'accuracy': 95},
  {'timestampMs': '1546437793491',
   'latitudeE7': 119997058,
   'longitudeE7': 1202074594,
   'accuracy': 30,
   'velocity': 4,
   'heading': 5,
   'altitude': 11,
   'verticalAccuracy': 12},
  {'timestampMs': '1546485386590',
   'latitudeE7': 119998620,
   'longitudeE7': 1202073352,
   'accuracy': 78,
   'altitude': 16,
   'verticalAccuracy': 10},"

# Transforming json to dataframe

In [7]:
import os
import json
import requests
from datetime import datetime
from collections import defaultdict
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np


df1 = pd.DataFrame(moves_data_researcher2)
df1.head(10)
#print(type(df1_datapoint ))
#print(df1.columns)


# function to get data from json data
#df1 = dataframe_from_json(moves_data_researcher1)
#df2 = dataframe_from_json(moves_data_researcher2)



Unnamed: 0,locations
0,"{'timestampMs': '1280557864967', 'latitudeE7':..."
1,"{'timestampMs': '1280557901000', 'latitudeE7':..."
2,"{'timestampMs': '1280557967983', 'latitudeE7':..."
3,"{'timestampMs': '1280557969000', 'latitudeE7':..."
4,"{'timestampMs': '1280558011227', 'latitudeE7':..."
5,"{'timestampMs': '1280558133517', 'latitudeE7':..."
6,"{'timestampMs': '1280558311735', 'latitudeE7':..."
7,"{'timestampMs': '1280558591706', 'latitudeE7':..."
8,"{'timestampMs': '1280558732806', 'latitudeE7':..."
9,"{'timestampMs': '1280558893110', 'latitudeE7':..."


### We need to split column values into different values: timestamps, latitude, longitude etc.

In [13]:
#df1_datapoint = df1.iloc[3].values
print('data has columns of number ',df1.shape)

#1. deserealize each entry of dataframe df1



df1['coordinates'] = df1['locations'].str.split(',')
df1.head()

#2. create new column in dataframe and write time, locations there





data has columns of number  (136571, 2)


Unnamed: 0,locations,coordinates
0,"{'timestampMs': '1280557864967', 'latitudeE7':...",
1,"{'timestampMs': '1280557901000', 'latitudeE7':...",
2,"{'timestampMs': '1280557967983', 'latitudeE7':...",
3,"{'timestampMs': '1280557969000', 'latitudeE7':...",
4,"{'timestampMs': '1280558011227', 'latitudeE7':...",


# 2. Data analysis 

Here we do trajectories analysis of researchers and user profiling: 
    0. we analyze distributions of trip durations, trip lengths 
    1. we analyze frequencies of cities and places visited
    2. we calculate the number of amenities of places around the trajectories in places, where researchers stayed the longest
    3. we analyze possible common properties of trajectories of researchers

In [1]:
#functions inserted from OH notebook 

import json
from pprint import pprint


'''
Functions:
 to estimate the distribution of stop duration
 to estimate the distribution of jump length
'''

def duration_stop_distribution(daily_segments):
    """
    takes a daily segment of Moves 
    returns sorted distribution of stops duration
        Can be misleading for days w/ lots of travel etc. 
    """
    places_of_day = []
    duration_seq = [] #sequence of durations of stops
    for i in daily_segments:
        if i['type'] == 'place':
            place_location = i['place']['location']
            start_time = datetime.strptime(i['startTime'],'%Y%m%dT%H%M%S%z')
            end_time = datetime.strptime(i['endTime'],'%Y%m%dT%H%M%S%z')
            duration = end_time - start_time
            duration_seq.append([duration])
    duration_seq.sort() 
    return duration_seq 

def len_jumps_distribution(daily_segments):
    """
    takes a daily segment of Moves 
    returns the distribution of jumps lengths (calculated from lat/long of stops)
    can be misleading for days w/ lots of travel etc. 
    """
    places_of_traj = [] #returns array of all locations during the day
    for i in daily_segments: 
        if i['type'] == 'place':
            place_location = i['place']['location']
            places_of_traj.append([place_location])
    return places_of_traj


def longest_daily_location(daily_segments):
    """
    Function from Bastian (Open Humans)
       
    takes a daily segment log of Moves and returns the 
    lat/long for the location where most time was spent. 
    Can be misleading for days w/ lots of travel etc. 
    But the most quick/dirty solution for now.
    """
    places_of_day = []
    for i in daily_segments:
        if i['type'] == 'place':
            place_location = i['place']['location']
            start_time = datetime.strptime(i['startTime'],'%Y%m%dT%H%M%S%z')
            end_time = datetime.strptime(i['endTime'],'%Y%m%dT%H%M%S%z')
            duration = end_time - start_time
            places_of_day.append([place_location,duration])
    places_of_day.sort(key=lambda tup: tup[-1],reverse=True)
    return places_of_day[0][0]






In [None]:

# 0. analysis of trip durations

array_distance = df1.distance.values 
type(array_distance)
