# Raw data analysis and segmentation

Code adapted from the City Brain module.

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 14:14:12 2019
@author: smammeri

"""
import pandas as pd
import numpy as np
import geopy.distance
import logging
import time
import uuid
#from utils.data_saving import create_connection_engine, push2db
#from config.mipconfig import DBCONFIG
#import psycopg2

# Logger
logging.basicConfig()
logger = logging.getLogger(" trip_segmentation_tools ")
logger.setLevel(logging.DEBUG)

# Column layout of the per-trip summary table built by format_trips.
TRIPS_COLUMNS = [
    "userid",
    "tripid",
    "datetimestart",
    "datetimestop",
    "latitudestart",
    "longitudestart",
    "latitudestop",
    "longitudestop",
    "averagespeed",
    "motionmode",
    "durationsec",
    "distance",
    "timestampstart",
    "timestampstop",
]

# Constants
# Largest time difference (same unit as the "timestamp" column, presumably
# seconds) tolerated between two consecutive records of one trip.
MAX_SECONDS_BETWEEN_TRIPS = 120
# Trips with fewer records than this are discarded by filter_my_trips.
MIN_ITEM_NUMBER_FOR_TRIP = 10
# Kilometres -> metres conversion factor.
KM_2_M = 1000
# Minimum speed regarded as real motion: 0.5 m/s = 1.8 km/h.
MIN_WALKING_SPEED = 0.5
# Minimum origin-destination distance in metres (the value is 100 m).
MIN_OD_DISTANCE = 100
# Bounds on the accumulated trip distance, in kilometres.
MAX_TRIP_DISTANCE = 1000.0
MIN_TRIP_DISTANCE = 0.1


def check_time_gap(ts1, ts2, diff=False):
    """
        Tell whether ts1 lies more than MAX_SECONDS_BETWEEN_TRIPS after ts2.

        The difference ``ts1 - ts2`` is truncated to an integer before it is
        compared with the threshold.

        Returns the boolean flag alone, or the tuple ``(flag, difference)``
        when ``diff`` is truthy.
    """
    elapsed = int(ts1 - ts2)
    exceeded = elapsed > MAX_SECONDS_BETWEEN_TRIPS

    if diff:
        return exceeded, elapsed
    return exceeded

def get_distance_diff(pos1, pos2, meter = True):
    """
        Geodesic distance between two GPS positions, computed with geopy.

        Callers in this file pass the positions as (latitude, longitude)
        tuples. The result is in metres by default and in kilometres when
        ``meter`` is falsy.
        TODO: change the function below if a better option is available.
    """
    separation_km = geopy.distance.geodesic(pos1, pos2).km

    if meter:
        return separation_km * KM_2_M
    return separation_km

def check_motion_validity(distance, duration):
    """
        Return True when distance/duration is strictly above MIN_WALKING_SPEED.

        A falsy duration (0 or None) always yields False, which also avoids
        dividing by zero.
    """
    return bool(duration and float(distance / duration) > MIN_WALKING_SPEED)

def tag_my_trips(dataframe, timestamps):
    """
        Label every record with the id of the trip window it falls into.

        Consecutive entries of ``timestamps`` delimit half-open windows
        [timestamps[k], timestamps[k+1]); each window receives its own random
        UUID in the "tripid" column. Records outside every window keep None.

        The column is written into ``dataframe`` itself, which is also the
        returned object.
    """
    dataframe["tripid"] = [None] * len(dataframe)
    stamps = dataframe["timestamp"]

    for lower, upper in zip(timestamps[:-1], timestamps[1:]):
        in_window = (stamps >= lower) & (stamps < upper)
        dataframe.loc[in_window, ["tripid"]] = uuid.uuid4()

    return dataframe

def filter_my_trips(dataframe, att="tripid"):
    """
        Drop every trip that has fewer than MIN_ITEM_NUMBER_FOR_TRIP records.

        Parameters:
            dataframe: records carrying a trip tag in column ``att``.
            att: name of the tag column (previously ignored in favour of a
                 hard-coded "tripid"; the default keeps that behaviour).

        Returns:
            (dataframe, tag_to_keep): the records without the short trips and
            the list of surviving tags, ordered by decreasing record count.
            Records whose tag is missing (None/NaN) are not counted by
            value_counts and are therefore left untouched.
    """
    counts = dataframe[att].value_counts()

    tag_to_delete = counts[counts < MIN_ITEM_NUMBER_FOR_TRIP].index
    tag_to_keep = counts[counts >= MIN_ITEM_NUMBER_FOR_TRIP].index.tolist()

    # One membership mask replaces re-filtering the whole frame once per tag.
    if len(tag_to_delete):
        dataframe = dataframe[~dataframe[att].isin(tag_to_delete)]

    return dataframe, tag_to_keep

def remove_unconsitent_trips(dataframe, tags_to_check):
    """
        Discard tagged trips that fail the consistency checks.

        A trip is kept only when
          * its first and last records are at least MIN_OD_DISTANCE apart
            (get_distance_diff is called with its default, i.e. metres), and
          * the "distance" value of its last record lies within
            [MIN_TRIP_DISTANCE, MAX_TRIP_DISTANCE].

        Returns the records without the rejected trips and the list of tags
        that passed, in the order they were checked.
    """
    final_tags = []

    for tag in tags_to_check:
        chunk = dataframe[dataframe["tripid"] == tag]
        first_row, last_row = chunk.head(1), chunk.tail(1)
        origin = (first_row.latitude.item(), first_row.longitude.item())
        destination = (last_row.latitude.item(), last_row.longitude.item())

        # Origin/destination check first; the trip length is only read when
        # that check passes.
        keep = not (get_distance_diff(origin, destination) < MIN_OD_DISTANCE)
        if keep:
            travelled = last_row.distance.item()
            keep = not (travelled < MIN_TRIP_DISTANCE or travelled > MAX_TRIP_DISTANCE)

        if keep:
            final_tags.append(tag)
        else:
            dataframe = dataframe[dataframe["tripid"] != tag]

    return dataframe, final_tags
        
def process_trip_for_one_user(df):
    """
        Split one user's raw records into trips.

        The records are sorted by "timestamp" and scanned once. A record is
        flagged as a stop point when it is the first record, the last record,
        or when the time gap to the previous record exceeds the threshold of
        check_time_gap while the implied speed is not a valid motion
        (check_motion_validity).

        Added / overwritten columns:
            working_index: 0-based position after sorting.
            stop_points:   True on stop-point records.
            distance:      0.0 everywhere except on the record just before a
                           stop point, which holds the distance accumulated
                           since the previous stop point, in km (the jump to
                           the stop point itself is excluded).

        Returns:
            (df, stop_timestamps): the sorted, annotated copy of the input and
            the timestamps of the stop points in scan order.
    """
    df = df.sort_values(by=["timestamp"])
    n_rows = len(df)
    last_idx = n_rows - 1

    df["working_index"] = list(range(n_rows))

    # Results are gathered in plain lists and written back once, instead of
    # scanning the whole frame with a boolean mask for every record.
    stop_flags = [False] * n_rows
    distances = [0.0] * n_rows  # float from the start: values are in km
    stop_timestamps = []
    acc_distance = 0
    previous = None  # (timestamp, position) of the preceding record

    records = zip(df["timestamp"], df["latitude"], df["longitude"])
    for current_idx, (timestamp, latitude, longitude) in enumerate(records):
        current_pos = (latitude, longitude)

        if previous is None:  # First record
            stop_flags[current_idx] = True
            stop_timestamps.append(timestamp)
        else:
            last_timestamp, last_pos = previous

            # Scalars are passed here; the former per-row frame lookup handed
            # a one-element Series to int(), which pandas has deprecated.
            gap_check, time_gap = check_time_gap(timestamp, last_timestamp, True)

            current_dist = get_distance_diff(current_pos, last_pos)
            acc_distance += current_dist

            if (gap_check and not check_motion_validity(current_dist, time_gap)) or current_idx == last_idx:
                stop_timestamps.append(timestamp)
                stop_flags[current_idx] = True
                distances[current_idx - 1] = (acc_distance - current_dist) / KM_2_M
                acc_distance = 0

        previous = (timestamp, current_pos)

    df["stop_points"] = stop_flags
    df["distance"] = distances

    return df, stop_timestamps

def format_trips(dataframe):
    """
        Collapse tagged records into one summary row per trip.

        Output columns (TRIPS_COLUMNS):
        'userid', 'tripid', 'datetimestart','datetimestop',
        'latitudestart', 'longitudestart',
        'latitudestop', 'longitudestop',
        'averagespeed',
        'motionmode',
        'durationsec',
        'distance',
        'timestampstart', 'timestampstop'

        For each trip the start values come from its first record and the stop
        values from its last record; 'averagespeed' is the mean of "speed",
        'motionmode' the most frequent "transportmotionmode" (None when the
        trip has no non-null mode), 'durationsec' the timestamp difference and
        'distance' the "distance" value of the last record.

        Records without a trip id are ignored. An empty frame with the
        expected columns is returned when there is no trip at all.

        Rows are collected in a list and the frame is built once:
        DataFrame.append no longer exists in pandas >= 2.0. Column dtypes are
        therefore inferred from the values instead of being all-object.
    """
    records = []

    # sort=False keeps trips in order of first appearance; groupby drops
    # missing (None/NaN) keys, i.e. the untagged records.
    for _, trip in dataframe.groupby("tripid", sort=False):
        first = trip.iloc[0]
        last = trip.iloc[-1]
        modes = trip["transportmotionmode"].value_counts()

        records.append([
            first["userid"], first["tripid"], first["datetime"], last["datetime"],
            first["latitude"], first["longitude"],
            last["latitude"], last["longitude"],
            trip["speed"].mean(),
            modes.index[0] if not modes.empty else None,  # Take most occurent mode
            last["timestamp"] - first["timestamp"],
            last["distance"],
            first["timestamp"], last["timestamp"],
        ])

    return pd.DataFrame(records, columns=TRIPS_COLUMNS)

def simulate_connection_loss(dataframe, ratio=0.3, random_seed=50):
    """
        Randomly remove a share of the records to mimic lost connectivity.

        ``int(len(dataframe) * ratio)`` distinct index labels are drawn and
        dropped. A ratio below 0.01 returns the input object unchanged.

        Note: the global NumPy random generator is re-seeded with
        ``random_seed`` on every call that drops records, so the selection is
        reproducible.
    """
    if ratio < 0.01:
        return dataframe

    np.random.seed(random_seed)
    n_dropped = int(len(dataframe) * ratio)
    lost_labels = np.random.choice(dataframe.index, size=n_dropped, replace=False)

    return dataframe.drop(index=lost_labels)

def preprocessing(dataframe):
    """
        Pre-filter the raw records before segmentation.

        Only records whose "speed" is strictly below 150.0 are kept. Further
        filters could be added at this stage, e.g. on
        datetime
        (lat,long) with a radius to focus on specific areas (ex: Paris, 50km)
    """
    # Remove highest speeds. A lower bound (speed > 2.0) was tried and is
    # currently disabled.
    below_speed_cap = dataframe["speed"] < 150.0
    return dataframe[below_speed_cap]

def process_one_user(user_dataframe):
    """
        Run the full segmentation pipeline on the records of a single user.

        Steps, each timed and reported on the module logger at DEBUG level:
          1. process_trip_for_one_user - detect stop points
          2. tag_my_trips              - assign a trip id per window
          3. filter_my_trips           - drop trips with too few records
          4. remove_unconsitent_trips  - drop inconsistent trips
          5. format_trips              - build one summary row per trip

        Returns the per-trip summary frame produced by format_trips.
    """
    def _log_step(step, started):
        # Same message layout for every stage of the pipeline.
        logger.debug("PROCESSING " + str(step) + " in " + str(time.time() - started) + " (s)")

    # PROCESSING
    started = time.time()
    segmented, stop_ts = process_trip_for_one_user(user_dataframe)
    _log_step(1, started)

    started = time.time()
    tagged = tag_my_trips(segmented, stop_ts)
    _log_step(2, started)

    started = time.time()
    long_enough, candidate_tags = filter_my_trips(tagged, "tripid")
    _log_step(3, started)

    started = time.time()
    consistent, _kept_tags = remove_unconsitent_trips(long_enough, candidate_tags)
    _log_step(4, started)

    started = time.time()
    trips = format_trips(consistent)
    _log_step(5, started)
    return trips

def process_trips_for_all_users(dataframe):
    """
        Segment the records of every user and stack the resulting trips.

        The input is processed user by user (distinct values of "userid", in
        order of first appearance) through process_one_user.

        Returns one frame with the TRIPS_COLUMNS layout holding the trips of
        all users; each user's rows keep the index produced for that user.
        An empty frame with those columns is returned when no trip is found.

        The per-user frames are combined with a single pd.concat call:
        DataFrame.append no longer exists in pandas >= 2.0.
    """
    # PROCESS User by User
    per_user_trips = []
    for user in dataframe['userid'].unique():
        user_df = dataframe[dataframe['userid'] == user]
        processed_user_df = process_one_user(user_df)
        # Empty results are skipped so they cannot influence the dtypes of
        # the concatenated frame.
        if not processed_user_df.empty:
            per_user_trips.append(processed_user_df)

    if not per_user_trips:
        return pd.DataFrame(columns=TRIPS_COLUMNS)

    return pd.concat(per_user_trips)





In [3]:


############################################
#  Main body program
############################################

# example data
# NOTE(review): both paths are absolute and specific to one Windows machine;
# adjust them before re-running this cell elsewhere.
# filepath_before is defined but not read anywhere in the visible cells.
filepath_before = 'C:/Users/lyubo/Documents/DATA_networks/mobilitydata/cityBrain/my_trips.csv'
filepath_full = 'C:/Users/lyubo/Documents/DATA_networks/mobilitydata/cityBrain/trips_updated.csv'

# Load the trips table and inspect its shape, column names and first rows.
df_full = pd.read_csv(filepath_full)
print(df_full.shape)
print(df_full.columns)
df_full.head()

(2882, 15)
Index(['ctid', 'userid', 'tripid', 'datetimestart', 'datetimestop',
       'latitudestart', 'longitudestart', 'latitudestop', 'longitudestop',
       'averagespeed', 'motionmode', 'durationsec', 'distance',
       'timestampstart', 'timestampstop'],
      dtype='object')


Unnamed: 0,ctid,userid,tripid,datetimestart,datetimestop,latitudestart,longitudestart,latitudestop,longitudestop,averagespeed,motionmode,durationsec,distance,timestampstart,timestampstop
0,"(0,2)",eu-west-1:644fc1d8-138d-424e-a3bb-65ed317e86ab,fe5e9a6a-b6c4-4004-aa90-5c7c9b5d3a5f,9/25/18 15:53,9/25/18 15:58,48.668435,2.237086,48.668373,2.237038,1.223873,walking,324,0.065103,1537890788,1537891112
1,"(0,3)",eu-west-1:644fc1d8-138d-424e-a3bb-65ed317e86ab,808fd214-58f4-40e8-8339-26fbff83f470,9/25/18 16:29,9/25/18 16:46,48.667619,2.238008,48.714901,2.298308,36.642469,car,1055,10.814846,1537892942,1537893997
2,"(0,4)",eu-west-1:644fc1d8-138d-424e-a3bb-65ed317e86ab,808fd214-58f4-40e8-8339-26fbff83f470,9/25/18 16:46,9/25/18 16:46,48.714893,2.298301,48.714887,2.298306,0.164059,train,5,0.000707,1537894002,1537894007
3,"(0,5)",eu-west-1:644fc1d8-138d-424e-a3bb-65ed317e86ab,808fd214-58f4-40e8-8339-26fbff83f470,9/25/18 16:47,9/25/18 17:3,48.714889,2.298302,48.704187,2.322558,11.361664,car,962,2.80142,1537894025,1537894987
4,"(0,6)",eu-west-1:644fc1d8-138d-424e-a3bb-65ed317e86ab,09c45ba4-489c-49f9-9bfa-7f6040bedf19,9/26/18 15:17,9/26/18 15:33,48.665507,2.237715,48.713246,2.297054,41.910265,car,970,10.663661,1537975059,1537976029


In [4]:
# NOTE(review): absolute, machine-specific path; adjust before re-running.
filepath = "C:/Users/lyubo/Documents/DATA_networks/mobilitydata/openhumans/mobility_delta_ID_05364098_researcher.csv"

# Load the raw mobility records and inspect shape, column names and first rows.
df_researcher = pd.read_csv(filepath)
print(df_researcher.shape)
print(df_researcher.columns)
df_researcher.head()

(136570, 8)
Index(['Unnamed: 0', 'time', 'start_lat', 'start_lon', 'dest_lat', 'dest_lon',
       'distance', 'delta'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,time,start_lat,start_lon,dest_lat,dest_lon,distance,delta
0,0,0,52.32512,8.097732,52.536895,8.113194,1350.735616,0.0
1,1,1,52.536895,8.113194,52.55734,8.113479,130.304575,1.0
2,2,2,52.55734,8.113479,52.561325,8.112839,25.538086,1.0
3,3,3,52.561325,8.112839,52.5723,8.110485,70.657676,1.0
4,4,4,52.5723,8.110485,52.61285,8.125504,266.63692,1.0


In [8]:
# rename columns according to function requirements 
# df_res_new uses the "...start"/"...stop" naming; it is not used by any later
# visible cell, and its head() is not displayed because it is not the last
# expression of the cell.
df_res_new = df_researcher.rename(columns={"start_lat": "latitudestart", "start_lon": "longitudestart", "dest_lat": "latitudestop", "dest_lon": "longitudestop", "distance":"distance", "delta": "timestamp"})
df_res_new.head()


# df_raw exposes the start position as "latitude"/"longitude", the column names
# read by process_trip_for_one_user.
# NOTE(review): "delta" is used as "timestamp" here. The sample rows show
# values 0.0/1.0, which looks like a per-record time difference rather than an
# absolute time; sorting by it would not preserve the recording order. Confirm
# whether a cumulative time (or the "time" column) should be used instead.
df_raw = df_researcher.rename(columns={"start_lat": "latitude", "start_lon": "longitude", "dest_lat": "latitudestop", "dest_lon": "longitudestop", "distance":"distance", "delta": "timestamp"})
df_raw.head()

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,latitudestop,longitudestop,distance,timestamp
0,0,0,52.32512,8.097732,52.536895,8.113194,1350.735616,0.0
1,1,1,52.536895,8.113194,52.55734,8.113479,130.304575,1.0
2,2,2,52.55734,8.113479,52.561325,8.112839,25.538086,1.0
3,3,3,52.561325,8.112839,52.5723,8.110485,70.657676,1.0
4,4,4,52.5723,8.110485,52.61285,8.125504,266.63692,1.0


In [9]:

# Step 1: sort the records, flag stop points and accumulate per-trip distances.
# df_new is the annotated copy; stop_timestamps lists the stop-point timestamps.
df_new, stop_timestamps = process_trip_for_one_user(df_raw)




In [15]:
# Inspect the annotated frame returned by process_trip_for_one_user.
print(df_new.shape)

# NOTE(review): the value_counts() result below is neither printed nor the last
# expression of the cell, so it is not shown; wrap it in print() to see it.
df_new.stop_points.value_counts()
df_new.head(10)


(136570, 10)


Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,latitudestop,longitudestop,distance,timestamp,working_index,stop_points
0,0,0,52.32512,8.097732,52.536895,8.113194,0.0,0.0,0,True
91055,91055,91055,38.063824,23.799519,38.064009,23.799763,0.0,1.0,1,False
91054,91054,91054,38.06376,23.799623,38.063824,23.799519,0.0,1.0,2,False
91053,91053,91053,38.059831,23.797104,38.06376,23.799623,0.0,1.0,3,False
91052,91052,91052,38.06097,23.798575,38.059831,23.797104,0.0,1.0,4,False
91051,91051,91051,38.059906,23.797272,38.06097,23.798575,0.0,1.0,5,False
91050,91050,91050,38.059631,23.796895,38.059906,23.797272,0.0,1.0,6,False
91049,91049,91049,38.058718,23.797072,38.059631,23.796895,0.0,1.0,7,False
91048,91048,91048,38.05808,23.799359,38.058718,23.797072,0.0,1.0,8,False
91047,91047,91047,38.056588,23.801888,38.05808,23.799359,0.0,1.0,9,False


# Trip segmentation step 2 
Now, based on the last index of `stop_points`, we will create a new DataFrame containing only the stops and assign trip IDs.