In [1]:
from __future__ import print_function
from __future__ import division

import skmob
from skmob.preprocessing import compression
from skmob.tessellation import tilers

import pandas as pd 
import numpy as np

from shapely import wkt

from glob import glob

from tqdm import tqdm

import geopandas as gpd

import time
import argparse
import numpy as np
import _pickle as pickle
from collections import Counter

import folium

OUT_PATH = '../data/'

# Gowalla

In [2]:
url = "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"

In [3]:
df = pd.read_csv(url, sep='\t', header=0,
     names=['user', 'check-in_time', 'latitude', 'longitude', 'location id'])

In [4]:
# The output must be a csv with the following data:

# user_id, checkin_id, - , -, latitude, longitude, -, time 

# with time matching the foramt %a %b %d %H:%M:%S %z %Y

# by simply following the aformentioned constraints, we have a csv that can be parsed by DeepMove

In [5]:
df['none-1'] = -1
df['none-2'] = -1
df['none-3'] = -1

In [6]:
cols = ['user', 'location id', 'none-1', 'none-2', 'latitude', 'longitude', 'none-3', 'check-in_time']
df = df[cols]

In [7]:
df['check-in_time'] = pd.to_datetime(df['check-in_time'])

In [8]:
df['check-in_time'] = df['check-in_time'].dt.strftime('%a %b %d %H:%M:%S %z %Y')

In [11]:
df = df.sort_values(by=['user','check-in_time'])

In [15]:
df.to_csv(OUT_PATH + 'gowalla.txt', header=False, index=False, sep='\t')

# Taxi Porto

In [15]:
from datetime import datetime

from csv import reader
from ast import literal_eval

sdada = pd.read_csv('data/original_data/taxi_porto/train.csv')
subset_ids = set(list(sdada.TAXI_ID.unique())[:3])

df_elems = []

with open('data/original_data/taxi_porto/train.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    header = next(csv_reader)
    for row in tqdm(csv_reader):
        measurements = literal_eval(row[8])
        for measument in measurements:
            if len(measument) > 1:
                # remove very far away points 
                if measument[0] < 7.5 and measument[1] > 40 and measument[1] < 42 and int(row[4]) in subset_ids:
                    df_elems.append([row[0],row[4],row[5],measument[0], measument[1]])
                 
porto_df = pd.DataFrame(df_elems)

1710670it [08:37, 3306.09it/s]


In [16]:
porto_df = pd.DataFrame(df_elems)

In [18]:
porto_df.rename(columns={0:'trip_id', 1:'user', 2:'time', 3:'longitude', 4:'latitude'}, inplace=True)

In [19]:
porto_df['time'] = pd.to_datetime(porto_df['time'],unit='s')

In [20]:
porto_tdf = skmob.TrajDataFrame(porto_df, latitude='latitude', longitude='longitude', datetime='time', user_id='trip_id')

In [23]:
porto_tdf = porto_tdf.sort_by_uid_and_datetime()

In [24]:
porto_tdf = compression.compress(porto_tdf)

In [26]:
tile_size = 250
tess_porto = tilers.tiler.get("squared", base_shape='Porto, Portugal', meters=tile_size)

In [28]:
porto_points = gpd.GeoDataFrame(porto_tdf, geometry=gpd.points_from_xy(porto_tdf.lng, porto_tdf.lat))

In [29]:
mapped_porto_tdf = gpd.sjoin(porto_points, tess_porto, how="inner", op='intersects')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  """Entry point for launching an IPython kernel.


In [31]:
mapped_porto_tdf = skmob.TrajDataFrame(mapped_porto_tdf, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')

In [32]:
mapped_porto_tdf = mapped_porto_tdf.sort_by_uid_and_datetime()

In [35]:
class DataGenerator(object):
    
    def __init__(self):
        self.data = {}
        self.venues = {}
        self.words_original = []
        self.words_lens = []
        self.dictionary = dict()
        self.words_dict = None
        self.data_filter = {}
        self.user_filter3 = None
        self.uid_list = {}
        self.vid_list = {'unk': [0, -1]}
        self.vid_list_lookup = {}
        self.vid_lookup = {}
        self.pid_loc_lat = {}
        self.data_neural = {}
        
        self.train_split = 0.7
        self.validation_split = 0.1
    
    def load_trajectories(self, df):
        for i, row in df.iterrows():
            uid = row['user']
            pid = row['tile_ID']
            tim = row['datetime']
            
            if uid not in self.data:
                self.data[uid] = [[pid, tim]]
            else:
                self.data[uid].append([pid, tim])
                
            if pid not in self.venues:
                self.venues[pid] = 1
            else:
                self.venues[pid] += 1
                
                
    def session_generation(self, df):

        uid_3 = [x for x in self.data]
        pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True)
        pid_3 = [x for x in self.venues]
        pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True)
        pid_3 = dict(pid_pic3)
        
        session_len_list = []
        
        for u in pick3:
            
            uid = u[0]
            info = self.data[uid]
            topk = Counter([x[0] for x in info]).most_common()
            topk1 = [x[0] for x in topk if x[1] > 1]
            sessions = {}
            
            sub_df = df[df.user == str(uid)]
            
            for i, row in sub_df.iterrows():
                sid = row['uid']
                poi = row['tile_ID']
                tmd = str(row['datetime'])
                record = [poi, tmd]

                if sid not in sessions:
                    sessions[sid] = [record]
                else:
                    sessions[sid].append(record)
            self.data_filter[uid] = {'sessions_count': len(sessions), 'topk_count': len(topk), 'topk': topk,
                                         'sessions': sessions, 'raw_sessions': sessions}

        self.user_filter3 = [x for x in self.data_filter]
    
    def build_users_locations_dict(self):
        for u in self.user_filter3:
            sessions = self.data_filter[u]['sessions']
            if u not in self.uid_list:
                self.uid_list[u] = [len(self.uid_list), len(sessions)]
            for sid in sessions:
                poi = [p[0] for p in sessions[sid]]
                for p in poi:
                    if p not in self.vid_list:
                        self.vid_list_lookup[len(self.vid_list)] = p
                        self.vid_list[p] = [len(self.vid_list), 1]
                    else:
                        self.vid_list[p][1] += 1
                        
     # support for radius of gyration
    def load_venues(self,df,tess):
        for i, row in df.iterrows():
            
            pid = row['tile_ID']
            
            if pid not in self.pid_loc_lat:
            
                lat = tess[tess.tile_ID == str(pid)]['geometry'].centroid.values[0].x
                lon = tess[tess.tile_ID == str(pid)]['geometry'].centroid.values[0].y
                try:
                    self.pid_loc_lat[pid] = [float(lon), float(lat)]
                except Exception as e:
                    print('error:{}'.format(e))
                    print(lon)
                    print(lat)

    def venues_lookup(self):
        for vid in self.vid_list_lookup:
            pid = self.vid_list_lookup[vid]
            lon_lat = self.pid_loc_lat[pid]
            self.vid_lookup[vid] = lon_lat
          
    @staticmethod    
    def tid_list_48(tmd):
        tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S")
        if tm.tm_wday in [0, 1, 2, 3, 4]:
            tid = tm.tm_hour
        else:
            tid = tm.tm_hour + 24
        return tid
        
    def prepare_neural_data(self):
        for u in self.uid_list:
            sessions = self.data_filter[u]['sessions']
            sessions_tran = {}
            sessions_id = []
            for sid in sessions:
                sessions_tran[sid] = [[self.vid_list[p[0]][0], self.tid_list_48(p[1])] for p in
                                      sessions[sid]]
                sessions_id.append(sid)
                
            split_id = int(np.floor(self.train_split * len(sessions_id)))
            split_validation = int(np.floor(self.validation_split * len(sessions_id)))
            
            if split_validation == 0:
                split_validation = 1
            
            split_validation = split_id + split_validation
                
            train_id = sessions_id[:split_id]
            validation_id = sessions_id[split_id : split_validation]
            test_id = sessions_id[split_validation:]
            
            pred_len = sum([len(sessions_tran[i]) - 1 for i in train_id])
            valid_len = sum([len(sessions_tran[i]) - 1 for i in test_id])
            train_loc = {}
            for i in train_id:
                for sess in sessions_tran[i]:
                    if sess[0] in train_loc:
                        train_loc[sess[0]] += 1
                    else:
                        train_loc[sess[0]] = 1
            
            self.data_neural[self.uid_list[u][0]] = {'sessions': sessions_tran, 'train': train_id, 'test': test_id,
                                                     'pred_len': pred_len, 'valid_len': valid_len,
                                                     'train_loc': train_loc, 'validation': validation_id}
            
    def save_variables(self):
        foursquare_dataset = {'data_neural': self.data_neural, 'vid_list': self.vid_list, 'uid_list': self.uid_list, 'data_filter': self.data_filter,
                              'vid_lookup': self.vid_lookup}
        pickle.dump(foursquare_dataset, open('data/taxi_porto_new_gen.pk', 'wb'))



In [36]:
a = DataGenerator()

In [37]:
a.load_trajectories(mapped_porto_tdf)

In [38]:
a.session_generation(mapped_porto_tdf)

In [39]:
a.build_users_locations_dict()

In [40]:
a.load_venues(mapped_porto_tdf, tess_porto)






In [41]:
a.venues_lookup()


In [42]:
a.prepare_neural_data()


In [43]:
a.save_variables()


# Taxi SF

In [44]:
taxi_ids = dict()
last_id = 0

with open('data/original_data/taxi_sf/_cabs.txt', 'r') as f:
    ids = f.readlines()
    for idx in ids:
        if idx.split('"')[1] not in taxi_ids:
            taxi_ids[idx.split('"')[1]] = last_id
            last_id += 1 

In [45]:
df_list = []

current_traj_id = 0

for filename in tqdm(glob('data/original_data/taxi_sf/*.txt')):
    
    pred = 0
    
    if filename == 'data/original_data/taxi_sf/_cabs.txt':
        continue
    
    df = pd.read_csv(filename, header=None, sep='\s', index_col = None)
    df[3] = pd.to_datetime(df[3],unit='s')
    
    fname = taxi_ids[(filename.split('/')[-1]).split('.')[0][4:]]
    df['uid'] = fname
    
    df['traj_id'] = 'na'
    
    for i, row in df.iterrows():
        if row[2] == 1 and pred == 0:
            current_traj_id += 1 
            df.loc[i, 'traj_id'] = current_traj_id
            pred = 1
        elif row[2] == 1 and pred == 1:
            df.loc[i, 'traj_id'] = current_traj_id
        elif row[2] == 0 and pred == 1:
            pred = 0
    
    df_list.append(df)

  if sys.path[0] == '':
100%|██████████| 537/537 [46:50<00:00,  5.23s/it]


In [46]:
df = pd.concat(df_list, axis=0, ignore_index=True)

In [47]:
df = df.sort_values(['uid',3])

In [48]:
df.to_csv('data/original_data/sf_with_traj.csv')

In [49]:
df = pd.read_csv('data/original_data/sf_with_traj.csv')

In [50]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,uid,traj_id
0,4193158,37.74891,-122.39757,0,2008-06-04 21:53:33,0,na
1,4193157,37.74861,-122.39748,0,2008-06-04 21:54:33,0,na
2,4193156,37.74866,-122.39749,0,2008-06-04 21:55:33,0,na
3,4193155,37.74872,-122.39752,0,2008-06-04 21:56:33,0,na
4,4193154,37.74882,-122.39756,0,2008-06-04 21:57:33,0,na


In [51]:
df = df[df['2'] == 1]

In [52]:
df.rename(columns={'uid':'user'}, inplace=True)

In [53]:
tdf = skmob.TrajDataFrame(df, latitude='0', longitude='1', datetime='3', user_id='traj_id')

In [54]:
tdf = tdf.sort_by_uid_and_datetime()

In [56]:
tdf = compression.compress(tdf)

In [57]:
tdf.to_csv('data/original_data/sf_with_traj_compressed.csv')

In [58]:
tdf = pd.read_csv('data/original_data/sf_with_traj_compressed.csv')

In [59]:
tdf.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [60]:
tdf = skmob.TrajDataFrame(tdf, latitude='0', longitude='1', datetime='3', user_id='traj_id')

In [62]:
tile_size = 200

In [63]:
tess_tdf = tilers.tiler.get("squared", base_shape='San Francisco, California, USA', meters=tile_size)

In [65]:
tess_tdf = pd.read_csv('data/original_data/tex_SanFrancisco.csv')

In [66]:
tess_tdf = gpd.GeoDataFrame(tess_tdf)

In [67]:
tdf_points = gpd.GeoDataFrame(tdf, geometry=gpd.points_from_xy(tdf.lng, tdf.lat))

In [70]:
tdf_points.set_geometry(col='geometry', inplace=True)

In [71]:
tess_tdf['geometry'] = tess_tdf['geometry'].apply(wkt.loads)

In [72]:
tess_tdf.set_geometry(col='geometry', inplace=True)

In [73]:
sf_df = gpd.sjoin(tdf_points, tess_tdf, how="inner", op='intersects')

In [75]:
sf_tdf = skmob.TrajDataFrame(sf_df, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')

In [76]:
sf_tdf = sf_tdf.sort_by_uid_and_datetime()

In [77]:
sf_tdf.to_csv('data/original_data/taxi_sf_mapped.csv')

In [78]:
sf_tdf = pd.read_csv('data/original_data/taxi_sf_mapped.csv')

In [79]:
sf_tdf.drop(columns=['Unnamed: 0'], inplace=True)

In [81]:
user_cutted = list(sf_tdf.user.unique())[:15]

In [82]:
sf_tdf = sf_tdf[sf_tdf.user.isin(user_cutted)]

In [83]:
class DataGenerator_SF(object):
    
    def __init__(self):
        self.data = {}
        self.venues = {}
        self.words_original = []
        self.words_lens = []
        self.dictionary = dict()
        self.words_dict = None
        self.data_filter = {}
        self.user_filter3 = None
        self.uid_list = {}
        self.vid_list = {'unk': [0, -1]}
        self.vid_list_lookup = {}
        self.vid_lookup = {}
        self.pid_loc_lat = {}
        self.data_neural = {}
        
        self.train_split = 0.7
        self.validation_split = 0.1
    
    def load_trajectories(self, df):
        for i, row in df.iterrows():
            uid = row['user']
            pid = row['tile_ID']
            tim = row['datetime']
            
            if uid not in self.data:
                self.data[uid] = [[pid, tim]]
            else:
                self.data[uid].append([pid, tim])
                
            if pid not in self.venues:
                self.venues[pid] = 1
            else:
                self.venues[pid] += 1
                
                
    def session_generation(self, df):

        uid_3 = [x for x in self.data]
        pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True)
        pid_3 = [x for x in self.venues]
        pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True)
        pid_3 = dict(pid_pic3)
        
        session_len_list = []
        
        for u in pick3:
            uid = u[0]
            info = self.data[uid]
            topk = Counter([x[0] for x in info]).most_common()
            topk1 = [x[0] for x in topk if x[1] > 1]
            sessions = {}
            
            sub_df = df[df.user == uid]
            
            for i, row in sub_df.iterrows():
                sid = row['uid']
                poi = row['tile_ID']
                tmd = str(row['datetime'])
                record = [poi, tmd]

                if sid not in sessions:
                    sessions[sid] = [record]
                else:
                    sessions[sid].append(record)
            self.data_filter[uid] = {'sessions_count': len(sessions), 'topk_count': len(topk), 'topk': topk,
                                         'sessions': sessions, 'raw_sessions': sessions}

        self.user_filter3 = [x for x in self.data_filter]
    
    def build_users_locations_dict(self):
        for u in self.user_filter3:
            sessions = self.data_filter[u]['sessions']
            if u not in self.uid_list:
                self.uid_list[u] = [len(self.uid_list), len(sessions)]
            for sid in sessions:
                poi = [p[0] for p in sessions[sid]]
                for p in poi:
                    if p not in self.vid_list:
                        self.vid_list_lookup[len(self.vid_list)] = p
                        self.vid_list[p] = [len(self.vid_list), 1]
                    else:
                        self.vid_list[p][1] += 1
                        
     # support for radius of gyration
    def load_venues(self,df,tess):
        for i, row in df.iterrows():
            
            pid = row['tile_ID']
            
            if pid not in self.pid_loc_lat:

                lat = tess[tess.tile_ID == pid]['geometry'].centroid.values[0].x
                lon = tess[tess.tile_ID == pid]['geometry'].centroid.values[0].y
                try:
                    self.pid_loc_lat[pid] = [float(lon), float(lat)]
                except Exception as e:
                    print('error:{}'.format(e))
                    print(lon)
                    print(lat)

    def venues_lookup(self):
        for vid in self.vid_list_lookup:
            pid = self.vid_list_lookup[vid]
            lon_lat = self.pid_loc_lat[pid]
            self.vid_lookup[vid] = lon_lat
          
    @staticmethod    
    def tid_list_48(tmd):
        tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S")
        if tm.tm_wday in [0, 1, 2, 3, 4]:
            tid = tm.tm_hour
        else:
            tid = tm.tm_hour + 24
        return tid
        
    def prepare_neural_data(self):
        for u in self.uid_list:
            sessions = self.data_filter[u]['sessions']
            sessions_tran = {}
            sessions_id = []
            for sid in sessions:
                sessions_tran[sid] = [[self.vid_list[p[0]][0], self.tid_list_48(p[1])] for p in
                                      sessions[sid]]
                sessions_id.append(sid)
                
            split_id = int(np.floor(self.train_split * len(sessions_id)))
            split_validation = int(np.floor(self.validation_split * len(sessions_id)))
            
            if split_validation == 0:
                split_validation = 1
            
            split_validation = split_id + split_validation
                
            train_id = sessions_id[:split_id]
            validation_id = sessions_id[split_id : split_validation]
            test_id = sessions_id[split_validation:]
            
            pred_len = sum([len(sessions_tran[i]) - 1 for i in train_id])
            valid_len = sum([len(sessions_tran[i]) - 1 for i in test_id])
            train_loc = {}
            for i in train_id:
                for sess in sessions_tran[i]:
                    if sess[0] in train_loc:
                        train_loc[sess[0]] += 1
                    else:
                        train_loc[sess[0]] = 1
            
            self.data_neural[self.uid_list[u][0]] = {'sessions': sessions_tran, 'train': train_id, 'test': test_id,
                                                     'pred_len': pred_len, 'valid_len': valid_len,
                                                     'train_loc': train_loc, 'validation': validation_id}
            
    def save_variables(self):
        foursquare_dataset = {'data_neural': self.data_neural, 'vid_list': self.vid_list, 'uid_list': self.uid_list, 'data_filter': self.data_filter,
                              'vid_lookup': self.vid_lookup}
        pickle.dump(foursquare_dataset, open('data/taxi_sf_new_gen.pk', 'wb'))



In [85]:
a = DataGenerator_SF()
a.load_trajectories(sf_tdf)
a.session_generation(sf_tdf)

a.build_users_locations_dict()
a.load_venues(sf_tdf, tess_tdf)
a.venues_lookup()
a.prepare_neural_data()
a.save_variables()
