In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [37]:
def read_raw_weeplace(path):
    raw_data = pd.read_csv(path)
    return raw_data

In [38]:
raw_path = 'data/raw/weeplace_checkins.csv'
raw_data = read_raw_weeplace(raw_path)

In [39]:
raw_data.loc[:,'datetime'] = pd.to_datetime(raw_data.loc[:,'datetime'], format='%Y-%m-%dT%H:%M:%S')

In [40]:
raw_data.loc[:, 'date'] = raw_data.loc[:, 'datetime'].map(lambda x : x.date())
raw_data.loc[:, 'time'] = raw_data.loc[:, 'datetime'].map(lambda x : x.time())

In [41]:
filter_data = raw_data.groupby(['userid', 'date']).filter(lambda x : len(x) > 5)
groups = filter_data.groupby(['userid', 'date'])
filter_data['group_id'] = groups.grouper.label_info

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
filter_data = filter_data.sort_values(by = ['userid', 'date', 'time'])

In [43]:
filter_data.loc[:,'POI'] = filter_data.loc[:,'lat'] + filter_data.loc[:,'lon']

In [44]:
unique_POIs = filter_data['userid'].unique()
POI_dict = {unique_POIs[i]: i for i in range(len(unique_POIs))}
filter_data.loc[:,'userid'] = filter_data.loc[:,'userid'].apply(lambda x: POI_dict[x])

In [45]:
unique_POIs = filter_data['POI'].unique()
POI_dict = {unique_POIs[i]: i for i in range(len(unique_POIs))}

In [46]:
filter_data.loc[:,'POI_ID'] = filter_data.loc[:,'POI'].apply(lambda x: POI_dict[x])

In [47]:
start_i = -1
start_labels = []
for idx, row in filter_data.iterrows():
    if row['group_id'] != start_i:
        start_labels.append(idx)
        start_i += 1 

In [48]:
filter_data['dtime'] = filter_data['datetime'].diff(periods=1).fillna(0).dt.floor('T').dt.total_seconds().div(60).astype(int)

  """Entry point for launching an IPython kernel.


In [49]:
filter_data.loc[start_labels, 'dtime'] = 0
filter_data['tlower'] = (filter_data['dtime'] / 10).astype(int) * 10
filter_data['tupper'] = filter_data['tlower'] + 10

In [50]:
shifted_lats = filter_data['lat'].shift(1).fillna(0)
shifted_lons = filter_data['lon'].shift(1).fillna(0)

In [51]:
filter_data['ddist'] = ((filter_data['lat'] - shifted_lats).apply(lambda x: x*x) + (filter_data['lon'] - shifted_lons).apply(lambda x: x*x)).apply(lambda x: np.sqrt(x))

In [52]:
filter_data['ddist'] = np.ceil(filter_data['ddist'] * 100000 / 100).astype(int)

In [53]:
filter_data['ddist'] = filter_data['ddist'].map(lambda x: 2000 if x > 2000 else x)

In [54]:
filter_data.loc[start_labels, 'ddist'] = 0
filter_data['dlower'] = (filter_data['ddist'] / 10).astype(int) * 10
filter_data['dupper'] = filter_data['dlower'] + 10

In [55]:
dtime_set = set(filter_data['tupper'].unique()).union(set(filter_data['tlower'].unique()))
ddist_set = set(filter_data['dupper'].unique()).union(set(filter_data['dlower'].unique()))
dtime_dict = {value: key for key, value in enumerate(dtime_set)}
ddist_dict = {value: key for key, value in enumerate(ddist_set)}

In [56]:
filter_data.loc[:,'tupper_id'] = filter_data.loc[:,'tupper'].apply(lambda x: dtime_dict[x])
filter_data.loc[:,'tlower_id'] = filter_data.loc[:,'tlower'].apply(lambda x: dtime_dict[x])
filter_data.loc[:,'dupper_id'] = filter_data.loc[:,'dupper'].apply(lambda x: ddist_dict[x])
filter_data.loc[:,'dlower_id'] = filter_data.loc[:,'dlower'].apply(lambda x: ddist_dict[x])

In [57]:
max_lens = 0
user_visits = dict()
for gname, group in filter_data.groupby('userid'):
    if max_lens < group.shape[0]:
        max_lens = group.shape[0]
    user_visits[gname] = group['POI_ID']

In [58]:
all_pois = set(filter_data['POI_ID'].unique())
all_pois_l = list(all_pois)
for col_name in ['neg1', 'neg2']:
    negs = np.random.choice(all_pois_l, [filter_data['userid'].nunique() * 2, max_lens])
    user_negs = []
    i = 0
    for user in filter_data['userid'].unique():
        user_len = user_visits[user].shape[0]
        cand_negs = negs[i, :user_len]
        if len(set(cand_negs).intersection(set(user_visits[user]))) != 0:
            cand_negs = np.random.choice(list(all_pois.difference(set(user_visits[user]))), user_len)
        user_negs.extend(cand_negs)
        i += 1
    filter_data[col_name] = np.array(user_negs)

In [59]:
group_sizes = dict()
for gid, gsize in filter_data.groupby('group_id').grouper.size().iteritems():
    group_sizes[gid] = gsize

In [60]:
filter_data['group_size'] = filter_data['group_id'].map(lambda x: group_sizes[x])

In [61]:
filter_data.to_csv('data/weeplace_filter.csv', index = False)

In [62]:
sized_dataset = filter_data[filter_data['group_size'] == 6]

In [63]:
for gid, group in sized_dataset.groupby('group_id'):
    print(group.loc[:, ['POI_ID', 'dtime', 'tupper', 'tlower', 'tupper_id', 'tlower_id', 'ddist', 'dlower', 'dupper', 'dlower_id', 'dupper_id', 'neg1', 'neg2']].values)
    break

[[     0      0     10      0      3      0      0      0     10      0
       2 180890 373368]
 [     1      7     10      0      3      0      1      0     10      0
       2 157733 511427]
 [     2      5     10      0      3      0      2      0     10      0
       2 221976  11444]
 [     3    460    470    460    135    133      2      0     10      0
       2 408481  77983]
 [     4     89     90     80     27     24      5      0     10      0
       2 132905 580291]
 [     5     37     40     30     12      9     66     60     70     12
      14 401932 500579]]


In [64]:
sized_dataset.groupby('group_id').ngroups

96353

In [65]:
filter_data.columns

Index(['userid', 'placeid', 'datetime', 'lat', 'lon', 'city', 'category',
       'date', 'time', 'group_id', 'POI', 'POI_ID', 'dtime', 'tlower',
       'tupper', 'ddist', 'dlower', 'dupper', 'tupper_id', 'tlower_id',
       'dupper_id', 'dlower_id', 'neg1', 'neg2', 'group_size'],
      dtype='object')

In [66]:
filter_data['group_id'].nunique()

338492

In [67]:
filter_data.shape[0]

3160983

In [68]:
filter_data

Unnamed: 0,userid,placeid,datetime,lat,lon,city,category,date,time,group_id,...,ddist,dlower,dupper,tupper_id,tlower_id,dupper_id,dlower_id,neg1,neg2,group_size
7247182,0,,2010-04-15 05:42:15,35.649232,139.716503,渋谷区,College & Education:Library,2010-04-15,05:42:15,0,...,0,0,10,3,0,2,0,180890,373368,6
7247181,0,,2010-04-15 05:49:36,35.648710,139.715858,渋谷区,Shops:Other - Shopping,2010-04-15,05:49:36,0,...,1,0,10,3,0,2,0,157733,511427,6
7247180,0,1,2010-04-15 05:55:18,35.648079,139.714778,渋谷区,Shops:Other - Shopping,2010-04-15,05:55:18,0,...,2,0,10,3,0,2,0,221976,11444,6
7247179,0,,2010-04-15 13:35:54,35.646449,139.714589,,,2010-04-15,13:35:54,0,...,2,0,10,135,133,2,0,408481,77983,6
7247178,0,jr-1-tokyo,2010-04-15 15:04:57,35.645720,139.710630,Tokyo,Travel:Train Station:Train,2010-04-15,15:04:57,0,...,5,0,10,27,24,2,0,132905,580291,6
7247177,0,,2010-04-15 15:42:07,35.641944,139.645086,,Travel:Bus Station:Bus,2010-04-15,15:42:07,0,...,66,60,70,12,9,14,12,401932,500579,6
7247172,0,,2010-04-17 01:35:09,35.622942,139.750621,品川区,Travel:Train Station,2010-04-17,01:35:09,1,...,0,0,10,3,0,2,0,245571,49130,11
7247171,0,,2010-04-17 02:21:08,35.681096,139.763631,千代田区,Food:Café,2010-04-17,02:21:08,1,...,60,60,70,15,12,14,12,285554,484562,11
7247170,0,,2010-04-17 04:51:13,35.660679,139.729142,,Parks & Outdoors:Scenic Lookout,2010-04-17,04:51:13,1,...,41,40,50,48,45,10,8,518406,387947,11
7247169,0,,2010-04-17 06:42:34,35.659840,139.729203,,Arts & Entertainment:Museum:Art,2010-04-17,06:42:34,1,...,1,0,10,36,33,2,0,366080,206405,11


In [69]:
poi_cat = dict()
for poi, cat  in zip(filter_data['POI_ID'], filter_data['category']):
    poi_cat[poi] = cat

In [70]:
import pickle
pickle.dump(poi_cat, open('data/cat_dict', 'wb' ))