In [2]:
import os

import numpy as np
import pandas as pd

import torch
from torch import nn

In [3]:
# Location of the NYC Foursquare check-in CSV on the author's machine.
NYC_CSV_PATH = '/home/hatcher/test/deep-learning/foursquare-checkin/FS_NYC.csv'

# Load the check-in log and preview its first rows.
nyc_data = pd.read_csv(NYC_CSV_PATH)
nyc_data.head()

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [4]:
# Number every distinct venueId by order of first appearance in the log.
unique_venue_ids = nyc_data['venueId'].drop_duplicates()
venue_id2index = {
    venue_id: position for position, venue_id in enumerate(unique_venue_ids)
}

# Attach the integer index of each check-in's venue as a new column.
nyc_data['venueIndex'] = nyc_data['venueId'].map(venue_id2index)
nyc_data.head()

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,venueIndex
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,0
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,1
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,2
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,3
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,4


In [None]:
import torch.utils.data as data
# Define the dataset
class my_Dataset(data.Dataset):
    """Dataset that pairs each feature entry with the label at the same index."""

    def __init__(self, features, labels):
        # Both containers are stored exactly as passed in (no copy, no conversion).
        self.X, self.y = features, labels

    def __len__(self):
        # The number of samples is the size of the features' leading dimension.
        sample_count = self.X.shape[0]
        return sample_count

    def __getitem__(self, index):
        # Return the (features, label) pair found at ``index``.
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [None]:
class FourSquareDataset:
    """Foursquare check-in log prepared for next-venue sequence prediction.

    Attributes:
        raw_data: the CSV as loaded, plus an integer ``venueIndex`` column that
            numbers each distinct ``venueId`` by order of first appearance.
        min, max: per-column minimum / maximum of the numeric columns of
            ``raw_data`` (pandas Series indexed by column name).
        data: copy of ``raw_data`` whose numeric columns are min-max scaled;
            non-numeric columns (ids, category names, timestamps) are unchanged.
        target: the ``target`` argument of the latest ``construct_set`` call
            (only set once ``construct_set`` has been called).

    NOTE(review): ``venueIndex`` is scaled together with the other numeric
    columns, so the windows produced by ``construct_set`` hold scaled values;
    use ``denormalize(x, 'venueIndex')`` to map them back to indices. Confirm
    this is what the downstream model expects.
    """

    # strptime pattern of the ``utcTimestamp`` column,
    # e.g. "Tue Apr 03 18:00:09 +0000 2012".
    TIMESTAMP_FORMAT = '%a %b %d %H:%M:%S %z %Y'

    def __init__(self, data_path):
        """Load the check-in CSV and min-max scale its numeric columns.

        Args:
            data_path: path or file-like object handed to ``pandas.read_csv``;
                the file must contain a ``venueId`` column.
        """
        self.raw_data = pd.read_csv(data_path)
        venue_id2index = {
            venue_id: index
            for index, venue_id in enumerate(self.raw_data['venueId'].drop_duplicates())
        }
        self.raw_data['venueIndex'] = self.raw_data['venueId'].map(venue_id2index)

        # Only numeric columns can be scaled: subtracting the string columns
        # (venueId, venueCategory, utcTimestamp, ...) raises a TypeError.
        numeric = self.raw_data.select_dtypes(include='number')
        self.min = numeric.min()
        self.max = numeric.max()
        # A constant column has a zero span; divide by 1 instead so it maps to
        # 0 rather than NaN.
        span = (self.max - self.min).replace(0, 1)

        self.data = self.raw_data.copy()
        self.data[numeric.columns] = (numeric - self.min) / span

    def denormalize(self, x, feat):
        """Map min-max scaled values of one feature back to its original scale.

        Args:
            x: scaled value(s); anything supporting ``*`` and ``+`` with a
                Python float.
            feat: name of the numeric column whose min / max are applied.

        Returns:
            ``x * (max - min) + min`` computed with the statistics of ``feat``.

        Raises:
            KeyError: if ``feat`` is not one of the scaled numeric columns.
        """
        # Plain floats (instead of numpy scalars) keep the arithmetic on the
        # type of ``x``.
        low = float(self.min[feat])
        high = float(self.max[feat])
        return x * (high - low) + low

    @staticmethod
    def _sliding_windows(sequence, window_size):
        """Return (inputs, labels): each run of ``window_size`` items and the item after it."""
        count = len(sequence) - window_size  # negative -> no window at all
        inputs = [sequence[i:i + window_size] for i in range(count)]
        labels = [sequence[i + window_size] for i in range(count)]
        return inputs, labels

    def construct_set(self, train_por, test_por,target,window_size=12,label=0):
        """Build train / validation / test sets of per-user venue windows.

        Every user's check-ins are ordered chronologically and the resulting
        ``venueIndex`` trajectory is cut into three consecutive segments:
        the first ``train_por`` share is used for training, the following
        ``test_por`` share for validation and the remainder for testing.
        Within each segment, every run of ``window_size`` visits is a sample
        and the visit right after it is the label. Segments no longer than
        ``window_size`` contribute no sample.

        Args:
            train_por: share (0..1) of each trajectory used for training.
            test_por: share (0..1) of each trajectory used for validation.
            target: stored on ``self.target``; not used to build the windows.
            window_size: number of consecutive visits per input sample.
            label: accepted for interface compatibility; unused.

        Returns:
            ``(train_set, val_set, test_set)`` as ``my_Dataset`` instances
            wrapping tensors created with ``torch.Tensor``.

        Raises:
            ValueError: if ``window_size`` < 1 or a portion is outside [0, 1].
        """
        if window_size < 1:
            raise ValueError(f'window_size must be >= 1, got {window_size}')
        if not (0 <= train_por <= 1 and 0 <= test_por <= 1):
            raise ValueError('train_por and test_por must both lie in [0, 1]')

        train_x = []
        train_y = []
        val_x = []
        val_y = []
        test_x = []
        test_y = []
        self.target = target

        # Sort on parsed timestamps: the raw strings start with the weekday
        # name, so sorting them as text does not give chronological order.
        timestamps = pd.to_datetime(
            self.data['utcTimestamp'], format=self.TIMESTAMP_FORMAT, utc=True)
        # A stable sort keeps file order for check-ins with equal timestamps.
        ordered = self.data.assign(utcTimestamp=timestamps).sort_values(
            'utcTimestamp', kind='mergesort')

        # groupby yields one group per userId holding all of that user's
        # check-ins; rows keep their (chronological) order inside each group.
        for _, group in ordered.groupby('userId'):
            user_trajectory = group['venueIndex'].tolist()
            train_end = int(len(user_trajectory) * train_por)
            val_end = int(len(user_trajectory) * (train_por + test_por))
            segments = (
                (user_trajectory[:train_end], train_x, train_y),
                (user_trajectory[train_end:val_end], val_x, val_y),
                (user_trajectory[val_end:], test_x, test_y),
            )
            for segment, inputs, labels in segments:
                seg_inputs, seg_labels = self._sliding_windows(segment, window_size)
                inputs.extend(seg_inputs)
                labels.extend(seg_labels)

        train_set = my_Dataset(torch.Tensor(train_x), torch.Tensor(train_y))
        val_set = my_Dataset(torch.Tensor(val_x), torch.Tensor(val_y))
        test_set = my_Dataset(torch.Tensor(test_x), torch.Tensor(test_y))
        return train_set, val_set, test_set