In [1]:
!pwd

/Users/liuzhichao/Documents/GitHub/Baidu_BigData_Competition/code


In [2]:
import os

os.chdir("/Users/liuzhichao/Documents/GitHub/Baidu_BigData_Competition/")

In [3]:
!pwd

/Users/liuzhichao/Documents/GitHub/Baidu_BigData_Competition


In [5]:
with open("data/data_processed/region_names.txt", "r") as f:
    for line in f:
        region_names = line.strip().split()

In [7]:
len(region_names)

392

In [8]:
import os
import sys
import numpy as np
import pandas as pd
import argparse

# train, validation, test split
def data_split(dataset, args):
    indices = np.arange(0, len(dataset))
    # no validation set
    if args.val_num <= 0:
        # leave-one out test set
        train_num = len(dataset) - 1
        train_indices = indices[:train_num]
        test_indices = indices[-1:]
        return Subset(dataset, train_indices), None, Subset(dataset, test_indices)
    else:
        train_num = len(dataset) - args.val_num - 1

        train_indices = indices[:train_num]
        valid_indices = indices[train_num:train_num + args.val_num]
        test_indices = indices[-1:]
        return Subset(dataset, train_indices), \
                Subset(dataset, valid_indices), Subset(dataset, test_indices)


class BaseDataset(object):
    """BaseDataset"""

    def __init__(self):
        pass

    def __getitem__(self, idx):
        """getitem"""
        raise NotImplementedError

    def __len__(self):
        """len"""
        raise NotImplementedError


class Subset(BaseDataset):
    """
    Subset of a dataset at specified indices.
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        """getitem"""
        return self.dataset[self.indices[idx]]

    def __len__(self):
        """len"""
        return len(self.indices)


class BaseDataset(object):
    """BaseDataset"""

    def __init__(self):
        pass

    def __getitem__(self, idx):
        """getitem"""
        raise NotImplementedError

    def __len__(self):
        """len"""
        raise NotImplementedError


class Subset(BaseDataset):
    """
    Subset of a dataset at specified indices.
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        """getitem"""
        return self.dataset[self.indices[idx]]

    def __len__(self):
        """len"""
        return len(self.indices)


class InfectDataset(BaseDataset):
    def __init__(self, args):
        self.args = args
        self.input_file = self.args.input_file
        self.label_file = self.args.label_file
        self.region_names_file = self.args.region_names_file

        self.city_num = self.args.city_num
        self.feat_dim = self.args.feat_dim
        self.n_pred = self.args.n_pred
        self.n_his = self.args.n_his

        self.data = self.process()

    def process(self):
        X = pd.read_csv(self.input_file)
        X = X.fillna(0.0)
        Y = pd.read_csv(self.label_file)

        with open(self.region_names_file, 'r') as f:
            for line in f:
                region_names = line.strip().split()

        # scaling (why scaling here?)
        SCALE = 1000
        for name in region_names:
            X[name] = X[[name]].apply(lambda x: x / SCALE)
            Y[name] = Y[[name]].apply(lambda x: x / SCALE)

        print("region migration: ", X.head())
        print("infect: ", Y.head())

        X = X.drop(columns=['date'])
        Y = Y.drop(columns=['date'])
        # total number of training examples
        date_num = len(Y)
        # maybe for future use
        train_num = date_num - self.n_pred

        df = pd.DataFrame(columns=X.columns)
        # (?, n_his, city_num, node_feat_dim)
        for i in range(date_num - self.n_his - self.n_pred + 1):
            df = df.append(X[i:(i + self.n_his)])
            df = df.append(Y[(i + self.n_his):(i + self.n_his + self.n_pred)])

        # for testing
        df = df.append(X[-self.n_his:])
        df = df.append(Y[-self.n_pred:])  # unused, for padding

        data = df.values.reshape(-1, self.n_his + self.n_pred, self.city_num, 1)

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if isinstance(idx, int):
            return np.expand_dims(self.data[idx], axis=0)
        else:
            return self.data[idx]
        


In [14]:
parser = argparse.ArgumentParser()
parser.add_argument('--city_num', type=int, default=5)
parser.add_argument('--feat_dim', type=int, default=1)
parser.add_argument('--n_his', type=int, default=10)
parser.add_argument('--n_pred', type=int, default=10)
parser.add_argument('--batch_size', type=int, default=10)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--save', type=int, default=10)
parser.add_argument('--Ks', type=int, default=3)  #equal to num_layers
parser.add_argument('--Kt', type=int, default=3)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--keep_prob', type=float, default=1.0)
parser.add_argument('--opt', type=str, default='ADAM')
parser.add_argument('--inf_mode', type=str, default='sep')
parser.add_argument('--input_file', type=str, default='dataset/data_processed/migration.csv')
parser.add_argument('--label_file', type=str, default='dataset/data_processed/infection.csv')
parser.add_argument('--adj_mat_file', type=str, default='dataset/data_processed/adj_matrix.npy')
parser.add_argument('--output_path', type=str, default='./outputs/')
parser.add_argument('--val_num', type=str, default=0)
parser.add_argument('--test_num', type=str, default=1)
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--train_all', action='store_true')

_StoreTrueAction(option_strings=['--train_all'], dest='train_all', nargs=0, const=True, default=False, type=None, choices=None, help=None, metavar=None)

In [13]:
args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--city_num CITY_NUM] [--feat_dim FEAT_DIM]
                             [--n_his N_HIS] [--n_pred N_PRED]
                             [--batch_size BATCH_SIZE] [--epochs EPOCHS]
                             [--save SAVE] [--Ks KS] [--Kt KT] [--lr LR]
                             [--keep_prob KEEP_PROB] [--opt OPT]
                             [--inf_mode INF_MODE] [--input_file INPUT_FILE]
                             [--label_file LABEL_FILE]
                             [--adj_mat_file ADJ_MAT_FILE]
                             [--output_path OUTPUT_PATH] [--val_num VAL_NUM]
                             [--test_num TEST_NUM] [--use_cuda] [--train_all]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/liuzhichao/Library/Jupyter/runtime/kernel-7482beaa-96ab-44b5-98d5-999fcc5d98a3.json


SystemExit: 2