In [2]:
import torch
import numpy as np

In [3]:
# Load the hourly bike-sharing table as float32, skipping the CSV header row.
# Column 1 goes through a converter that keeps only characters 8-9 of the raw
# field and parses them as a number (presumably the day of a YYYY-MM-DD date
# string; confirm against the CSV).
bikes_numpy = np.loadtxt(
    "../data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
    delimiter=",",
    skiprows=1,
    converters={1: lambda field: float(field[8:10])},
    dtype=np.float32,
)

# from_numpy wraps the NumPy buffer instead of copying it.
bikes = torch.from_numpy(bikes_numpy)

# Peek at a single row to sanity-check the parse.
bikes[5]

tensor([6.0000, 1.0000, 1.0000, 0.0000, 1.0000, 5.0000, 0.0000, 6.0000, 0.0000,
        2.0000, 0.2400, 0.2576, 0.7500, 0.0896, 0.0000, 1.0000, 1.0000])

In [4]:
bikes.size()  # (rows, columns) of the loaded table

torch.Size([17520, 17])

We now want to split our data into 24-hour chunks, one per day.

In [5]:
# Shape together with strides: the stride tuple says how many stored elements
# are skipped when stepping by one along each dimension.
(bikes.size(), bikes.stride())

(torch.Size([17520, 17]), (17, 1))

In [8]:
# Regroup the hourly rows into one block per day without copying any data:
#   -1             -> let PyTorch infer the number of days
#   24             -> hours per day
#   bikes.size(1)  -> keep every column of the original table
# view() only rewrites shape/stride metadata, so it is essentially free.
daily_bikes = bikes.view(-1, 24, bikes.size(1))

# Resulting layout is (days, hours, columns).
daily_bikes.size()

torch.Size([730, 24, 17])

In [11]:
# Reorder to N x C x L: days (N), columns/channels (C), hours within the day (L).
# transpose(1, 2) swaps dimensions 1 and 2 (0-based), i.e. hours <-> columns.
# The tensor is rebuilt from `bikes` rather than from the previous
# `daily_bikes`, so re-running this cell cannot flip the layout back to
# (days, hours, columns).
daily_bikes = bikes.view(-1, 24, bikes.shape[1]).transpose(1, 2)

daily_bikes.shape


torch.Size([730, 17, 24])

The “weather situation” variable is ordinal. It has four levels, from 1 for good weather to 4 for really bad weather. We could treat this variable as categorical, with the levels interpreted as labels, or as a continuous variable. If we decide to treat it as categorical, we turn the variable into a one-hot-encoded vector and concatenate the resulting columns with the dataset.

In [14]:
# First 24 hourly rows (day one), cast to int64 so that column values can
# later be used as indices.
first_day = bikes[0:24].to(torch.long)
first_day.size()

torch.Size([24, 17])

In [15]:
# One row per hour of the first day, one column per weather level (4 levels).
weather_onehot = torch.zeros((first_day.size(0), 4))

# Column 9 holds the weather-situation code for each hour.
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [16]:
# unsqueeze(1) inserts a size-one dimension at position 1, turning the 1-D
# vector of weather codes into a single-column 2-D index tensor, which is what
# scatter_ needs to address one column per row.
# The codes start at 1, so subtract 1 to get zero-based column positions.
weather_onehot.scatter_(
    dim=1,
    index=first_day[:, 9].long().unsqueeze(1) - 1,
    value=1.0,
)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [17]:
# Concatenate along dimension 1 (the columns): the original columns of the
# first day followed by the four one-hot weather columns.
torch.cat([bikes[:24], weather_onehot], dim=1)

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 6.0000e+00, 0.0000e+00, 1.0000e+00, 2.4000e-01, 2.8790e-01,
         8.1000e-01, 0.0000e+00, 3.0000e+00, 1.3000e+01, 1.6000e+01, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00,
         0.0000e+00, 6.0000e+00, 0.0000e+00, 1.0000e+00, 2.2000e-01, 2.7270e-01,
         8.0000e-01, 0.0000e+00, 8.0000e+00, 3.2000e+01, 4.0000e+01, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [3.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 2.0000e+00,
         0.0000e+00, 6.0000e+00, 0.0000e+00, 1.0000e+00, 2.2000e-01, 2.7270e-01,
         8.0000e-01, 0.0000e+00, 5.0000e+00, 2.7000e+01, 3.2000e+01, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 3.0000e+00,
         0.0000e+00, 6.0000e+00, 0.0000e+00, 1.0000e

Instead of one-hot encoding, we can also treat the variable as continuous and rescale the column to the range [0, 1]:

In [None]:
# Map the ordinal weather code from {1, 2, 3, 4} onto [0, 1]:
# subtract 1.0 to get 0..3, then divide by 3.
# (The earlier `1/0` raised ZeroDivisionError before any tensor op ran.)
# NOTE: this assigns in place, so re-running the cell rescales the column again.
daily_bikes[:, 9, :] = (daily_bikes[:, 9, :] - 1.0) / 3.0

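We can also rescale other columns, such as column 10 (`temp`). Two common options are min-max scaling to [0, 1] and standardizing to zero mean and unit standard deviation: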

In [None]:
# NOTE(review): both rescalings below are applied to the same column one after
# the other, so the values left in daily_bikes are the standardized version of
# the min-max-scaled data. Keep only one step if they are meant as alternatives.
temp = daily_bikes[:, 10, :]

# 1) Min-max scaling: maps the column onto [0, 1].
temp_min = temp.min()
temp_max = temp.max()
daily_bikes[:, 10, :] = (temp - temp_min) / (temp_max - temp_min)

# 2) Standardization: zero mean and unit standard deviation.
temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = (temp - temp.mean()) / temp.std()

