In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn

device = torch.device("mps")

In [2]:
class NegativeWeightsConstraint:
    def __init__(self, start_index, end_index):
        self.start_index = start_index
        self.end_index = end_index

    def __call__(self, module):
        if hasattr(module, 'weight'):
            weight = module.weight
            weight_data = weight.data
            weight_data[0][0][0][self.start_index:self.end_index] = torch.clamp(weight_data[0][0][0][self.start_index:self.end_index], max=0.0)

    def backward_hook(self, grad):
        grad[..., self.start_index:self.end_index] = torch.where(
            grad[..., self.start_index:self.end_index] > 0,
            grad[..., self.start_index:self.end_index],
            torch.zeros_like(grad[..., self.start_index:self.end_index])            
        )
        return grad
    
class ZeroWeightsConstraint:
    def __init__(self, indices=[]):
        self.indices = indices

    def __call__(self, module):
        if hasattr(module, 'weight'):
            weight = module.weight
            weight_data = weight.data
            weight_data[0][0][0][self.indices] = 0.0
    
    def backward_hook(self, grad):
        grad[..., self.indices] = 0.0
        return grad

In [3]:
class CNN1(nn.Module):
    def __init__(self, num_features):
        super(CNN1, self).__init__()
        self.num_features = num_features
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(1, self.num_features), padding=0, bias=False)
        self.conv1.apply(NegativeWeightsConstraint(start_index=0, end_index=3))
        self.conv1.apply(ZeroWeightsConstraint(indices=[3,5,10,14,16,19,23,26,28]))

    def forward(self, x, debug=False):
        batch_size, num_choices, num_features = x.size()
        if debug:
            print("x.shape: ", x.shape)
        x = x.view(batch_size, 1, num_choices, num_features)
        if debug:
            print("After reshape: ", x.shape) 
        x = self.conv1(x)
        if debug:
            print("After conv1: ", x.shape) 
        x = torch.flatten(x, start_dim=1)
        if debug:
            print("After flatten: ", x.shape)
        return x
num_features = 30
model = CNN1(num_features).to(device)
X = torch.rand(32, 2, num_features).to(device)
model(X, debug=True).shape    

x.shape:  torch.Size([32, 2, 30])
After reshape:  torch.Size([32, 1, 2, 30])
After conv1:  torch.Size([32, 1, 2, 1])
After flatten:  torch.Size([32, 2])


torch.Size([32, 2])

In [13]:
class CNN2(nn.Module):
    def __init__(self, num_features):
        super(CNN2, self).__init__()
        self.num_features = num_features
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(1, self.num_features*(self.num_features+1)//2+self.num_features), padding=0, bias=False)
        self.conv1.apply(NegativeWeightsConstraint(start_index=0, end_index=3))
        self.conv1.apply(ZeroWeightsConstraint(indices=[3,5,10,14,16,19,23,26,28]))

    def forward(self, x, debug=False):
        batch_size, num_choices, num_features = x.size()
        x = x.unsqueeze(-1)
        y = torch.matmul(x,x.transpose(-1,-2))
        upper_triangular_indices = torch.triu_indices(num_features, num_features, offset=0)
        upper_triangle = y[..., upper_triangular_indices[0], upper_triangular_indices[1]]
        upper_triangle = upper_triangle.unsqueeze(-1)
        y = torch.cat((x, upper_triangle), dim=-2)

        y = y.view(batch_size, 1, num_choices, num_features*(num_features+1)//2+num_features)
        if debug:
            print("After reshape: ", y.shape) 
        y = self.conv1(y)
        if debug:
            print("After conv1: ", y.shape) 
        y = torch.flatten(y, start_dim=1)
        if debug:
            print("After flatten: ", y.shape)
        return y
num_features = 30
model = CNN2(num_features).to(device)
X = torch.rand(32, 2, num_features).to(device)
model(X, debug=True).shape    

After reshape:  torch.Size([32, 1, 2, 495])
After conv1:  torch.Size([32, 1, 2, 1])
After flatten:  torch.Size([32, 2])


torch.Size([32, 2])

In [24]:
class TransformerEncoder(nn.Module):
    def __init__(self, num_features, d_model, nhead, num_encoder_layers, dim_feedforward, dropout):
        super(TransformerEncoder, self).__init__()
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout), num_encoder_layers)
        self.fc = nn.Linear(d_model, dim_feedforward)
    
    def forward(self, y, debug=False):
        y = self.transformer_encoder(y)
        if debug:
            print("After transformer_encoder: ", y.shape)
        y = self.fc(y)
        if debug:
            print("After fc: ", y.shape)
        return y


class Transformer(nn.Module):
    def __init__(self, num_features, hidden_size, d_model, conv_kernel, nhead, num_encoder_layers, dim_feedforward, dropout):
        super(Transformer, self).__init__()
        self.num_features = num_features
        self.num_features2 = num_features*(num_features+1)//2+num_features
        self.hidden_size = hidden_size
        self.d_model = d_model
        self.conv_kernel = conv_kernel
        self.nhead = nhead
        self.num_encoder_layers = num_encoder_layers
        self.dim_feedforward = dim_feedforward
        self.Transformer_encoder = TransformerEncoder(num_features, d_model, nhead, num_encoder_layers, dim_feedforward, dropout)
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(1, self.conv_kernel), padding=0, bias=False)
        self.avgpool = nn.AdaptiveAvgPool1d(self.hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.conv1.apply(NegativeWeightsConstraint(start_index=0, end_index=3))

    def forward(self, x, debug=False):
        batch_size, num_choices, num_features = x.size()
        x = x.unsqueeze(-1)
        y = torch.matmul(x,x.transpose(-1,-2))
        upper_triangular_indices = torch.triu_indices(num_features, num_features, offset=0)
        upper_triangle = y[..., upper_triangular_indices[0], upper_triangular_indices[1]]
        upper_triangle = upper_triangle.unsqueeze(-1)
        if debug:
            print("After upper_triangle: ", upper_triangle.shape)
        y = torch.cat((x, upper_triangle), dim=-2)
        if debug:
            print("After concat: ", y.shape)
        y = self.dropout(y).squeeze(-1)
        # y = self.avgpool(y)
        # if debug:
        #     print("After avgpool: ", y.shape)
        y = y.view(batch_size * num_choices, self.num_features2)
        if debug:
            print("After reshape: ", y.shape)
        y = F.relu(y, inplace=True)

        x = x.view(batch_size * num_choices, self.num_features)[:,:3]
        if debug:
            print("x: ", x.shape)
        # y = torch.cat((x, y), dim=-1)
        
        if debug:
            print("After concat: ", y.shape) 
        y = self.Transformer_encoder(y, debug=debug)
        if debug:
            print("After Transformer_encoder: ", y.shape)   
        y = torch.cat((x, y), dim=-1)
        y = y.view(batch_size, 1, num_choices, self.conv_kernel)
        y = self.conv1(y)
        if debug:
            print("After conv1: ", y.shape) 
        y = torch.flatten(y, start_dim=1)
        if debug:
            print("After flatten: ", y.shape)
        return y
num_features = 30
num_features2 = num_features*(num_features+1)//2+num_features
hidden_size = num_features2
d_model = num_features2
conv_kernel = num_features2+3
nhead = 5
num_encoder_layers = 4
dim_feedforward = num_features2
dropout = 0.2
model = Transformer(num_features, hidden_size, d_model, conv_kernel, nhead, num_encoder_layers, dim_feedforward, dropout).to(device)
X = torch.rand(32, 2, num_features).to(device)
model(X, debug=True).shape    

After upper_triangle:  torch.Size([32, 2, 465, 1])
After concat:  torch.Size([32, 2, 495, 1])
After reshape:  torch.Size([64, 495])
x:  torch.Size([64, 3])
After concat:  torch.Size([64, 495])
After transformer_encoder:  torch.Size([64, 495])
After fc:  torch.Size([64, 495])
After Transformer_encoder:  torch.Size([64, 495])
After conv1:  torch.Size([32, 1, 2, 1])
After flatten:  torch.Size([32, 2])


torch.Size([32, 2])

In [4]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Subset, SequentialSampler
from torch.utils.data import ConcatDataset
import pickle
import numpy as np
import numpy as np
import torch

def PrepareData(data_path, data_file, num_route): 
    data = pickle.load(open(data_path + "/" + data_file, "rb"))

    sampler = []
    ds = []

    route_properties = {3: torch.empty((0, 3, 30)), 2: torch.empty((0, 2, 30))}
    route_choices = {3: torch.empty(0), 2: torch.empty(0)}

    for i in range(data["car_av"].shape[0]):
        if data["car_av"][i] == 1:
            # 3 routes possible
            arr_tensor_x = torch.from_numpy(data["x"][i]).unsqueeze(0)
            arr_tensor_z = torch.stack([torch.from_numpy(data['z'][i])]*3, dim=0).unsqueeze(0)
            arr_tensor = torch.cat((arr_tensor_x, arr_tensor_z), dim=2)

            # print("Arr.shape", arr_tensor.shape, route_properties[3].shape)
            route_properties[3] = torch.cat((route_properties[3], arr_tensor), dim=0)
            route_choices[3] = torch.cat((route_choices[3], torch.tensor([data["y"][i]-1])), dim=0)
        else:
            # 2 routes possible
            arr_tensor_x = torch.from_numpy(data["x"][i][:-1]).unsqueeze(0)
            arr_tensor_z = torch.stack([torch.from_numpy(data['z'][i])]*2, dim=0).unsqueeze(0)
            # print("Arr.shape", arr_tensor_x.shape, arr_tensor_z.shape)
            arr_tensor = torch.cat((arr_tensor_x, arr_tensor_z), dim=2)
            
            # print("Arr.shape", arr_tensor.shape, route_properties[2].shape)
            route_properties[2] = torch.cat((route_properties[2], arr_tensor), dim=0)
            route_choices[2] = torch.cat((route_choices[2], torch.tensor([data["y"][i]-1])), dim=0)
    
    temp_ds = CustomDataset(route_properties[num_route], route_choices[num_route])
    sampler.append(SequentialSampler(Subset(temp_ds, np.arange(len(temp_ds)))))
    ds.append(temp_ds)
    
    return ds, sampler

class CustomDataset(Dataset):
    def __init__(self, routes, labels):
        self.routes = routes
        self.choice = labels

    def __len__(self):
        return len(self.choice)

    def __getitem__(self, idx):
        choice = self.choice[idx]
        routes = self.routes[idx]
        sample = {"route": routes, "choice": choice}
        return sample
        

In [5]:
data_path = "/Users/jeremy/TasteNet-MNL/swissmetro/data/"
data_train = "train_jeremy.pkl"
data_dev = "dev_jeremy.pkl"
data_test = "test_jeremy.pkl"
# data_all = "swissmetro_all_jeremy.pkl"
from torch import nn, optim
lr = 1e-5
# optimizer = optim.SGD(model.parameters(), lr=lr)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
train_datasets = []
dev_datasets = []
test_datasets = []

for k in range(2, 4):
    ds_train, sampler_train = PrepareData(data_path, data_train, k)
    ds_dev, sampler_dev = PrepareData(data_path, data_dev, k)
    ds_test, sampler_test = PrepareData(data_path, data_test, k)
    # ds_all, sampler_all = PrepareData(data_path, data_all, k)
    train_datasets.append(ds_train)
    dev_datasets.append(ds_dev)
    test_datasets.append(ds_test)


## LEGEND
0 TT
1 HE
2 CO
3 MALE_0
4 MALE_1
5 AGE_0
6 AGE_1
7 AGE_2
8 AGE_3
9 AGE_4
10 INCOME_0
11 INCOME_1
12 INCOME_2
13 INCOME_3
14 FIRST_0
15 FIRST_1
16 WHO_0
17 WHO_1
18 WHO_2
19 PURPOSE_0
20 PURPOSE_1
21 PURPOSE_2
22 PURPOSE_3
23 LUGGAGE_0
24 LUGGAGE_1
25 LUGGAGE_2
26 GA_0
27 GA_1
28 SM_SEATS_0
29 SM_SEATS_1

In [17]:
def run_train():
    batch_size = 256
    NUM_EPOCHS = 10
    for epoch in range(NUM_EPOCHS):
        train_correct = 0
        train_total_seen = 0
        train_total_loss = 0
        dev_correct = 0
        dev_total_seen = 0
        dev_total_loss = 0
        for k in range(2, 4):
            train_loader = DataLoader(ConcatDataset(train_datasets[k-2]), batch_size=batch_size)
            for i, data in enumerate(train_loader):
                X = data["route"].float().to(device)
                y = data["choice"].float().to(device)

                # Begin training
                # zero the parameter gradients
                model.train()
                optimizer.zero_grad()
                y_pred = model(X, debug=False).to(device)
                loss = criterion(y_pred, y.long())
                loss.backward()
                optimizer.step()
                correct_batch = (torch.argmax(y_pred, dim=1) == y).float().sum().item()
                train_correct += correct_batch
                train_total_seen += len(y_pred)
                train_total_loss += loss.item()
            # model.eval()
            # dev_loader = DataLoader(ConcatDataset(dev_datasets[k-2]), batch_size=batch_size)
            # for i, data in enumerate(dev_loader):
            #     X = data["route"].float().to(device)
            #     y = data["choice"].float().to(device)
            #     y_pred = model(X, debug=False).to(device)
            #     correct_batch = (torch.argmax(y_pred, dim=1) == y).float().sum().item()
            #     dev_correct += correct_batch
            #     dev_total_seen += len(y_pred)
            #     dev_total_loss += loss.item()
        # print(f"Epoch {epoch+1}/{NUM_EPOCHS} Train Batch correct: ", correct_batch, len(y_pred))
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} Train Total Loss: ", train_total_loss, "Average Loss: ", train_total_loss/train_total_seen)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} Train Accuracy: ", 100*train_correct/train_total_seen,"%")
        # print(f"Epoch {epoch+1}/{NUM_EPOCHS} Dev Batch correct: ", correct_batch, len(y_pred))
        # print(f"Epoch {epoch+1}/{NUM_EPOCHS} Dev Total Loss: ", dev_total_loss, "Average Loss: ", dev_total_loss/dev_total_seen)
        # print(f"Epoch {epoch+1}/{NUM_EPOCHS} Dev Accuracy: ", 100*dev_correct/dev_total_seen,"%")

    # Test evaluation
    test_correct = 0
    test_total_seen = 0
    test_total_loss = 0
    for k in range(2, 4):
        test_loader = DataLoader(ConcatDataset(test_datasets[k-2]), batch_size=batch_size)
        for i, data in enumerate(test_loader):
            X = data["route"].float().to(device)
            y = data["choice"].float().to(device)
            model.eval()
            y_pred = model(X, debug=False).to(device)
            correct_batch = (torch.argmax(y_pred, dim=1) == y).float().sum().item()
            test_correct += correct_batch
            test_total_seen += len(y_pred)
            test_total_loss += loss.item()
    # print(f"Test Batch correct: ", correct_batch, len(y_pred))
    print(f"Test Total Loss: ", test_total_loss, "Average Loss: ", test_total_loss/test_total_seen)
    print(f"Test Accuracy: ", 100*test_correct/test_total_seen,"%")
    print(model)
    print(model.conv1.weight.data)

In [18]:
model = CNN1(num_features).to(device)
run_train()

Epoch 1/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 1/10 Train Accuracy:  36.718332442544096 %
Epoch 2/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 2/10 Train Accuracy:  36.718332442544096 %
Epoch 3/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 3/10 Train Accuracy:  36.718332442544096 %
Epoch 4/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 4/10 Train Accuracy:  36.718332442544096 %
Epoch 5/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 5/10 Train Accuracy:  36.718332442544096 %
Epoch 6/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 6/10 Train Accuracy:  36.718332442544096 %
Epoch 7/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426732617
Epoch 7/10 Train Accuracy:  36.718332442544096 %
Epoch 8/10 Train Total Loss:  30.9199920296669 Average Loss:  0.004131479426

In [19]:
model = CNN2(num_features).to(device)
run_train()

Epoch 1/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 1/10 Train Accuracy:  25.16034206306788 %
Epoch 2/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 2/10 Train Accuracy:  25.16034206306788 %
Epoch 3/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 3/10 Train Accuracy:  25.16034206306788 %
Epoch 4/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 4/10 Train Accuracy:  25.16034206306788 %
Epoch 5/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 5/10 Train Accuracy:  25.16034206306788 %
Epoch 6/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 6/10 Train Accuracy:  25.16034206306788 %
Epoch 7/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004238224480319061
Epoch 7/10 Train Accuracy:  25.16034206306788 %
Epoch 8/10 Train Total Loss:  31.718872010707855 Average Loss:  0.004

In [26]:
SEED = 1234
torch.manual_seed(SEED)
model = Transformer(num_features, hidden_size, d_model, conv_kernel, nhead, num_encoder_layers, dim_feedforward, dropout).to(device)
run_train()

Epoch 1/10 Train Total Loss:  30.556217551231384 Average Loss:  0.004082872468096123
Epoch 1/10 Train Accuracy:  42.00962052378407 %
Epoch 2/10 Train Total Loss:  30.5024191737175 Average Loss:  0.004075684015729222
Epoch 2/10 Train Accuracy:  42.811330839123464 %
Epoch 3/10 Train Total Loss:  30.57978445291519 Average Loss:  0.0040860214394595395
Epoch 3/10 Train Accuracy:  40.980758952431856 %
Epoch 4/10 Train Total Loss:  30.570771753787994 Average Loss:  0.004084817177149652
Epoch 4/10 Train Accuracy:  41.36825227151256 %
Epoch 5/10 Train Total Loss:  30.45882886648178 Average Loss:  0.004069859549235941
Epoch 5/10 Train Accuracy:  42.437199358631744 %
Epoch 6/10 Train Total Loss:  30.476789474487305 Average Loss:  0.00407225941668724
Epoch 6/10 Train Accuracy:  42.08979155531801 %
Epoch 7/10 Train Total Loss:  30.466135263442993 Average Loss:  0.004070835818204568
Epoch 7/10 Train Accuracy:  43.35916622127205 %
Epoch 8/10 Train Total Loss:  30.55682474374771 Average Loss:  0.00408