<a href="https://colab.research.google.com/github/Kyriezxc/CIS522_Project/blob/main/hyperparameters_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
# force_remount=True re-mounts even if a mount already exists from an earlier run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%cd "/content/drive/MyDrive/Colab Notebooks/CIS522_Project"

/content/drive/MyDrive/Colab Notebooks/CIS522_Project


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

# Single seed shared by sampling, the train/validation split and model initialisation.
SEED = 2023
# Train on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# @title Set random seed

# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL it's critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random

def set_seed(seed=None, seed_torch=True):
    """
    Seed the Python, NumPy and (optionally) PyTorch random number generators.

    The `random`, `numpy` and `torch` modules must already be imported.

    Args:
      seed : Integer
        A non-negative integer that defines the random state. When `None`,
        a seed is drawn at random from [0, 2**32). Default is `None`.
      seed_torch : Boolean
        If `True`, also seeds PyTorch (CPU and all CUDA devices) and switches
        cuDNN to deterministic, non-benchmarking mode. Default is `True`.

    Returns:
      Nothing. The chosen seed is printed.
    """
    if seed is None:
        # np.random.choice returns a NumPy integer, which is not a built-in
        # `int`; random.seed() rejects such seeds on Python >= 3.11, so
        # convert before using it.
        seed = int(np.random.choice(2**32))
    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        # manual_seed_all seeds every CUDA device, including the current one.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f"Random seed {seed} has been set.")


# In case that `DataLoader` is used
def seed_worker(worker_id):
    """
    Re-seed NumPy and Python's `random` from PyTorch's current initial seed.

    Intended to be passed as `worker_init_fn` to a `DataLoader`.

    Args:
      worker_id: integer
        Identifier handed over by the `DataLoader`; it is not used here and
        exists only to match the `worker_init_fn` signature.
        Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details

    Returns:
      Nothing
    """
    # Reduce torch's seed to the 32-bit range accepted by np.random.seed.
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Seed Python, NumPy and PyTorch once before any data sampling or model construction.
set_seed(SEED)

Random seed 2023 has been set.


In [None]:
# Load the raw trajectories; the path is relative to the project directory set by %cd above.
train = pd.read_csv("data/train.csv")
# train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
# Filter missing data and useless columns
train = train[train["MISSING_DATA"] == False]
# POLYLINE is still a raw string here, so len() counts characters;
# presumably "> 2" drops empty "[]" polylines -- verify against the data.
train = train[train["POLYLINE"].map(len) > 2]
train = train[["POLYLINE"]]
# Choose 10000 rows randomly from dataset to run
# (seeded with SEED so the same subset is drawn on every run)
train_1 = train.sample(10000, random_state=SEED)

# Pre-processing

In [None]:
def _change_type(polyline):
    polyline = polyline[2:-2]
    cords_raw = polyline.split("],[")
    cords = []
    max_lon, min_lon, max_lat, min_lat = np.float32("-inf"), np.float32("inf"), np.float32("-inf"), np.float32("inf")
    for cord in cords_raw:
        cords.append([np.float32(loc) for loc in cord.split(",")])
        max_lon = max(max_lon, cords[-1][0])
        min_lon = min(min_lon, cords[-1][0])
        max_lat = max(max_lat, cords[-1][1])
        min_lat = min(min_lat, cords[-1][1])
    return pd.Series({"POLYLINE": cords, "max_lon": max_lon, "min_lon": min_lon, "max_lat": max_lat, "min_lat": min_lat})

In [None]:
def filter_map(train, max_lat, min_lat, max_lon, min_lon):
    """
    Keep only the rows whose bounding box lies inside the given map window.

    Args:
      train : pd.DataFrame
        Must contain "max_lat", "min_lat", "max_lon" and "min_lon" columns.
      max_lat, min_lat, max_lon, min_lon : Float
        Inclusive limits of the window.

    Returns:
      The matching rows. Index labels are the row positions in `train`
      before filtering, so they may contain gaps.
    """
    # Renumber on a copy so the caller's frame keeps its own index.
    renumbered = train.reset_index(drop=True)
    inside = (
        (renumbered["max_lat"] <= max_lat) & (renumbered["min_lat"] >= min_lat) &
        (renumbered["max_lon"] <= max_lon) & (renumbered["min_lon"] >= min_lon)
    )
    return renumbered[inside]

In [None]:
def _normalize(polyline, max_lon, min_lon, max_lat, min_lat, m):
    final = []
    for cord in polyline:
        normalized = [(cord[0]-min_lon)/(max_lon-min_lon), (cord[1] - min_lat) / (max_lat - min_lat)]
        normalized[0] = min(m - 1, int(normalized[0] * m))
        normalized[1] = min(m - 1, int(normalized[1] * m))
        final.append(normalized)
    return pd.Series({"POLYLINE_INIT": final[:-1], "POLYLINE_DEST": final[-1]})

In [None]:
def _to_matrix(polyline, m, activation=lambda x: x):
    mat = np.zeros((m, m), dtype=np.float32)
    n = len(polyline)
    for i in range(n):
        x = polyline[i][0]
        y = polyline[i][1]
        mat[y][x] = activation((i + 1) / n)
    return mat

In [None]:
def transform(df_train, m, max_lat=41.2, min_lat=41.1, max_lon=-8.6, min_lon=-8.7):
    """
    Turn raw polyline strings into grid-cell sequences and trajectory matrices.

    Args:
      df_train : pd.DataFrame
        Must contain a "POLYLINE" column of raw polyline strings.
      m : Integer
        Number of grid cells per axis.
      max_lat, min_lat, max_lon, min_lon : Float
        Map window; trajectories that leave it are dropped. The defaults are
        the values this notebook originally hard-coded.

    Returns:
      pd.DataFrame with columns "POLYLINE_INIT", "POLYLINE_DEST",
      "MATRIX_LIN", "MATRIX_QUA" and "MATRIX_CON".
    """
    # Change type
    changed = df_train["POLYLINE"].apply(_change_type)
    # Filter map for max/min long/lat
    changed = filter_map(changed, max_lat, min_lat, max_lon, min_lon)
    # Get min-max: scaling uses the extremes of the surviving trajectories,
    # not the window limits themselves.
    max_longitude = changed["max_lon"].max()
    min_longitude = changed["min_lon"].min()
    max_latitude = changed["max_lat"].max()
    min_latitude = changed["min_lat"].min()
    # Normalize min-max and split
    cleaned = changed["POLYLINE"].apply(_normalize, args=(max_longitude, min_longitude, max_latitude, min_latitude, m))
    # Transform to matrices: linear, quadratic and constant cell weighting
    cleaned["MATRIX_LIN"] = cleaned["POLYLINE_INIT"].apply(_to_matrix, args=(m,))
    cleaned["MATRIX_QUA"] = cleaned["POLYLINE_INIT"].apply(_to_matrix, args=(m, lambda x: x ** 2))
    cleaned["MATRIX_CON"] = cleaned["POLYLINE_INIT"].apply(_to_matrix, args=(m, lambda x: 1))
    return cleaned

In [None]:
# Rasterise onto a 40 x 40 grid; the model definitions further down default to this size.
transformed = transform(train_1, 40)
# Number of trajectories that survived the map-window filter.
len(transformed)

5784


# Hyperparameter Tuning

In [None]:
def sequence2tensor(sequence, seqlen=200):
    """
    Convert a list of [x, y] points into a fixed-length, left-padded tensor.

    Only the last `seqlen` points are kept. Shorter sequences are placed at
    the end of the tensor and the leading rows stay zero.

    Args:
      sequence : list of [x, y] pairs
      seqlen : Integer, number of rows in the result

    Returns:
      float32 tensor of shape (seqlen, 3); column 2 is 1 for real points and
      0 for padding rows.
    """
    tail = sequence[-seqlen:] if len(sequence) > seqlen else sequence
    padded = torch.zeros(seqlen, 3, dtype=torch.float32)
    offset = seqlen - len(tail)
    for k, point in enumerate(tail):
        row = padded[offset + k]  # view into `padded`, so writes land in place
        row[:2] = torch.Tensor(point)
        row[2] = 1  # 1 for in the sequence, 0 for out of the sequence
    return padded

def matrix2tensor(matrix):
    """Wrap `matrix` in a `torch.Tensor` and return it."""
    as_tensor = torch.Tensor(matrix)
    return as_tensor

def output2tensor(output, classification=False, m=40):
    """
    Encode a destination grid cell as a training target.

    Args:
      output : pair [x, y] of grid indices
      classification : Boolean
        If `True`, return the single class id `x * m + y` (a plain number,
        not a tensor). If `False`, return the pair as a float tensor.
      m : Integer
        Number of grid cells per axis used for the class id. Default is 40,
        the value that was previously hard-coded.

    Returns:
      Class id when `classification` is `True`, else a tensor holding [x, y].
    """
    if classification:
        return output[0] * m + output[1]
    return torch.Tensor(output)

In [None]:
# Split row positions (not index labels) 80/20 so the same split can index
# every `.values` array built from `transformed` below.
idx_train, idx_valid = train_test_split(range(len(transformed)), test_size=0.2, random_state=SEED)

In [None]:
def train_model(model, train_data, valid_data, lr=0.001, epochs=20, device=DEVICE):
    """
    Train `model` with Adam on an MSE objective and report the final losses.

    Args:
      model : nn.Module
        Network mapping a batch of inputs to a batch of predictions.
      train_data, valid_data : Dataset
        Datasets yielding (input, target) pairs.
      lr : Float
        Adam learning rate. Default is 0.001.
      epochs : Integer
        Number of passes over `train_data`. Default is 20.
      device : torch.device
        Device the model and batches are moved to. Default is `DEVICE`.

    Returns:
      (train_losses, valid_losses): two lists holding, for each epoch, the
      mean of the per-batch MSE losses.
    """
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True, worker_init_fn=seed_worker)
    # Validation batches are kept in a fixed order: the reported value is a
    # mean of per-batch means, so it would otherwise change with whichever
    # samples happen to land in the final, smaller batch.
    valid_loader = DataLoader(valid_data, batch_size=64, shuffle=False, worker_init_fn=seed_worker)

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_losses, valid_losses = [], []
    for _ in tqdm(range(epochs)):
        model.train()
        train_loss = []
        for in_, out in train_loader:
            in_, out = in_.to(device), out.to(device)
            optimizer.zero_grad()
            pred = model(in_)
            loss = criterion(pred, out)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        train_losses.append(np.mean(train_loss))

        model.eval()
        valid_loss = []
        # No gradients are needed for evaluation; skipping autograd saves
        # memory and time.
        with torch.no_grad():
            for in_, out in valid_loader:
                in_, out = in_.to(device), out.to(device)
                pred = model(in_)
                loss = criterion(pred, out)
                valid_loss.append(loss.item())
        valid_losses.append(np.mean(valid_loss))

    # With epochs == 0 there is nothing to report.
    if train_losses:
        print("Last epoch train L2 loss: {}".format(train_losses[-1]))
        print("Last epoch validation L2 loss: {}".format(valid_losses[-1]))
    return train_losses, valid_losses

In [None]:
# Regression targets: one tensor per trajectory holding its destination grid cell.
# `.values` yields a NumPy array, so the positional split indices apply directly.
output_tensor = transformed["POLYLINE_DEST"].apply(output2tensor).values
out_train, out_valid = output_tensor[idx_train], output_tensor[idx_valid]

## LSTM

In [None]:
# Sequence inputs: each trajectory (without its destination) as a padded (200, 3) tensor.
sequence_tensor = transformed["POLYLINE_INIT"].apply(sequence2tensor).values
seq_train, seq_valid = sequence_tensor[idx_train], sequence_tensor[idx_valid]

In [None]:
class SequenceDataset(Dataset):
    """Pairs each input sequence with the target stored at the same position."""

    def __init__(self, sequence, output):
        # Both containers are kept as given and indexed in parallel.
        self.sequence, self.output = sequence, output

    def __len__(self):
        """Number of samples, taken from the sequence container."""
        return len(self.sequence)

    def __getitem__(self, idx):
        """Return the (sequence, output) pair at position `idx`."""
        features = self.sequence[idx]
        target = self.output[idx]
        return features, target

In [None]:
# NOTE: `train_data` / `valid_data` are rebound to the matrix datasets further
# down; re-run this cell before re-running the LSTM sweep.
train_data = SequenceDataset(seq_train, out_train)
valid_data = SequenceDataset(seq_valid, out_valid)

In [None]:
class LSTM(nn.Module):
    """Bias-free LSTM followed by a linear read-out of the final time step."""

    def __init__(self, i_size=3, h_size=3, n_layers=1, o_size=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=i_size, hidden_size=h_size, num_layers=n_layers,
                            bias=False, batch_first=True)
        self.out = nn.Linear(h_size, o_size)

    def forward(self, x):
        """Run `x` (batch first) through the LSTM and project the last step."""
        # Omitting the state argument starts from a zero hidden/cell state.
        steps, _ = self.lstm(x)
        last_step = steps[:, -1, :]
        return self.out(last_step)

In [None]:
# Sweep the LSTM hidden size; re-seeding before each run gives every
# configuration the same starting random state.
for h in (2, 3, 4, 5):
    print(f"Hidden size: {h}")
    set_seed(SEED)
    lstm = LSTM(h_size=h)
    train_model(lstm, train_data, valid_data, lr=0.005, epochs=40)
    print("--------------------")

Hidden size: 2
Random seed 2023 has been set.


  0%|          | 0/40 [00:00<?, ?it/s]

Last epoch train L2 loss: 51.05022401679052
Last epoch validation L2 loss: 51.52779006958008
--------------------
Hidden size: 3
Random seed 2023 has been set.


  0%|          | 0/40 [00:00<?, ?it/s]

Last epoch train L2 loss: 50.399993896484375
Last epoch validation L2 loss: 48.784828989129316
--------------------
Hidden size: 4
Random seed 2023 has been set.


  0%|          | 0/40 [00:00<?, ?it/s]

Last epoch train L2 loss: 29.954251655160565
Last epoch validation L2 loss: 28.658432408383018
--------------------
Hidden size: 5
Random seed 2023 has been set.


  0%|          | 0/40 [00:00<?, ?it/s]

Last epoch train L2 loss: 30.770522261319094
Last epoch validation L2 loss: 29.37538548519737
--------------------


## Matrix input with linear activation

In [None]:
# Matrix inputs: the linearly weighted trajectory rasters, one tensor per trajectory.
matrix_tensor = transformed["MATRIX_LIN"].apply(matrix2tensor).values
mat_train, mat_valid = matrix_tensor[idx_train], matrix_tensor[idx_valid]

In [None]:
class MatrixDataset(Dataset):
    """Pairs each input matrix with the target stored at the same position."""

    def __init__(self, matrix, output):
        # Both containers are kept as given and indexed in parallel.
        self.matrix, self.output = matrix, output

    def __len__(self):
        """Number of samples, taken from the matrix container."""
        return len(self.matrix)

    def __getitem__(self, idx):
        """Return the (matrix, output) pair at position `idx`."""
        features = self.matrix[idx]
        target = self.output[idx]
        return features, target

In [None]:
# NOTE: this rebinds `train_data` / `valid_data`, which previously held the
# sequence datasets; the MLP and CNN sweeps below use the matrix versions.
train_data = MatrixDataset(mat_train, out_train)
valid_data = MatrixDataset(mat_valid, out_valid)

### MLP

In [None]:
class MLP(nn.Module):
    """Fully connected network: Linear -> Dropout -> ReLU blocks, then a linear head."""

    def __init__(self, i_size=40*40, h_size=100, h_layers=2, o_size=2):
        super().__init__()
        # The first block maps the flattened input to the hidden width; the
        # remaining h_layers - 1 blocks keep that width. There is always at
        # least one block.
        blocks = [self._block(i_size, h_size)]
        blocks.extend(self._block(h_size, h_size) for _ in range(1, h_layers))
        self.hiddens = nn.Sequential(*blocks)
        self.out = nn.Linear(h_size, o_size)

    @staticmethod
    def _block(n_in, n_out):
        """Build one hidden block."""
        return nn.Sequential(nn.Linear(n_in, n_out), nn.Dropout(), nn.ReLU())

    def forward(self, x):
        """Flatten everything after the batch dimension and apply the network."""
        flat = x.flatten(1)
        return self.out(self.hiddens(flat))

In [None]:
# Grid search over MLP depth and width; re-seeding before each run gives
# every configuration the same starting random state.
for l in (1, 2, 3, 4):
    for h in (50, 100, 150, 200):
        print(f"Hidden layers: {l}, hidden size: {h}")
        set_seed(SEED)
        mlp = MLP(h_size=h, h_layers=l)
        train_model(mlp, train_data, valid_data, lr=0.001, epochs=50)
        print("--------------------")

Hidden layers: 1, hidden size: 50
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 29.54943069039959
Last epoch validation L2 loss: 12.082071555288215
--------------------
Hidden layers: 1, hidden size: 100
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 19.339705663184596
Last epoch validation L2 loss: 11.213913515994424
--------------------
Hidden layers: 1, hidden size: 150
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 13.645598489944248
Last epoch validation L2 loss: 9.37280747764989
--------------------
Hidden layers: 1, hidden size: 200
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 11.660718943974743
Last epoch validation L2 loss: 9.365546213953118
--------------------
Hidden layers: 2, hidden size: 50
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 31.09068225181266
Last epoch validation L2 loss: 11.510492073862176
--------------------
Hidden layers: 2, hidden size: 100
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 18.27569382811246
Last epoch validation L2 loss: 9.26976826316432
--------------------
Hidden layers: 2, hidden size: 150
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 14.542109411056728
Last epoch validation L2 loss: 8.794815013283177
--------------------
Hidden layers: 2, hidden size: 200
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 12.162331770544183
Last epoch validation L2 loss: 8.704546652342144
--------------------
Hidden layers: 3, hidden size: 50
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 38.952083430878105
Last epoch validation L2 loss: 13.531556330229106
--------------------
Hidden layers: 3, hidden size: 100
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 20.05815044821125
Last epoch validation L2 loss: 9.977286966223465
--------------------
Hidden layers: 3, hidden size: 150
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 16.015170646040406
Last epoch validation L2 loss: 7.7208192724930615
--------------------
Hidden layers: 3, hidden size: 200
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 13.436548076263845
Last epoch validation L2 loss: 7.3026734904239055
--------------------
Hidden layers: 4, hidden size: 50
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 41.74470648047042
Last epoch validation L2 loss: 13.70777325881155
--------------------
Hidden layers: 4, hidden size: 100
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 20.974197466079502
Last epoch validation L2 loss: 7.838876561114662
--------------------
Hidden layers: 4, hidden size: 150
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 15.798835571498087
Last epoch validation L2 loss: 6.852387378090306
--------------------
Hidden layers: 4, hidden size: 200
Random seed 2023 has been set.


  0%|          | 0/50 [00:00<?, ?it/s]

Last epoch train L2 loss: 12.959819793701172
Last epoch validation L2 loss: 7.3046588144804305
--------------------


### CNN

In [None]:
class CNN(nn.Module):
    """
    One convolution layer followed by two fully connected layers.

    Args:
      conv_channels : Integer
        Number of output channels of the convolution. Default is 32.
      kernel_size : Integer
        Side length of the square convolution kernel. Default is 3.
      input_size : Integer
        Side length of the square, single-channel input matrix. Default is
        40, the value that was previously hard-coded.
    """

    def __init__(self, conv_channels=32, kernel_size=3, input_size=40):
        super().__init__()
        self.conv1 = nn.Conv2d(1, out_channels=conv_channels, kernel_size=kernel_size)
        # Stride 1 and no padding: each spatial side shrinks by kernel_size - 1.
        conv_side = input_size - kernel_size + 1
        self.fc1 = nn.Linear(conv_side ** 2 * conv_channels, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Map a batch of (input_size, input_size) matrices to 2 outputs each."""
        x = x.unsqueeze(1)  # add the single channel dimension expected by Conv2d
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.fc2(x)
        return x

In [None]:
# Grid search over kernel size and channel count; re-seeding before each run
# gives every configuration the same starting random state.
for k in [3, 5, 7, 9]:
    for c in [16, 32, 64, 128]:
        print(f"Kernal size: {k}, conv channels: {c}")
        set_seed(SEED)
        cnn = CNN(conv_channels=c, kernel_size=k)
        train_model(cnn, train_data, valid_data, lr=0.001)
        print("--------------------")

Kernal size: 3, conv channels: 16
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 4.449866781496022
Last epoch validation L2 loss: 6.633932741064775
--------------------
Kernal size: 3, conv channels: 32
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 4.420960473687681
Last epoch validation L2 loss: 6.784470620908235
--------------------
Kernal size: 3, conv channels: 64
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 3.3983645079887075
Last epoch validation L2 loss: 5.700418396999962
--------------------
Kernal size: 3, conv channels: 128
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.983460733335312
Last epoch validation L2 loss: 4.911886208935788
--------------------
Kernal size: 5, conv channels: 16
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 3.5709776388455743
Last epoch validation L2 loss: 6.004807622809159
--------------------
Kernal size: 5, conv channels: 32
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.7603287247762287
Last epoch validation L2 loss: 4.940412326862938
--------------------
Kernal size: 5, conv channels: 64
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.6436816494758815
Last epoch validation L2 loss: 4.629060582110756
--------------------
Kernal size: 5, conv channels: 128
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.2215644962167085
Last epoch validation L2 loss: 4.362302234298305
--------------------
Kernal size: 7, conv channels: 16
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.983143010368086
Last epoch validation L2 loss: 5.482414584410818
--------------------
Kernal size: 7, conv channels: 32
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.807702024505563
Last epoch validation L2 loss: 4.365839738594858
--------------------
Kernal size: 7, conv channels: 64
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.1790730259189868
Last epoch validation L2 loss: 4.258225566462467
--------------------
Kernal size: 7, conv channels: 128
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 1.9976773180373728
Last epoch validation L2 loss: 4.077375957840367
--------------------
Kernal size: 9, conv channels: 16
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 3.01835338873406
Last epoch validation L2 loss: 5.63180084604966
--------------------
Kernal size: 9, conv channels: 32
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.7844245458302432
Last epoch validation L2 loss: 4.4874857287657886
--------------------
Kernal size: 9, conv channels: 64
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.0113538720019877
Last epoch validation L2 loss: 4.5062839859410335
--------------------
Kernal size: 9, conv channels: 128
Random seed 2023 has been set.


  0%|          | 0/20 [00:00<?, ?it/s]

Last epoch train L2 loss: 2.105982685334062
Last epoch validation L2 loss: 5.840756196724741
--------------------
