In [1]:
from preprocessing import *
from sklearn.model_selection import KFold
import argparse
from model import *
from train import test
import torch.optim as optim
import pandas as pd

from MatrixVectorizer import *
import networkx as nx
from typing import Union

In [2]:
# Locations of the paired low-/high-resolution training CSVs.
lr_data_path = "../data/lr_train.csv"
hr_data_path = "../data/hr_train.csv"


def _read_clean_rows(csv_path):
    """Read a CSV into a NumPy array, set negatives to 0, then apply np.nan_to_num in place."""
    rows = pd.read_csv(csv_path, delimiter=",").to_numpy()
    rows[rows < 0] = 0
    np.nan_to_num(rows, copy=False)
    return rows


def _rows_to_matrices(rows, n_nodes):
    """Anti-vectorize every row with MatrixVectorizer and stack the results into one array."""
    return np.array([MatrixVectorizer.anti_vectorize(row, n_nodes) for row in rows])


# Flat (one row per sample) arrays, cleaned in place.
lr_train_data = _read_clean_rows(lr_data_path)
hr_train_data = _read_clean_rows(hr_data_path)

# One square matrix per sample: 160 nodes for LR, 268 nodes for HR.
lr_train_data_vectorized = _rows_to_matrices(lr_train_data, 160)
hr_train_data_vectorized = _rows_to_matrices(hr_train_data, 268)

num_samples = hr_train_data_vectorized.shape[0]
num_samples_list = range(num_samples)
# Despite its name, this maps each sample index to that sample's HR matrix.
sample_to_index = dict(enumerate(hr_train_data_vectorized))

print(lr_train_data_vectorized.shape)
print(hr_train_data_vectorized.shape)

(167, 160, 160)
(167, 268, 268)


In [3]:
# Convert the stacked adjacency matrices to float32 tensors.
# torch.as_tensor is used instead of torch.tensor because this cell rebinds the
# same names: on a re-run the inputs are already float32 tensors, and as_tensor
# returns them as-is instead of copy-constructing (which emits a UserWarning).
lr_train_data_vectorized = torch.as_tensor(lr_train_data_vectorized, dtype=torch.float32)
hr_train_data_vectorized = torch.as_tensor(hr_train_data_vectorized, dtype=torch.float32)

from torch.utils.data import Dataset

class NoisyDataset(Dataset):
    """Paired LR/HR dataset that adds fresh Gaussian noise to every LR sample it serves.

    The noise is redrawn on each `__getitem__` call, so the same index yields a
    different noisy LR sample every time it is requested. HR samples are
    returned unchanged.
    """

    def __init__(self, lr_data, hr_data, noise_level=0.01):
        """
        lr_data: Low resolution data (torch.tensor), indexed by sample
        hr_data: High resolution data (torch.tensor), indexed by sample
        noise_level: Standard deviation of Gaussian noise to be added

        Raises:
            ValueError: if lr_data and hr_data do not hold the same number of
                samples (the pairs would otherwise be silently misaligned or
                truncated).
        """
        if len(lr_data) != len(hr_data):
            raise ValueError(
                f"lr_data and hr_data must have the same number of samples, "
                f"got {len(lr_data)} and {len(hr_data)}"
            )
        self.lr_data = lr_data
        self.hr_data = hr_data
        self.noise_level = noise_level

    def __len__(self):
        """Number of LR/HR pairs."""
        return len(self.lr_data)

    def __getitem__(self, idx):
        """Return (noisy LR sample clamped to [0, 1], untouched HR sample) for `idx`."""
        lr_sample = self.lr_data[idx]
        hr_sample = self.hr_data[idx]

        # Draw the noise on the sample's own device; torch.randn without a
        # device argument always allocates on the default device, which makes
        # the addition below fail for samples stored elsewhere (e.g. on a GPU).
        noise = torch.randn(lr_sample.size(), device=lr_sample.device) * self.noise_level

        # Clipping to ensure values are between 0 and 1
        noisy_lr_sample = torch.clamp(lr_sample + noise, 0, 1)

        return noisy_lr_sample, hr_sample

# Wrap the training tensors in the noisy dataset (noise std 0.5) and serve them
# one shuffled sample at a time.
train_data = NoisyDataset(
    lr_data=lr_train_data_vectorized,
    hr_data=hr_train_data_vectorized,
    noise_level=0.5,
)
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=1,
    shuffle=True,
)

# Pull a single batch to confirm the tensor shapes the loader produces.
sample = next(iter(train_data_loader))
print(sample[0].shape, sample[1].shape)

torch.Size([1, 160, 160]) torch.Size([1, 268, 268])


In [4]:
# Index of the first held-out sample: the leading 80% of samples form the
# working set, the remainder is held out.
split = int(num_samples * 0.8)

# Working set (LR inputs and HR targets).
subjects_adj = lr_train_data_vectorized[:split]
subjects_labels = hr_train_data_vectorized[:split]

# Held-out set (LR inputs and HR targets).
held_out_subjects_adj = lr_train_data_vectorized[split:]
held_out_subjects_labels = hr_train_data_vectorized[split:]

In [5]:
# Hyper-parameters, kept as notebook-level names and mirrored into `args`.
num_splt = 3
epochs = 30
lr = 0.00005
lmbda = 25
lr_dim = 160
# NOTE(review): 320 == 268 + 2 * 26, i.e. presumably the padded HR size — confirm.
hr_dim = 320
hidden_dim = 320
padding = 26
dropout = 0.1

# Bundle of settings passed to the model and the training loop; note that the
# dropout value is stored under the attribute name `p`.
args = argparse.Namespace(
    epochs=epochs,
    lr=lr,
    lmbda=lmbda,
    lr_dim=lr_dim,
    hr_dim=hr_dim,
    hidden_dim=hidden_dim,
    padding=padding,
    p=dropout,
)

In [6]:
# Shuffled K-fold splitter with a fixed seed. The fold count comes from the
# `num_splt` setting of the configuration cell instead of a second hard-coded 3,
# so changing the setting actually changes the split.
cv = KFold(n_splits=num_splt, shuffle=True, random_state=42)

In [7]:
# NOTE(review): `ks` is passed to GSRNet alongside the hyper-parameters; its
# exact meaning is defined by the model module — confirm there.
ks = [0.7, 0.5]
# This instance is only referenced by the commented-out cross-validation cell;
# the submission is produced by a separately constructed `final_model`.
model = GSRNet(ks, args)

In [8]:
class TopologicalMeasures:
    """Node-level topological measures of a graph given as an adjacency matrix."""

    def __init__(self, graph: Union[np.ndarray, torch.Tensor]):
        """Build the networkx graph from an adjacency matrix.

        Args:
            graph: adjacency matrix as a NumPy array or a torch tensor. Tensors
                are moved to the CPU and detached before conversion.

        Raises:
            TypeError: if `graph` is neither a NumPy array nor a torch tensor.
                Without this check `self.graph` would stay unset and the failure
                would only surface later as an AttributeError.
        """
        if isinstance(graph, torch.Tensor):
            graph = graph.cpu().detach().numpy()
        elif not isinstance(graph, np.ndarray):
            raise TypeError(
                "graph must be a numpy.ndarray or torch.Tensor, "
                f"got {type(graph).__name__}"
            )
        self.graph = nx.Graph(graph)

    def compute_measures(self, number=0):
        """Compute per-node measures, each with networkx default arguments.

        Args:
            number: unused; kept so existing callers keep working.

        Returns:
            dict mapping "degree", "closeness", "pagerank" and "eigenvector" to
            a float32 tensor holding one value per node, in the graph's node
            iteration order.
        """

        def as_tensor(per_node):
            # networkx returns node -> value mappings; keep only the values.
            return torch.tensor(list(dict(per_node).values()), dtype=torch.float32)

        measures = {}
        measures["degree"] = as_tensor(self.graph.degree())
        # clustering was removed due to slow computation
        measures["closeness"] = as_tensor(nx.closeness_centrality(self.graph))
        # betweenness was removed due to slow computation
        measures["pagerank"] = as_tensor(nx.pagerank(self.graph))
        measures["eigenvector"] = as_tensor(nx.eigenvector_centrality(self.graph))
        return measures


def precompute_topological_measures(hr_train_data_vectorized: np.ndarray):
    """Compute the topological measures of every graph in the collection.

    Args:
        hr_train_data_vectorized: iterable of adjacency matrices, one per graph.

    Returns:
        dict mapping each graph's position in the iterable to the measures
        dictionary produced by `TopologicalMeasures.compute_measures`.
    """
    measures_by_position = {}
    for position, adjacency in enumerate(hr_train_data_vectorized):
        topology = TopologicalMeasures(adjacency)
        measures_by_position[position] = topology.compute_measures()
        # Progress message on every tenth graph, starting with the first.
        if not position % 10:
            print(f"Computed measures for {position} graphs")
    return measures_by_position


def compute_topological_MAE_loss(
    graph1, graph2: Union[np.ndarray, torch.Tensor], precomputed_g1: bool = False
):
    """Mean absolute difference between the topological measures of two graphs.

    Args:
        graph1: adjacency matrix of the first graph, or — when `precomputed_g1`
            is true — an already computed measures dictionary for it.
        graph2: adjacency matrix of the second graph.
        precomputed_g1: whether `graph1` already is a measures dictionary.
            Defaults to False (the original signature used `False` as an
            annotation, which made the argument mandatory).

    Returns:
        The L1 loss of each measure found in the first graph's dictionary,
        averaged over those measures.

    Note:
        `TopologicalMeasures` detaches tensors before handing them to networkx,
        so the returned value is not connected to the autograd graph of
        `graph2`.
    """
    if precomputed_g1:
        measures1 = graph1
    else:
        measures1 = TopologicalMeasures(graph1).compute_measures()
    measures2 = TopologicalMeasures(graph2).compute_measures()

    # compute MAE for each measure, then average over the measures
    loss = 0
    for measure in measures1:
        loss += F.l1_loss(measures1[measure], measures2[measure])
    return loss / len(measures1)

In [9]:
# precompute topological measures for hr_train_data_vectorized
# precomputed_measures = precompute_topological_measures(hr_train_data_vectorized)

In [10]:
import time

# L1 (mean absolute error) criterion; train() reads this notebook-level object
# for every term of its loss and for the reported reconstruction error.
criterion = nn.L1Loss()


def train(model, optimizer, subjects_adj, subjects_labels, args):
    """Train `model` on the given LR/HR pairs and return the mean loss of each epoch.

    The samples are wrapped in a `NoisyDataset` and served one at a time in
    shuffled order. Earlier versions ignored `subjects_adj`/`subjects_labels`
    and always iterated the notebook-level `train_data_loader`, so a
    cross-validation fold passed in here had no effect on what was trained on.

    Args:
        model: network called as `model(lr_adjacency)`; it must return four
            values and expose `model.layer.weights`.
        optimizer: optimizer already bound to the model's parameters.
        subjects_adj: LR adjacency matrices, indexed by sample.
        subjects_labels: HR adjacency matrices, aligned with `subjects_adj`.
        args: namespace providing `epochs`, `lmbda` and `padding`; an optional
            `noise_level` overrides the default input-noise std of 0.5 (the
            value the notebook's own loader is built with).

    Returns:
        list with the mean training loss of every epoch.
    """
    # Build the loader from the arguments so the caller decides what is trained on.
    dataset = NoisyDataset(
        torch.as_tensor(subjects_adj, dtype=torch.float32),
        torch.as_tensor(subjects_labels, dtype=torch.float32),
        noise_level=getattr(args, "noise_level", 0.5),
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

    all_epochs_loss = []

    for epoch in range(args.epochs):
        epoch_loss = []
        epoch_error = []

        model.train()
        for lr_batch, hr_batch in loader:
            # batch_size is 1: drop the batch axis to get plain square matrices
            # (replaces the hard-coded 160 x 160 / 268 x 268 reshapes).
            lr_adj = lr_batch.squeeze(0)
            hr_adj = hr_batch.squeeze(0)

            model_outputs, net_outs, start_gcn_outs, _ = model(lr_adj)
            model_outputs = unpad(model_outputs, args.padding)

            # Eigenvectors of the padded HR matrix are the target for the
            # layer weights in the second loss term.
            padded_hr = pad_HR_adj(hr_adj, args.padding)
            _, U_hr = torch.linalg.eigh(padded_hr, UPLO="U")

            # Reconstruction error: reported on its own and also the third
            # term of the loss.
            error = criterion(model_outputs, hr_adj)
            loss = (
                args.lmbda * criterion(net_outs, start_gcn_outs)
                + criterion(model.layer.weights, U_hr)
                + error
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            epoch_error.append(error.item())

        model.eval()
        mean_epoch_loss = np.mean(epoch_loss)
        print(
            "Epoch: ",
            epoch + 1,
            "Loss: ",
            mean_epoch_loss,
            "Error: ",
            np.mean(epoch_error),
        )
        all_epochs_loss.append(mean_epoch_loss)

    return all_epochs_loss

In [11]:
# # print(model)
# optimizer = optim.Adam(model.parameters(), lr=args.lr)
# # optimizer = optim.SGD(model.parameters(), lr=args.lr)

# for train_index, test_index in cv.split(subjects_adj):
#     subjects_adj_train = subjects_adj[train_index]  # Get training data
#     subjects_adj_test = subjects_adj[test_index]   # Get testing data
#     subjects_ground_truth_train = subjects_labels[train_index]
#     subjects_ground_truth_test = subjects_labels[test_index]

#     train(model, optimizer, subjects_adj_train, subjects_ground_truth_train, args, subjects_adj_test, subjects_ground_truth_test)

#     print('Held out test score:')
#     test(model, held_out_subjects_adj, held_out_subjects_labels, args)
#     print('------------------------------')

# Final Model & Kaggle Submission

In [12]:
# final train
# Seed the global torch and NumPy generators first, so that loader shuffling,
# the injected input noise and any random initialisation inside GSRNet repeat
# from one Restart & Run All to the next. 42 matches the KFold random_state.
torch.manual_seed(42)
np.random.seed(42)

final_model = GSRNet(ks, args)
optimizer = optim.Adam(final_model.parameters(), lr=args.lr)

# Keep whatever train() returns; binding it to a name also stops the value from
# being echoed as the cell's output.
final_epoch_losses = train(
    final_model, optimizer, lr_train_data_vectorized, hr_train_data_vectorized, args
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch:  1 Loss:  0.5707595127071449 Error:  0.2406230556393812 Topo:  nan
Epoch:  2 Loss:  0.38345538570495424 Error:  0.20803140344733964 Topo:  nan
Epoch:  3 Loss:  0.36614711645120634 Error:  0.1969838216454683 Topo:  nan
Epoch:  4 Loss:  0.3543242904954328 Error:  0.19150986482283314 Topo:  nan
Epoch:  5 Loss:  0.3433955650843546 Error:  0.1876029204465672 Topo:  nan
Epoch:  6 Loss:  0.3334669738472579 Error:  0.18493208735288974 Topo:  nan
Epoch:  7 Loss:  0.32464526620453704 Error:  0.18312408440484257 Topo:  nan
Epoch:  8 Loss:  0.3165310728335809 Error:  0.18154095211428797 Topo:  nan
Epoch:  9 Loss:  0.3085047434903904 Error:  0.1803627714603961 Topo:  nan
Epoch:  10 Loss:  0.3014208462066993 Error:  0.17958564529875795 Topo:  nan
Epoch:  11 Loss:  0.294827392893637 Error:  0.17898524226899631 Topo:  nan
Epoch:  12 Loss:  0.28837572653850396 Error:  0.17817109546618548 Topo:  nan
Epoch:  13 Loss:  0.28255740896670406 Error:  0.1774993182834751 Topo:  nan
Epoch:  14 Loss:  0.27

In [13]:
# Generate submission

# Path of the low-resolution test CSV (one vectorized sample per row).
test_lr_data_path = "../data/lr_test.csv"

lr_test_data = pd.read_csv(test_lr_data_path, delimiter=",").to_numpy()
print(lr_test_data.shape)

# Set negatives to 0, then let nan_to_num clean the array in place.
lr_test_data[lr_test_data < 0] = 0
np.nan_to_num(lr_test_data, copy=False)

# Anti-vectorize every row of lr_test_data into a 160-node matrix and stack them.
lr_test_data_vectorized = np.array(
    list(map(lambda vec: MatrixVectorizer.anti_vectorize(vec, 160), lr_test_data))
)
print(lr_test_data_vectorized.shape)

(112, 12720)
(112, 160, 160)


In [14]:
final_model.eval()
preds = []
# Inference only: no_grad keeps autograd from recording the forward passes.
with torch.no_grad():
    # The loop variable is deliberately not called `lr`: that name holds the
    # learning rate set in the configuration cell and must not be overwritten.
    for lr_matrix in lr_test_data_vectorized:
        lr_tensor = torch.from_numpy(lr_matrix).type(torch.FloatTensor)

        model_outputs, _, _, _ = final_model(lr_tensor)
        model_outputs = unpad(model_outputs, args.padding)
        preds.append(MatrixVectorizer.vectorize(model_outputs.detach().numpy()))

print(len(preds), preds[0].shape)
# Concatenate the per-sample vectors, in sample order, into one flat array.
r = np.hstack(preds)
print(r.shape)
# Despite the name this is a 1-D NumPy array, not a DataFrame.
meltedDF = r.flatten()

112 (35778,)
(4007136,)


In [15]:
# One row per predicted value; IDs run from 1 to n.
n = len(meltedDF)
submission_columns = {"ID": np.arange(n) + 1, "Predicted": meltedDF}
df = pd.DataFrame(submission_columns)
df.to_csv("submission.csv", index=False)