# Regressor
1. load datasets
1. sample from datasets
1. construct features
1. construct labels (load from logs)

In [None]:
# load datasets:

import importlib
import random
import argparse
import configparser
import numpy as np
import networkx as nx
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_sparse
from torch import Tensor
from torch.nn import Linear
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.optim as optim

from torch_geometric.utils import negative_sampling, to_networkx
from typing import Union, Tuple
from torch_geometric.typing import OptPairTensor, Adj, OptTensor, Size
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn.conv import MessagePassing

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator


import networkx as nx
import seaborn as sns
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

import scipy
import math


from dataset_utils import node_feature_utils
from dataset_utils.node_feature_utils import *
import my_utils as utils
import sys,os
sys.path.append(os.getcwd())


importlib.reload(utils)

# save datasets
import pickle as pk

def save_datasets(datasets, file_name):
    """Serialize ``datasets`` with pickle into the file at ``file_name``.

    The target is opened in binary write mode, so any existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(file_name, 'wb') as sink:
        pk.dump(datasets, sink)

def load_datasets(file_name):
    """Read back and return the object pickled in the file at ``file_name``.

    The file is opened in binary read mode and closed before returning.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # you produced yourself.
    with open(file_name, 'rb') as source:
        return pk.load(source)


# Load regressor datasets

In [None]:
"""
- small: MUTAG, NCI1, DD, CIFAR10, MNIST, SYN_CC
- middle: ogbg-molhiv, ogbg-molbace, ogbg-moltox21
"""

# small scale: < 10k
# Names of the small-scale datasets; this list is not read again by the code
# visible in this notebook.
small_datasets = ['mutag', 'nci1', 'dd', 'cifar10', 'mnist', 'syn_cc']
# Each call unpickles one previously saved collection via load_datasets().
d1 = load_datasets('mutag_datasets.pkl')
d2 = load_datasets('dd_datasets.pkl')
d3 = load_datasets('cifar10_datasets.pkl')
# NOTE(review): d4 reads the same file as d3 and is not used by the visible
# combine step (only d1, d2, d3 are merged). Possibly meant to load another
# dataset named in small_datasets (e.g. nci1 / mnist) -- confirm.
d4 = load_datasets('cifar10_datasets.pkl')

# SynCC
syn_cc_datasets = load_datasets('syn_datasets.pkl')

# middle scale: > 10k


# large scale: > 100k



In [None]:
# combine all datasets:


# Start from the SynCC entries, then append the real-world ones in the
# order d1, d2, d3.
all_datasets = list(syn_cc_datasets)
all_datasets += d1 + d2 + d3

# len() of every combined entry, in the same order as all_datasets.
# NOTE(review): later cells index each entry as a (features, label) pair, so
# these lengths may not be per-source-dataset sample counts -- confirm what
# belong_to_which_datasets is expected to receive.
store_each_len = []
for d in all_datasets:
    store_each_len += [len(d)]
    
def belong_to_which_datasets(idx, store_each_len):
    """Return the position of the segment that contains flat index ``idx``.

    ``store_each_len`` lists consecutive segment lengths; segment ``i`` covers
    the flat indices from the sum of the earlier lengths up to (but excluding)
    that sum plus ``store_each_len[i]``.

    Returns the first position whose cumulative length exceeds ``idx``, or
    ``-1`` when ``idx`` lies beyond the total length (or the list is empty).
    """
    remaining = idx
    for position, segment_len in enumerate(store_each_len):
        # idx falls in this segment once what is left fits inside it.
        if remaining < segment_len:
            return position
        remaining -= segment_len
    return -1

# construct regressor

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import matplotlib.pyplot as plt


import numpy as np
from sklearn.preprocessing import StandardScaler


def mean_norm(x):
    """Standardize ``x`` with a freshly fitted ``StandardScaler``.

    Returns a ``(transformed, scaler)`` pair; the fitted scaler is handed back
    so callers can later undo the scaling with ``scaler.inverse_transform``.
    """
    fitted = StandardScaler().fit(x)
    return fitted.transform(x), fitted


class CustomDataset(Dataset):
    """In-memory dataset of ``(features, target)`` pairs held as float32 tensors.

    ``data`` is a sequence whose items expose the feature vector at index 0
    and the target at index 1. All features are stacked into ``self.X`` and
    all targets into ``self.Y``; item ``i`` is ``(self.X[i], self.Y[i])``.
    """

    def __init__(self, data):
        # Stack into a single ndarray before handing over to torch: building
        # a tensor directly from a Python list of ndarrays is very slow
        # (PyTorch emits a warning for it). The resulting tensors are the same.
        features = np.asarray([pair[0] for pair in data])
        targets = np.asarray([pair[1] for pair in data])
        self.X = torch.tensor(features, dtype=torch.float32)
        self.Y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Number of stored samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` tensors stored at ``idx``."""
        return self.X[idx], self.Y[idx]


# Seeds NumPy's global RNG only. The visible code sets no torch seed, so
# torch-driven randomness (random_split, DataLoader shuffling, weight init)
# is not made reproducible by this call.
np.random.seed(42)


def generate_data(num_samples):
    """Create ``num_samples`` random ``(features, label)`` pairs.

    Features are drawn first as a ``(num_samples, 26)`` uniform array, then
    labels as a ``(num_samples, 1)`` uniform array, both from NumPy's global
    RNG. Returns a list of ``(feature_row, label_row)`` tuples.
    """
    feature_matrix = np.random.rand(num_samples, 26)
    label_matrix = np.random.rand(num_samples, 1)
    return list(zip(feature_matrix, label_matrix))



# Split every combined entry into its feature part (index 0) and its target
# part (index 1).
X = np.array([entry[0] for entry in all_datasets])
Y = np.array([entry[1] for entry in all_datasets])

# Standardize features and targets; the fitted scalers are kept so the
# predictions can be mapped back to the original target scale later.
# NOTE(review): both scalers are fitted on the full data before the
# train/val/test split, so test statistics leak into the normalisation.
normalized_x, scaler_x = mean_norm(X)
normalized_y, scaler_y = mean_norm(Y.reshape(-1, 1))

print(normalized_x.shape, normalized_y.shape)

print(len(normalized_x))
# Pair each normalised feature row with its normalised target row.
normed_combined_data = list(zip(normalized_x, normalized_y))

dataset = CustomDataset(normed_combined_data)

# 60 % / 20 % / remainder split.
train_size = int(len(dataset) * 0.6)
val_size = int(len(dataset) * 0.2)
test_size = len(dataset) - (train_size + val_size)

# NOTE(review): random_split is called without a generator and no torch seed
# is set in the visible code, so the split may differ between runs.
train_data, val_data, test_data = random_split(
    dataset,
    [train_size, val_size, test_size],
)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
val_loader = DataLoader(val_data, shuffle=False, batch_size=32)
test_loader = DataLoader(test_data, shuffle=False, batch_size=32)

class MLPRegressor(nn.Module):
    """Three-layer MLP mapping a feature vector to one regression output.

    Architecture: ``Linear(in_features, 128) -> BatchNorm1d -> ReLU ->
    Linear(128, 64) -> BatchNorm1d -> Dropout -> Sigmoid -> Linear(64, 1)``.

    Args:
        in_features: Width of the input feature vector. Defaults to 26, the
            value that was previously hard-coded.
        dropout: Drop probability of the dropout layer. Defaults to 0.4.

    The layers live in ``self.layers`` in the same order as before, so
    ``state_dict`` keys of the default configuration are unchanged.
    """

    def __init__(self, in_features=26, dropout=0.4):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            # NOTE(review): dropout is applied before the sigmoid, so dropped
            # units become sigmoid(0) = 0.5 rather than 0 -- confirm this
            # ordering is intended.
            nn.Dropout(dropout),
            nn.Sigmoid(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Run ``x`` (batch, in_features) through the stack; returns (batch, 1)."""
        return self.layers(x)


    
    
def evaluate(loader, model):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and is left in eval mode afterwards.
    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        model: Module called as ``model(inputs)``.

    Returns:
        ``(mse, results, preds, Y)`` where ``results`` is the tuple
        ``(mae, mse, rmse, r2)``, ``preds`` are the concatenated model outputs
        and ``Y`` the flattened labels, both as NumPy arrays.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    batch_preds = []
    batch_targets = []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            batch_preds.append(outputs.cpu().numpy())
            batch_targets.append(labels.cpu().numpy())

    if not batch_preds:
        # np.concatenate would raise a less helpful ValueError here.
        raise ValueError("evaluate() received a loader with no batches")

    preds = np.concatenate(batch_preds, axis=0)
    Y = np.concatenate(batch_targets, axis=0).ravel()

    # All reported metrics come from scikit-learn. The per-batch criterion
    # loss that used to be accumulated here was never returned, so it was
    # dropped along with the hidden dependency on the global ``criterion``.
    mae = mean_absolute_error(Y, preds)
    mse = mean_squared_error(Y, preds)
    rmse = math.sqrt(mse)
    r2 = r2_score(Y, preds)

    results = (mae, mse, rmse, r2)
    return mse, results, preds, Y

# Use the GPU when one is available; evaluate() reads this global as well.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# Model, loss and optimizer for the MLP baseline.
model = MLPRegressor().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 500
# One entry per epoch; both hold the MSE returned first by evaluate().
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # (dropout / batch-norm updates) at the start of every epoch.
    model.train()
    # NOTE(review): BatchNorm1d in training mode typically rejects a batch
    # holding a single sample; a train split whose size is 1 modulo the batch
    # size would hit that on its last batch -- confirm or set drop_last.
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels are viewed as a column so they line up with the (batch, 1) output.
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

    # Full passes over the train and validation loaders after each epoch,
    # used only for the loss curves below.
    train_loss, _, _, _ = evaluate(train_loader, model)
    val_loss, _,_,_ = evaluate(val_loader, model)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Final held-out score. test_results is (mae, mse, rmse, r2), computed on the
# standardized targets (the dataset was built from the normalised labels).
test_loss, test_results,_, _ = evaluate(test_loader, model)
print(f"Test Loss: {test_loss:.4f}")
# Prints MAE & MSE & RMSE & R2 rounded to two decimals ('&'-separated,
# presumably for pasting into a LaTeX table row).
print(f'MLPs: {round(test_results[0], 2)} & {round(test_results[1], 2)} & {round(test_results[2], 2)} & {round(test_results[3], 2)}')

# Plot train and validation loss curves
plt.figure()
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.title("Train and Validation Loss Curves")
plt.show()


In [None]:

# Re-seed NumPy's global RNG for this cell (torch's RNG is not touched).
np.random.seed(42)


def train_regressor(loader, regressor):
    """Fit ``regressor`` on all samples produced by one pass over ``loader``.

    Every ``(inputs, labels)`` batch is moved to the CPU and unpacked row by
    row; the rows are stacked into a feature matrix and a flattened target
    vector, which are passed to ``regressor.fit``. Returns ``None``.
    """
    feature_rows, target_rows = [], []

    # Single pass: features and targets must come from the same iteration so
    # they stay aligned even when the loader shuffles.
    for batch_inputs, batch_labels in loader:
        feature_rows += list(batch_inputs.cpu().numpy())
        target_rows += list(batch_labels.cpu().numpy())

    regressor.fit(np.array(feature_rows), np.array(target_rows).ravel())
    

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math


def regressor_evaluate(loader, regressor):
    """Score a fitted ``regressor`` on all samples of one pass over ``loader``.

    Batches are moved to the CPU and stacked into a feature matrix and a
    flattened target vector before calling ``regressor.predict``.

    Returns:
        ``(mse, results, preds, Y)`` where ``results`` is the tuple
        ``(mae, mse, rmse, r2)``, ``preds`` the predictions and ``Y`` the
        flattened targets.
    """
    feature_rows, target_rows = [], []

    # One pass only, so features and targets stay aligned.
    for batch_inputs, batch_labels in loader:
        feature_rows += list(batch_inputs.cpu().numpy())
        target_rows += list(batch_labels.cpu().numpy())

    X = np.array(feature_rows)
    Y = np.array(target_rows).ravel()

    preds = regressor.predict(X)
    mse = mean_squared_error(Y, preds)
    results = (
        mean_absolute_error(Y, preds),
        mse,
        math.sqrt(mse),
        r2_score(Y, preds),
    )
    return mse, results, preds, Y


# Baseline regressors:
- SVR, Ridge, random forest, linear regression (scikit-learn)
- XGBoost, LightGBM

In [None]:

def run_regressor(regressor, regressor_name):
    """Fit a baseline regressor, print its test metrics and plot its predictions.

    Uses the module-level ``train_loader``, ``val_loader``, ``test_loader``
    and ``scaler_y``.

    Args:
        regressor: Estimator exposing ``fit`` and ``predict``.
        regressor_name: Label used in the printed row and as the plot title.

    Prints one row ``name: MAE & MSE & RMSE & R2`` (test split, rounded to two
    decimals) and opens a figure comparing predictions with true labels on the
    original (un-standardized) target scale.
    """

    def to_original_scale(values):
        # StandardScaler.inverse_transform expects a 2-D (n_samples, 1) array;
        # current scikit-learn rejects the 1-D vectors produced by
        # regressor_evaluate, so reshape first and flatten again for plotting.
        column = np.asarray(values).reshape(-1, 1)
        return scaler_y.inverse_transform(column).ravel()

    train_regressor(train_loader, regressor)

    # Train/validation scores are not reported, but the passes are kept so
    # the loaders are consumed exactly as before (the shuffling train loader
    # presumably draws from torch's RNG on every pass).
    regressor_evaluate(train_loader, regressor)
    regressor_evaluate(val_loader, regressor)
    _, test_results, test_preds, test_Y = regressor_evaluate(test_loader, regressor)

    print(f'{regressor_name}: {round(test_results[0], 2)} & {round(test_results[1], 2)} & {round(test_results[2], 2)} & {round(test_results[3], 2)}')
    plt.figure()
    plt.plot(to_original_scale(test_preds), label="Predictions")
    plt.plot(to_original_scale(test_Y), label="True Labels")
    plt.title(regressor_name)
    plt.legend()


from sklearn.svm import SVR
from sklearn.linear_model import Ridge

# Choose a regressor
regressors = {'SVR': SVR(),'Ridge': Ridge(), "RandomForestRegressor": RandomForestRegressor(),  
              "LinearRegression": LinearRegression()}

for k, regressor in regressors.items():
    run_regressor(regressor, k)

In [None]:
import xgboost as xgb
import lightgbm as lgb


# Gradient-boosting baselines; this rebinds the `regressors` dict used by the
# previous cell.
regressors = {
    'XG boost': xgb.XGBRegressor(objective='reg:squarederror'),
    'LGBMRegressor': lgb.LGBMRegressor(),
}

# Same fit / score / plot routine as for the scikit-learn baselines.
for k in regressors:
    regressor = regressors[k]
    run_regressor(regressor, k)

In [None]:
# use test dataset to evaluate the model

# average each datasets:

from collections import Counter

# Map every sample index of the test split to a segment of store_each_len and
# count how many test samples fall into each segment.
# NOTE(review): store_each_len holds len() of each entry of all_datasets,
# while those entries are also used as individual (features, label) samples;
# confirm the segments really correspond to source datasets.
test_dataset_idx = [belong_to_which_datasets(i, store_each_len) for i in test_data.indices]
print(Counter(test_dataset_idx))

# Get predictions and true labels from the test dataset
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # reshape(-1) always yields a 1-D tensor. squeeze() collapses a
        # single-sample batch to 0-d, which list.extend cannot iterate.
        predictions.extend(outputs.reshape(-1).cpu().numpy())
        true_labels.extend(labels.reshape(-1).cpu().numpy())


# StandardScaler.inverse_transform expects a 2-D (n_samples, 1) array, so
# reshape the flat lists into a column and flatten again for plotting.
plt.figure()
plt.plot(scaler_y.inverse_transform(np.asarray(predictions).reshape(-1, 1)).ravel(), label="Predictions")
plt.plot(scaler_y.inverse_transform(np.asarray(true_labels).reshape(-1, 1)).ravel(), label="True Labels")
plt.legend()


# plot predictions of other real-world datasets

In [None]:
# CIFAR10 only has one fold
"""
result_GIN_0317_decouple_degree_attr_CIFAR10
result_GIN_0317_mix_degree_attr_CIFAR10
result_GIN_0317_only_attr_CIFAR10
result_GIN_0317_only_degree_CIFAR10
result_GIN_0318_decouple_degree_attr_CIFAR10
result_GIN_0327_finger_mlp_attr_multicrossen_CIFAR10
result_GIN_0401_GIN_degree_CIFAR10
result_GIN_0403_GIN_degree_CIFAR10
"""

# Result directories of the degree-only runs (MLP vs. GIN).
MLP_log_path_degree = './results/result_GIN_0401_graph_mlp_avgDegree_CIFAR10/MolecularGraphMLP_CIFAR10_assessment/1_NESTED_CV'
GNN_log_path_degree = './results/result_GIN_0317_only_degree_CIFAR10/GIN_CIFAR10_assessment/1_NESTED_CV'

# Result directories of the attribute-only runs (MLP vs. GIN).
# NOTE(review): the MLP attribute path ends in 10_NESTED_CV while the other
# three end in 1_NESTED_CV and fold=1 is passed below -- confirm.
MLP_log_path_attr = './results/result_GIN_0327_finger_mlp_attr_multicrossen_CIFAR10/MolecularFingerprint_CIFAR10_assessment/10_NESTED_CV'
GNN_log_path_attr = './results/result_GIN_0317_only_attr_CIFAR10/GIN_CIFAR10_assessment/1_NESTED_CV'

# `datasets_obj` and `E_datasets` are not defined in the visible part of the
# notebook (presumably provided by the wildcard import). Note that `dataset`
# rebinds the name used earlier for the CustomDataset instance.
dataset = datasets_obj['CIFAR10']
cifar10_datasets = E_datasets(
    dataset,
    MLP_log_path_degree,
    GNN_log_path_degree,
    MLP_log_path_attr,
    GNN_log_path_attr,
    fold=1,
)

