In [1]:
from pandas import read_csv
from numpy import array as arr
from ast import literal_eval as lev
from sklearn.model_selection import train_test_split as tts

In [2]:
# Seed reused for both train/test splits and for torch's RNG when the GCN is built.
RANDOM_SEED = 42
# Passed to utils.get_graph_data as its third argument and used as the input
# width of the first GCN layer (data_num_features).
EMBEDDING_DIM = 50

In [3]:
# Directory that holds gp_table.csv; used as the default `path` of load_dataset.
DATASET_PATH = "./dataset/"

In [4]:
def load_dataset(path : str = DATASET_PATH, *args, **kwargs) -> tuple:
    """Load ``gp_table.csv`` and split it into train / validation / test sets.

    The ``snps`` column stores each row as the string form of a Python list;
    it is parsed with ``literal_eval`` into the feature array. The
    ``rice_yield`` column is the target. 70% of the rows go to training and
    the remaining 30% is split in half into test and validation, both splits
    seeded with ``RANDOM_SEED``. Shape and size summaries are printed.

    Args:
        path: Directory containing ``gp_table.csv``. A trailing separator is
            optional.
        *args, **kwargs: Accepted for call compatibility; currently ignored.

    Returns:
        ``((x_train, y_train), (x_val, y_val), (x_test, y_test))``.
    """
    import os  # local import so this cell does not depend on the import cell

    # os.path.join tolerates a missing trailing slash; plain concatenation
    # silently produced a wrong file name (e.g. "./datasetgp_table.csv").
    csv_file = os.path.join(path, "gp_table.csv")
    big_dataset = read_csv(csv_file)

    # convert each stringified list into a real list before stacking into an array
    x = arr([lev(row) for row in big_dataset['snps']])
    y = arr(big_dataset['rice_yield'])
    print("============ Raw Data ============")
    print("x data dim: {}".format(x.shape))
    print("y data dim: {}".format(y.shape), end="\n\n")

    # split dataset: 70% train, then the held-out 30% is halved into test / validation
    x_train, x_test, y_train, y_test = tts(x, y, test_size=.3, random_state=RANDOM_SEED)
    x_test, x_val, y_test, y_val = tts(x_test, y_test, test_size=.5, random_state=RANDOM_SEED)

    print("========= Processed Data =========")
    print("x_train: {}, y_train: {}".format(len(x_train), len(y_train)))
    print("x_valid: {}, y_valid: {}".format(len(x_val), len(y_val)))
    print("x_test: {}, y_test: {}".format(len(x_test), len(y_test)))

    return (x_train, y_train), (x_val, y_val), (x_test, y_test)

In [5]:
# Load the table from the default DATASET_PATH and unpack the three splits;
# load_dataset also prints the raw shapes and the size of each split.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_dataset()

x data dim: (687, 1232)
y data dim: (687,)

x_train: 480, y_train: 480
x_valid: 104, y_valid: 104
x_test: 103, y_test: 103


# PHASE 2

Source:
* [Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric (Towards Data Science)](https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8)

In [None]:
%matplotlib inline
import torch
import networkx as nx
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch_geometric.data import Data
# from torch_geometric.loader import DataLoader

# print(torch.__version__)

In [7]:
# import graphdata as gd
import utils

  from .autonotebook import tqdm as notebook_tqdm


## 2.1. Train Data

In [8]:
# Build the graph representation of the training split with the project-local
# utils helper. The three results are used below as: a data object (printed),
# a graph queried for direction / node / edge counts, and an indexable whose
# items 0, 1, 2 are reported as valid / invalid / total counts.
# NOTE(review): utils.get_graph_data is not visible here — confirm its contract.
data_train, G_train, num_valid_data_train = utils.get_graph_data(x_train, y_train, EMBEDDING_DIM)

  data_x = tensor(embedd_x, dtype=t_float)
  data_y = tensor(section_y, dtype=t_float)


In [10]:
# Report the counts returned for the training split (items 0, 1, 2 are shown as
# valid / invalid / total), then a blank line and the data object itself.
for position, label in enumerate(("valid data", "invalid data", "data")):
    print(f'total {label} train: {num_valid_data_train[position]}')
print('', data_train, sep='\n')

total valid data train: 164140
total invalid data train: 66260
total data train: 230400

Data(x=[480, 1232, 50], edge_index=[2, 82070], y=[480])


In [11]:
# Basic structural facts about the training graph, one per line.
for caption, value in (
    ("Directed graph:", G_train.is_directed()),
    ("Number of nodes:", G_train.number_of_nodes()),
    ("Number of edges:", G_train.number_of_edges()),
):
    print(caption, value)

Directed graph: False
Number of nodes: 480
Number of edges: 82070


## 2.2. Validation Data

In [None]:
# Build the graph representation of the validation split.
# Uses the EMBEDDING_DIM constant: the lowercase `embedding_dim` used before
# is not defined anywhere in the notebook and raised NameError on a fresh kernel.
data_val, G_val, num_valid_data_val = utils.get_graph_data(x_val, y_val, EMBEDDING_DIM)

In [None]:
# Report the counts returned for the validation split (items 0, 1, 2 are shown
# as valid / invalid / total), then a blank line and the data object itself.
for position, label in enumerate(("valid data", "invalid data", "data")):
    print(f'total {label} validation: {num_valid_data_val[position]}')
print('', data_val, sep='\n')

total valid data validation: 6408
total invalid data validation: 4408
total data validation: 10816

Data(x=[104, 1232, 50], edge_index=[2, 3204], y=[104])


In [None]:
# Basic structural facts about the validation graph, one per line.
for caption, value in (
    ("Directed graph:", G_val.is_directed()),
    ("Number of nodes:", G_val.number_of_nodes()),
    ("Number of edges:", G_val.number_of_edges()),
):
    print(caption, value)

Directed graph: False
Number of nodes: 104
Number of edges: 3204


## 2.3. Test Data

In [None]:
# Build the graph representation of the test split.
# Uses the EMBEDDING_DIM constant: the lowercase `embedding_dim` used before
# is not defined anywhere in the notebook and raised NameError on a fresh kernel.
data_test, G_test, num_valid_data_test = utils.get_graph_data(x_test, y_test, EMBEDDING_DIM)

In [None]:
# Report the counts returned for the test split (items 0, 1, 2 are shown as
# valid / invalid / total), then a blank line and the data object itself.
for position, label in enumerate(("valid data", "invalid data", "data")):
    print(f'total {label} test: {num_valid_data_test[position]}')
print('', data_test, sep='\n')

total valid data test: 6880
total invalid data test: 3729
total data test: 10609

Data(x=[103, 1232, 50], edge_index=[2, 3440], y=[103])


In [None]:
# Basic structural facts about the test graph, one per line.
for caption, value in (
    ("Directed graph:", G_test.is_directed()),
    ("Number of nodes:", G_test.number_of_nodes()),
    ("Number of edges:", G_test.number_of_edges()),
):
    print(caption, value)

Directed graph: False
Number of nodes: 103
Number of edges: 3440


# GNN

Reference: https://towardsdatascience.com/graph-neural-networks-in-python-c310c7c18c83

In [18]:
import torch

In [17]:
from torch.nn import Linear, Module
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# Output width of the model's final Linear layer.
# NOTE(review): the original author's comment was itself unsure ("(?)") — confirm.
data_num_class = 1 # because each row should output only 1 class (?)
# Input width of the first GCNConv layer.
data_num_features = EMBEDDING_DIM

class GCN(Module):
    """Seven stacked ``GCNConv`` layers followed by a linear output head.

    Channel widths narrow from ``data_num_features`` through
    40, 30, 22, 16, 12, 8 down to 4; the head maps those 4 channels to
    ``data_num_class`` outputs. Layers are registered as ``conv1`` .. ``conv7``
    and ``classifier``.

    Args:
        hidden_channels: Accepted but currently unused. A narrower layer stack
            for ``hidden_channels < 50`` was sketched in an earlier draft and
            is disabled.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Seed before any layer is created so weight initialisation is repeatable.
        torch.manual_seed(RANDOM_SEED)

        widths = (data_num_features, 40, 30, 22, 16, 12, 8, 4)
        # Create conv1..conv7 in order, pairing each width with the next one.
        for layer_no, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"conv{layer_no}", GCNConv(n_in, n_out))
        self.classifier = Linear(widths[-1], data_num_class)

    def forward(self, x, edge_index):
        """Apply every conv layer with a ReLU after each, then the linear head.

        Returns:
            ``(out, h)``: the head's output and the activated output of the
            last conv layer.
        """
        conv_stack = (
            self.conv1, self.conv2, self.conv3, self.conv4,
            self.conv5, self.conv6, self.conv7,
        )
        h = x
        for conv in conv_stack:
            h = F.relu(conv(h, edge_index))
        out = self.classifier(h)
        return out, h

In [31]:
# Run on CPU; automatic CUDA selection is kept below but commented out.
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
model = GCN(hidden_channels=50).to(device)

# Count only the parameters the optimizer will actually update.
params = 0
for weight in model.parameters():
    if weight.requires_grad:
        params += weight.numel()

print(model)
print(f"Total trainable params: {params}")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = range(500)

GCN(
  (conv1): GCNConv(50, 40)
  (conv2): GCNConv(40, 30)
  (conv3): GCNConv(30, 22)
  (conv4): GCNConv(22, 16)
  (conv5): GCNConv(16, 12)
  (conv6): GCNConv(12, 8)
  (conv7): GCNConv(8, 4)
  (classifier): Linear(in_features=4, out_features=1, bias=True)
)
Total trainable params: 4669


In [28]:
# from torchmetrics import R2Score
# import tensorflow as tf

def evaluate(data, train=True):
    """Run one full-batch pass of the global ``model`` on ``data``.

    With the default ``train=True`` this is a *training step* despite the
    name: gradients are reset, the MSE loss is back-propagated and the global
    ``optimizer`` updates the weights. Pass ``train=False`` for validation or
    test data so the pass is forward-only and the weights are left untouched.

    Args:
        data: Object exposing ``x``, ``edge_index`` and ``y`` tensors.
        train: When True (default, the original behaviour) update the model;
            when False only compute the loss under ``torch.no_grad()``.

    Returns:
        ``(loss, h, out)``: the MSE loss, the model's second output (last
        hidden activation) and its first output (prediction). All three are
        detached, so storing them per epoch does not keep autograd state alive.
    """
    features = data.x.to(device)
    edges = data.edge_index.to(device)
    # The target must live on the same device as the prediction.
    target = data.y.to(device)
    loss_func = nn.MSELoss()

    if train:
        model.train()  # undo a preceding evaluation-mode call
        optimizer.zero_grad()
        out, h = model(features, edges)
        # NOTE(review): `out` ends in a size-1 dimension (Linear head) while
        # the printed data.y is 1-D, so MSELoss relies on broadcasting here —
        # confirm this is the intended loss.
        loss = loss_func(out, target)
        loss.backward()
        optimizer.step()
    else:
        model.eval()
        with torch.no_grad():
            out, h = model(features, edges)
            loss = loss_func(out, target)

    return loss.detach(), h.detach(), out.detach()

### Train & Validation

In [32]:
train_losses = []
train_embeddings = []
train_out = []

val_losses = []
val_embeddings = []
val_out = []

val_loss_func = nn.MSELoss()

for epoch in epochs:
    # Training step: evaluate() back-propagates and steps the optimizer.
    train_loss, train_h, out_train = evaluate(data_train)
    # Store detached tensors so the history does not hold autograd state.
    train_losses.append(train_loss.detach())
    train_embeddings.append(train_h.detach())
    train_out.append(out_train.detach())

    # Validation must not update the weights. evaluate() always takes an
    # optimizer step, so the forward pass is done here without gradients.
    model.eval()
    with torch.no_grad():
        out_val, val_h = model(data_val.x.to(device), data_val.edge_index.to(device))
        val_loss = val_loss_func(out_val, data_val.y.to(device))
    model.train()
    val_losses.append(val_loss)
    val_embeddings.append(val_h)
    val_out.append(out_val)

    if epoch % 10 == 0:
        print(f"Train Epoch: {epoch}\tLoss: {train_loss:.4f}")
        print(f"Validate Epoch: {epoch}\tLoss: {val_loss:.4f}")

KeyboardInterrupt: 

In [None]:
# Interactive check: display the type of the validation data object.
type(data_val)

In [None]:
# Convert the stored losses to plain floats in place. float() accepts both
# 0-d tensors and floats, so re-running this cell is safe; the previous
# `.data` access raised AttributeError on a second run.
train_losses[:] = [float(value) for value in train_losses]
val_losses[:] = [float(value) for value in val_losses]

# plot lines (one point per recorded epoch)
fig, ax = plt.subplots()
ax.plot(train_losses, label="Train Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE loss")
ax.legend()
plt.show()

### Test

In [None]:
test_losses = []
test_embeddings = []
test_out = []

# The test set is scored once, forward-only. The previous loop called
# evaluate() for every epoch, and evaluate() takes an optimizer step, so the
# model was being trained on the test data.
model.eval()
with torch.no_grad():
    out, h = model(data_test.x.to(device), data_test.edge_index.to(device))
    loss = nn.MSELoss()(out, data_test.y.to(device))

test_losses.append(loss)
test_embeddings.append(h)
test_out.append(out)
print(f"Test\tLoss: {loss:.4f}")

In [None]:
# Convert the test losses to floats in place. float() accepts tensors and
# floats alike, so re-running this cell no longer raises AttributeError.
test_losses[:] = [float(value) for value in test_losses]
# Work on float copies of the train / validation curves so this cell does not
# depend on another cell having converted them first.
train_curve = [float(value) for value in train_losses]
val_curve = [float(value) for value in val_losses]

# plot lines
fig, ax = plt.subplots()
ax.plot(train_curve, label="Train Loss")
ax.plot(val_curve, label="Validation Loss")
if len(test_losses) == 1:
    # A single value would be invisible as a line plot; draw a reference line.
    ax.axhline(test_losses[0], linestyle="--", color="tab:green", label="Test Loss")
else:
    ax.plot(test_losses, label="Test Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE loss")
ax.legend()
plt.show()