In [2]:
#1. prepare dataset for torch batching
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

import ibdloader
datapath = "../datasets-genotek/"

dataset1fname = datapath+"CR_graph_rel.csv"
df = pd.read_csv(dataset1fname)
print(df)

pairs, weights, labels, labeldict =\
ibdloader.load_pure( dataset1fname )

        node_id1   node_id2 label_id1 label_id2   ibd_sum  ibd_n
0         node_0     node_5  мордвины  мордвины  29.81720      4
1         node_0    node_10  мордвины  мордвины  11.63220      1
2         node_0    node_11  мордвины  мордвины  23.90440      2
3         node_0    node_18  мордвины  мордвины  11.25290      1
4         node_0    node_20  мордвины  мордвины   8.88252      1
...          ...        ...       ...       ...       ...    ...
67498  node_3741  node_3752  белорусы  белорусы   9.51327      1
67499  node_3745  node_3755  белорусы  белорусы   9.23221      1
67500  node_3749  node_3764  белорусы  белорусы  10.63310      1
67501  node_3754  node_3755  украинцы  белорусы   8.04722      1
67502  node_3758  node_3766  белорусы  белорусы   8.77936      1

[67503 rows x 6 columns]
Unique ids in ibd datafile: 3767
OK: Ids are starting from 0
OK: Ids are consecutive
Label dictionary: {'мордвины': 0, 'белорусы': 1, 'украинцы': 2, 'южные-русские': 3, 'северные-русские': 4}
pa

In [4]:
#test|train selection
#10% of every class goes to test
#сгруппируем вершины по классам
nodeclasses = {}
for label in labeldict:
    idx = labeldict[label]
    nodeclasses[label] = np.argwhere(labels==idx).flatten()
   
for label in nodeclasses:
    print (f"{label}: {nodeclasses[label].shape[0]}")


nodeclasses_train = {}
print ("train partition")
for label in nodeclasses:
    print (f"{label}: {nodeclasses_train[label].shape[0]}")
nodeclasses_test = {}
print ("test partition")
for label in nodeclasses:
    print (f"{label}: {nodeclasses_test[label].shape[0]}")
indices_train
indices_test
train_mask = np.array([True]*self.len)
test_mask = np.array([True]*self.len)

#experiment 1: simple one-hot encoding of class as 5 features
def onehot(lbl):
    arr = [0,0,0,0,0]
    arr[lbl]=1
    return arr
        
features = [ onehot(label) for label in labels]
features = np.array(features)

In [59]:
from torch.utils.data import Dataset, DataLoader

class Data(Dataset):
    def __init__(self, X, y, train_mask, test_mask):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.int64))
        self.len = self.X.shape[0]
        self.num_features = 5
        self.num_classes = 5
        self.train_mask = train_mask
        self.test_mask = test_mask
       
    def __getitem__(self, index):
        return self.X[index], self.y[index]
   
    def __len__(self):
        return self.len
   
batch_size = 4

# Instantiate training and test data
train_data = Data(features, labels, train_mask, test_mask)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

In [60]:

import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.lin1 = Linear(train_data.num_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, train_data.num_classes)

    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        return x

model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin1): Linear(in_features=5, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=5, bias=True)
)


In [67]:
model = MLP(hidden_channels=16)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(train_data.X)  # Perform a single forward pass.
    loss = criterion(out[train_data.train_mask], train_data.y[train_data.train_mask])  # Compute the loss solely based on the training nodes.
    #loss = criterion(out, train_data.y)  # Compute the loss solely based on the training nodes.
    
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
    model.eval()
    out = model(train_data.X)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[train_data.test_mask] == train_data.y[train_data.test_mask]  # Check against ground-truth labels.
    
    test_acc = int(test_correct.sum()) / int(train_data.test_mask.sum())  # Derive ratio of correct predictions.
    print(f"Test accuracy: {test_acc}, correct {int(test_correct.sum())} out of {int(train_data.test_mask.sum())}")
    
    return test_acc

for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    

Epoch: 001, Loss: 1.5821
Epoch: 002, Loss: 1.5477
Epoch: 003, Loss: 1.5139
Epoch: 004, Loss: 1.4800
Epoch: 005, Loss: 1.4466
Epoch: 006, Loss: 1.4135
Epoch: 007, Loss: 1.3808
Epoch: 008, Loss: 1.3480
Epoch: 009, Loss: 1.3150
Epoch: 010, Loss: 1.2817
Epoch: 011, Loss: 1.2479
Epoch: 012, Loss: 1.2137
Epoch: 013, Loss: 1.1790
Epoch: 014, Loss: 1.1439
Epoch: 015, Loss: 1.1085
Epoch: 016, Loss: 1.0731
Epoch: 017, Loss: 1.0374
Epoch: 018, Loss: 1.0016
Epoch: 019, Loss: 0.9665
Epoch: 020, Loss: 0.9314
Epoch: 021, Loss: 0.8965
Epoch: 022, Loss: 0.8620
Epoch: 023, Loss: 0.8281
Epoch: 024, Loss: 0.7949
Epoch: 025, Loss: 0.7625
Epoch: 026, Loss: 0.7307
Epoch: 027, Loss: 0.7000
Epoch: 028, Loss: 0.6703
Epoch: 029, Loss: 0.6414
Epoch: 030, Loss: 0.6148
Epoch: 031, Loss: 0.5882
Epoch: 032, Loss: 0.5623
Epoch: 033, Loss: 0.5368
Epoch: 034, Loss: 0.5119
Epoch: 035, Loss: 0.4878
Epoch: 036, Loss: 0.4643
Epoch: 037, Loss: 0.4414
Epoch: 038, Loss: 0.4194
Epoch: 039, Loss: 0.3981
Epoch: 040, Loss: 0.3778


In [68]:
acc = test()

Test accuracy: 1.0, correct 3767 out of 3767
1.0


In [42]:
train_data.X[0:1]

tensor([[1., 0., 0., 0., 0.]])