In [1]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html --quiet

[K     |████████████████████████████████| 7.9 MB 2.1 MB/s 
[K     |████████████████████████████████| 3.5 MB 33.8 MB/s 
[K     |████████████████████████████████| 2.4 MB 22.0 MB/s 
[K     |████████████████████████████████| 709 kB 35.6 MB/s 
[K     |████████████████████████████████| 467 kB 2.2 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd

df = pd.read_csv("iris.csv")
df

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
def toNumber(Species):
  if Species == 'setosa':
    return 0
  elif Species == 'versicolor':
    return 1
  else: 
    return 2

df['label'] = df['Species'].apply(toNumber)

In [72]:
df = df.sample(frac=1)
df

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,label
8,4.4,2.9,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
85,6.0,3.4,4.5,1.6,versicolor,1
125,7.2,3.2,6.0,1.8,virginica,2
67,5.8,2.7,4.1,1.0,versicolor,1
...,...,...,...,...,...,...
93,5.0,2.3,3.3,1.0,versicolor,1
107,7.3,2.9,6.3,1.8,virginica,2
69,5.6,2.5,3.9,1.1,versicolor,1
116,6.5,3.0,5.5,1.8,virginica,2


In [73]:
import numpy as np
from sklearn.neighbors import kneighbors_graph

def getEdges(embeddings):

  A = kneighbors_graph(embeddings, n_neighbors=3, mode = 'connectivity', metric="euclidean", n_jobs = -1)
  A = A.toarray()
  
  edges = []

  for i in range(0, embeddings.shape[0]):
      indexs = np.where(A[i] != 0)
      for index in indexs[0]:
          edges.append([i, index])

  return edges

In [74]:
embeddings = df.iloc[:,:-2].to_numpy()
target = df['label'].to_numpy()

edges = getEdges(embeddings)

In [75]:
import torch
from torch_geometric.data import Data
import torch_geometric.transforms as T


edges = torch.tensor(edges, dtype=torch.long)
embeddings = torch.tensor(embeddings, dtype=torch.float)
target = torch.tensor(target, dtype=torch.long)

data = Data(x=embeddings, edge_index=edges.t().contiguous())

data.num_classes = 3
data.y = target
data

Data(x=[150, 4], edge_index=[2, 450], num_classes=3, y=[150])

In [122]:
from sklearn.model_selection import train_test_split

test_index = 80

def to_mask(index, size):
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = 1
    return mask

def data_split(data):
    index = [i for i in range(0, test_index)]
    index = torch.tensor(index, dtype=torch.long)

    index_train, index_val, y_train, y_val = train_test_split(index, target[0:test_index], test_size=0.4, shuffle = target[0:test_index], random_state=43)

    data.train_mask = to_mask(index_train, size=data.num_nodes)
    data.val_mask = to_mask(index_val, size=data.num_nodes)
    data.test_mask = to_mask(torch.tensor([i for i in range(test_index, data.num_nodes - 1)], dtype=torch.long), size=data.num_nodes)

    return data

In [123]:
data = data_split(data)
data

Data(x=[150, 4], edge_index=[2, 450], num_classes=3, y=[150], train_mask=[150], val_mask=[150], test_mask=[150])

In [124]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 32)
        self.conv2 = GCNConv(32, data.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [140]:
from sklearn.metrics import accuracy_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    #print(accuracy_score(train_mask.y, out.argmax(dim=1)))
    loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask])
    loss.backward()
    optimizer.step()

In [141]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9565


In [142]:
from sklearn.metrics import classification_report

model.eval()
y_pred = model(data).argmax(dim=1)

print(classification_report(data.y[data.test_mask], y_pred[data.test_mask]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      0.88      0.94        26
           2       0.88      1.00      0.94        22

    accuracy                           0.96        69
   macro avg       0.96      0.96      0.96        69
weighted avg       0.96      0.96      0.96        69

