In [None]:
# import modules and configuration set-up
import os
import os.path as osp

import json
import numpy as np
import torch

In [None]:
# load data and feature file 
# with open('task3_train.txt','r')as f:
#     task3_train=f.read()

### Word representations
1. One-hot encoding
2. Word embedding + node attributes (e.g. position, contextual information (introduces local bias), tag information (POS))

In [2]:
# Lookup tables for the word-representation stage.
# Placeholders that are not enabled yet: word2idx, idx2word, emb2idx, idx2emb.
word2emb = {}  # presumably maps a word to its embedding -- TODO confirm

### Word pairs / Relation inference
1. lexical relations
2. syntactic dependency obtained from parsing tree
3. co-occurrence
4. word similarity
5. 句法结构上地位相同
6. 词性/NER等同
7. 或直接的fully-connected （这样就类似于self-attention模型了）

In [None]:
# Window size; presumably the context window for co-occurrence word pairs
# -- TODO confirm (it is not referenced in the visible cells).
win_size = 2

### Graph construction
TODO 将relation type表示为edge attributes

In [None]:
from torch_geometric.data import Data
from torch_geometric.data import Dataset

In [None]:
def construct_graph(sentence, label, pairs=None):
    """Build a torch_geometric ``Data`` graph for one labelled sentence.

    Every token of ``sentence`` becomes a node, indexed by its position.

    Args:
        sentence: Sequence of tokens; only its length is used here.
        label: Graph-level target, stored as ``y = torch.tensor([label])``.
        pairs: Iterable of ``(source, target)`` node-index pairs, one per
            directed edge. ``None`` (default) models the sentence as a
            fully-connected graph, i.e. each node is connected with all of
            the other nodes in both directions (no self-loops).

    Returns:
        Data: Graph with ``edge_index`` of shape ``[2, num_edges]``, ``y`` and
        ``num_nodes`` set. ``x`` stays ``None`` until node features are
        implemented.
    """
    num_nodes = len(sentence)
    if pairs is None:
        edges = [[src, dst]
                 for src in range(num_nodes)
                 for dst in range(num_nodes)
                 if src != dst]
    else:
        edges = [[int(src), int(dst)] for src, dst in pairs]

    if edges:
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    else:
        # An empty list would yield a 1-D tensor; keep the [2, 0] layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    x = None  # TODO: node features (e.g. word embedding + node attributes)
    return Data(x=x, edge_index=edge_index, y=torch.tensor([label]),
                num_nodes=num_nodes)

### GAT

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv

# Load the Planetoid 'Cora' dataset with the NormalizeFeatures transform.
dataset_name = 'Cora'
# `__file__` is undefined inside a notebook kernel, so fall back to the
# working directory there; a plain script still resolves next to its file.
base_dir = (osp.dirname(osp.realpath(__file__))
            if '__file__' in globals() else os.getcwd())
path = osp.join(base_dir, '..', 'data', dataset_name)
# Pass the transform by keyword: newer Planetoid signatures take `split`
# as the third positional argument.
dataset = Planetoid(path, dataset_name, transform=T.NormalizeFeatures())
data = dataset[0]


class Net(torch.nn.Module):
    """Two-layer GAT that always runs on the module-level ``data`` graph."""

    def __init__(self):
        super().__init__()
        # conv1 uses 8 heads of 8 channels; conv2 consumes 8 * 8 inputs.
        self.conv1 = GATConv(dataset.num_features, 8, heads=8, dropout=0.6)
        self.conv2 = GATConv(8 * 8, dataset.num_classes, dropout=0.6)

    def forward(self):
        """Return the log-softmax (over dim 1) of the second layer's output."""
        edge_index = data.edge_index
        hidden = F.dropout(data.x, p=0.6, training=self.training)
        hidden = self.conv1(hidden, edge_index)
        hidden = F.elu(hidden)
        hidden = F.dropout(hidden, p=0.6, training=self.training)
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)


# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move both the network and the single graph onto the chosen device.
model = Net().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)


def train():
    """Take one optimiser step on the NLL loss over ``data.train_mask`` nodes."""
    model.train()
    optimizer.zero_grad()
    train_mask = data.train_mask
    log_probs = model()
    loss = F.nll_loss(log_probs[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()


def test():
    """Return a list with one accuracy per mask yielded by
    ``data('train_mask', 'val_mask', 'test_mask')``."""
    model.eval()
    log_probs = model()

    def masked_accuracy(mask):
        # max(1) returns (values, indices); the indices are the predictions.
        _, predicted = log_probs[mask].max(1)
        correct = predicted.eq(data.y[mask]).sum().item()
        return correct / mask.sum().item()

    return [masked_accuracy(mask)
            for _, mask in data('train_mask', 'val_mask', 'test_mask')]


# Train for epochs 1..200, printing the accuracies returned by test() each time.
log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
for epoch in range(1, 201):
    train()
    accs = test()
    print(log.format(epoch, *accs))
    
#-----
import os.path as osp

import torch
import torch.nn.functional as F
from torch_geometric.datasets import PPI
from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv
from sklearn.metrics import f1_score

# `__file__` is undefined inside a notebook kernel, so fall back to the
# working directory there; a plain script still resolves next to its file.
base_dir = (osp.dirname(osp.realpath(__file__))
            if '__file__' in globals() else os.getcwd())
path = osp.join(base_dir, '..', 'data', 'PPI')

train_dataset = PPI(path, split='train')
# Validation uses the dedicated 'val' split so the per-epoch score is not
# computed on the test graphs.
val_dataset = PPI(path, split='val')
test_dataset = PPI(path, split='test')

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    """Three GAT layers, each summed with a linear projection of its own input."""

    def __init__(self):
        super().__init__()
        num_features = train_dataset.num_features
        num_classes = train_dataset.num_classes
        width = 4 * 256  # 4 heads x 256 channels
        # Layers are created in the order conv1, lin1, conv2, lin2, conv3, lin3.
        self.conv1 = GATConv(num_features, 256, heads=4)
        self.lin1 = torch.nn.Linear(num_features, width)
        self.conv2 = GATConv(width, 256, heads=4)
        self.lin2 = torch.nn.Linear(width, width)
        self.conv3 = GATConv(width, num_classes, heads=6, concat=False)
        self.lin3 = torch.nn.Linear(width, num_classes)

    def forward(self, x, edge_index):
        """Return the raw (un-activated) output of the third layer for ``x``."""
        first = self.conv1(x, edge_index) + self.lin1(x)
        first = F.elu(first)
        second = self.conv2(first, edge_index) + self.lin2(first)
        second = F.elu(second)
        return self.conv3(second, edge_index) + self.lin3(second)


# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net()
model = model.to(device)
# The network returns raw logits, hence the with-logits BCE loss.
loss_op = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.005,
)


def train():
    """Run one pass over ``train_loader``.

    Returns:
        The summed per-batch loss (each weighted by its number of graphs)
        divided by ``len(train_loader.dataset)``.
    """
    model.train()

    running_loss = 0
    for batch in train_loader:
        graphs_in_batch = batch.num_graphs
        batch.batch = None
        batch = batch.to(device)

        optimizer.zero_grad()
        logits = model(batch.x, batch.edge_index)
        loss = loss_op(logits, batch.y)
        running_loss += loss.item() * graphs_in_batch
        loss.backward()
        optimizer.step()

    dataset_size = len(train_loader.dataset)
    return running_loss / dataset_size


def test(loader):
    """Return the micro-averaged F1 score of ``model`` over ``loader``.

    A label is predicted positive when its output is greater than 0.
    Returns 0 when no label is predicted positive at all.
    """
    model.eval()

    targets, predictions = [], []
    for batch in loader:
        targets.append(batch.y)
        with torch.no_grad():
            logits = model(batch.x.to(device), batch.edge_index.to(device))
        predictions.append((logits > 0).float().cpu())

    y_true = torch.cat(targets, dim=0).numpy()
    y_pred = torch.cat(predictions, dim=0).numpy()
    if y_pred.sum() > 0:
        return f1_score(y_true, y_pred, average='micro')
    return 0


# Train for epochs 1..100, reporting the loss and the val_loader F1 each epoch.
summary = 'Epoch: {:02d}, Loss: {:.4f}, F1: {:.4f}'
for epoch in range(1, 101):
    loss = train()
    acc = test(val_loader)  # note: this value is an F1 score
    print(summary.format(epoch, loss, acc))

### Self-attention / Transformer
```
SSAN
    num_layers=1,
    self_attention_heads=1,
    qkv_projections_bias_and_activation=True,
    self_attention_sublayer_bias_and_activation=True,
    ffnn_sublayer=False,
    self_attention_sublayer_residual_and_norm=False,
    ffnn_sublayer_residual_and_norm=False,
    include_positional_encoding=False,                      # PE
    positional_encoding_denominator_term=10000,
    use_relative_positions=True,                            # RPR
    max_relative_positions=10,                              # clipping distance
Transformer Encoder
    num_layers=2,
    self_attention_heads=8,
    qkv_projections_bias_and_activation=False,              # Linear projections instead of FFNN
    self_attention_sublayer_bias_and_activation=False,
    ffnn_sublayer=True,
    self_attention_sublayer_residual_and_norm=True,
    ffnn_sublayer_residual_and_norm=True,
    include_positional_encoding=True,                       # PE
    positional_encoding_denominator_term=10000,
    use_relative_positions=False,                           # RPR
    max_relative_positions=10,
```

In [None]:
class SelfAttention(torch.nn.Module):
    """Skeleton of a self-attention layer; only its configuration is stored.

    Args:
        hidden_size: Kept on the module as ``hidden_size``.
        batch_first: Kept on the module as ``batch_first``; presumably
            whether inputs are batch-major -- TODO confirm.
    """

    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first
        # TODO: attention parameters and forward() are not implemented yet.

In [None]:
def positional_encoding():
    """Placeholder for positional encoding; not implemented, returns None."""
    return None

### Classifier
1. 直接用soft-max层得到各个class的概率
2. Other classifier
3. TODO multitask

### Train

### Evaluation