In [1]:
import os
import torch
import numpy as np
import json
import re
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader


class WordEmbeddingLoader(object):
    """Read pre-trained word vectors from a text file and build a vocabulary.

    Each usable line of the embedding file holds a word followed by
    ``word_dim`` numbers, separated by whitespace.  Lines with any other
    number of fields are ignored.
    """

    def __init__(self):
        self.path_word = "./embedding/hlbl-embeddings-scaled.EMBEDDING_SIZE=50.txt"  # path of pre-trained word embedding
        self.word_dim = 50  # dimension of word embedding

    def _read_vectors(self):
        """Yield ``(word, float32 vector)`` for every well-formed line of the file."""
        with open(self.path_word, 'r', encoding='utf-8') as fr:
            for line in fr:
                fields = line.strip().split()
                if len(fields) != self.word_dim + 1:
                    continue
                yield fields[0], np.asarray(fields[1:], dtype=np.float32)

    def trim_from_pre_embedding(self, vocab):
        """Build an embedding matrix restricted to the words in ``vocab``.

        Args:
            vocab: iterable of words (e.g. a dict keyed by word).

        Returns:
            ``(word2id, word_vec)`` where ``word_vec`` is a float32 tensor with
            one row per entry of ``word2id``.  Words missing from the
            pre-trained file get a uniform(-1, 1) random vector.  The special
            tokens ``*UNKNOWN*`` (random vector) and ``PAD`` (all zeros) are
            appended when they are not already part of the vocabulary.
        """
        # When a word occurs several times in the file the last vector wins.
        pretrained = dict(self._read_vectors())

        word2id = dict()
        trim_word_vec = list()
        for word in vocab:
            if word in word2id:
                # A repeated word would otherwise add a second row and shift
                # every later id away from its vector.
                continue
            word2id[word] = len(word2id)
            if word in pretrained:
                trim_word_vec.append(pretrained[word])
            else:
                trim_word_vec.append(np.random.uniform(-1, 1, self.word_dim))

        if "*UNKNOWN*" not in word2id:
            word2id['*UNKNOWN*'] = len(word2id)
            trim_word_vec.append(np.random.uniform(-1, 1, self.word_dim))
        if "PAD" not in word2id:
            word2id['PAD'] = len(word2id)
            # The padding row is all zeros (previously the unknown-word vector
            # was appended here by mistake).
            trim_word_vec.append(np.zeros(self.word_dim))

        trim_word_vec = np.array(trim_word_vec)
        trim_word_vec = trim_word_vec.astype(np.float32).reshape(-1, self.word_dim)
        return word2id, torch.from_numpy(trim_word_vec)

    def load_embedding(self):
        """Load every vector of the pre-trained file.

        Returns:
            ``(word2id, word_vec)``.  Id 0 is ``PAD`` with an all-zero row; the
            words of the file follow in file order; ``*UNKNOWN*`` is appended
            with a uniform(-1, 1) random vector when the file does not provide
            it.  ``word_vec`` is a float32 tensor with one row per id.
        """
        word2id = {'PAD': 0}  # PAD character
        vectors = [np.zeros(self.word_dim, dtype=np.float32)]  # <pad> is initialized as zero

        for word, vec in self._read_vectors():
            if word in word2id:
                # Keep the first occurrence only so that ids and rows stay
                # aligned (this also protects the reserved PAD entry).
                continue
            word2id[word] = len(word2id)
            vectors.append(vec)

        if "*UNKNOWN*" not in word2id:
            word2id['*UNKNOWN*'] = len(word2id)
            vectors.append(np.random.uniform(-1, 1, self.word_dim))

        word_vec = np.array(vectors).astype(np.float32).reshape(-1, self.word_dim)
        return word2id, torch.from_numpy(word_vec)


class RelationLoader(object):
    """Read the relation <-> id mapping stored in ``<data_dir>/relation2id.txt``."""

    def __init__(self):
        self.data_dir = "./data"

    def __load_relation(self):
        """Parse the mapping file.

        Every line must consist of exactly two whitespace-separated fields:
        the relation name and its integer id.

        Returns:
            ``(rel2id, id2rel, number_of_relations)``.
        """
        rel2id, id2rel = {}, {}
        path = os.path.join(self.data_dir, 'relation2id.txt')
        with open(path, 'r', encoding='utf-8') as fr:
            for raw in fr:
                name, idx = raw.strip().split()
                idx = int(idx)
                rel2id[name] = idx
                id2rel[idx] = name
        return rel2id, id2rel, len(rel2id)

    def get_relation(self):
        """Public entry point; see ``__load_relation`` for the return value."""
        return self.__load_relation()


class SemEvalDateset(Dataset):
    """Dataset of relation-classification samples read from a JSON-lines file.

    Every non-empty line of ``<data_dir>/<filename>`` is a JSON object with the
    keys ``relation``, ``sentence`` (list of tokens), ``subj_start``,
    ``subj_end``, ``obj_start`` and ``obj_end``.

    Each item is ``((sentence_feature, lexical_feature), label_id)`` where
    ``sentence_feature`` is an int64 array of shape ``(1, 4, max_len)`` holding
    the rows [word ids, position ids w.r.t. entity 1, position ids w.r.t.
    entity 2, mask] and ``lexical_feature`` is an int64 array of shape
    ``(1, 6)``.

    Args:
        filename: name of the data file inside ``data_dir``.
        rel2id: mapping relation name -> label id.
        word2id: mapping word -> id; must contain ``'*UNKNOWN*'`` and ``'PAD'``.
        max_len: sentences are truncated / padded to this many tokens.
        pos_dis: relative distances are clipped to ``[-pos_dis, pos_dis]``.
        data_dir: directory containing ``filename``.
    """

    def __init__(self, filename, rel2id, word2id, max_len=96, pos_dis=20, data_dir="./data"):
        self.filename = filename
        self.rel2id = rel2id
        self.word2id = word2id
        self.max_len = max_len
        self.pos_dis = pos_dis
        self.data_dir = data_dir
        self.dataset, self.label = self.__load_data()

    def __get_pos_index(self, x):
        """Map a relative distance to an index in ``[0, 2 * pos_dis + 2]``.

        0 is used for everything left of the window, ``2 * pos_dis + 2`` for
        everything right of it, and in-window distances are shifted by
        ``pos_dis + 1``.
        """
        if x < -self.pos_dis:
            return 0
        if x > self.pos_dis:
            return 2 * self.pos_dis + 2
        return x + self.pos_dis + 1

    def __get_relative_pos(self, x, entity_pos):
        """Position index of token ``x`` relative to the entity span ``(start, end)``.

        Tokens inside the span have distance 0; tokens outside are measured
        from the nearest span boundary.
        """
        if x < entity_pos[0]:
            return self.__get_pos_index(x - entity_pos[0])
        if x > entity_pos[1]:
            return self.__get_pos_index(x - entity_pos[1])
        return self.__get_pos_index(0)

    def __symbolize_sentence(self, e1_pos, e2_pos, sentence):
        """Encode one sentence as an int64 array of shape ``(1, 4, max_len)``.

        Mask values: 1 before the first entity, 2 from the start of the first
        entity up to the end of the second one, 3 afterwards, 0 for padding.
        Position indices are computed for padded slots as well.
        """
        # Order the two entities by their start offset.
        if e1_pos[0] < e2_pos[0]:
            span_start, span_end = e1_pos[0], e2_pos[1]
        else:
            span_start, span_end = e2_pos[0], e1_pos[1]

        mask = [1] * len(sentence)
        for i in range(span_start, span_end + 1):
            mask[i] = 2
        for i in range(span_end + 1, len(sentence)):
            mask[i] = 3

        length = min(self.max_len, len(sentence))
        mask = mask[:length]
        words = [
            self.word2id.get(sentence[i], self.word2id['*UNKNOWN*'])
            for i in range(length)
        ]

        for _ in range(length, self.max_len):
            mask.append(0)  # 'PAD' mask is zero
            words.append(self.word2id['PAD'])

        pos1 = [self.__get_relative_pos(i, e1_pos) for i in range(self.max_len)]
        pos2 = [self.__get_relative_pos(i, e2_pos) for i in range(self.max_len)]

        unit = np.asarray([words, pos1, pos2, mask], dtype=np.int64)
        # Method form of reshape: the ``newshape`` keyword of np.reshape is
        # deprecated in recent NumPy releases.
        return unit.reshape((1, 4, self.max_len))

    def _lexical_feature(self, e1_idx, e2_idx, sent):
        """Word ids of each entity's first token and its two neighbours.

        For each entity the order is ``[w(e), w(e-1), w(e+1)]`` where ``e`` is
        the entity's start index; a neighbour that falls outside the sentence
        is replaced by ``w(e)`` itself.  Entity 1 comes first.

        Returns:
            int64 array of shape ``(1, 6)``.
        """
        def _entity_context(e_idx, sent):
            ''' return [w(e), w(e-1), w(e+1)]
            '''
            left = sent[e_idx - 1] if e_idx >= 1 else sent[e_idx]
            right = sent[e_idx + 1] if e_idx < len(sent) - 1 else sent[e_idx]
            return [sent[e_idx], left, right]

        lexical = _entity_context(e1_idx[0], sent) + _entity_context(e2_idx[0], sent)
        lexical_ids = [self.word2id.get(word, self.word2id['*UNKNOWN*']) for word in lexical]
        lexical_ids = np.asarray(lexical_ids, dtype=np.int64)
        return lexical_ids.reshape((1, 6))

    def __load_data(self):
        """Read the data file and encode every sample.

        Returns:
            ``(data, labels)``: a list of ``(sentence_feature, lexical_feature)``
            tuples and the parallel list of label ids.

        Raises:
            KeyError: if a relation is missing from ``rel2id`` or a record
                lacks one of the expected keys.
        """
        path_data_file = os.path.join(self.data_dir, self.filename)
        data = []
        labels = []
        with open(path_data_file, 'r', encoding='utf-8') as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(line)
                sentence = record['sentence']
                e1_pos = (record['subj_start'], record['subj_end'])
                e2_pos = (record['obj_start'], record['obj_end'])
                label_idx = self.rel2id[record['relation']]

                one_sentence = self.__symbolize_sentence(e1_pos, e2_pos, sentence)
                lexical = self._lexical_feature(e1_pos, e2_pos, sentence)
                data.append((one_sentence, lexical))
                labels.append(label_idx)
        return data, labels

    def __getitem__(self, index):
        return self.dataset[index], self.label[index]

    def __len__(self):
        return len(self.label)


class SemEvalDataLoader(object):
    """Factory for the train / dev / test ``DataLoader`` objects."""

    def __init__(self, rel2id, word2id):
        self.rel2id = rel2id
        self.word2id = word2id

    def __collate_fn(self, batch):
        """Merge ``((sentence_feat, lexical_feat), label)`` samples into tensors.

        The per-sample arrays are concatenated along axis 0; the labels become
        one int64 tensor.

        Returns:
            ``((sentence_feat, lexical_feat), label)``.
        """
        features, targets = zip(*batch)
        sentence_parts = [pair[0] for pair in features]
        lexical_parts = [pair[1] for pair in features]
        sentence_feat = torch.from_numpy(np.concatenate(sentence_parts, axis=0))
        lexical_feat = torch.from_numpy(np.concatenate(lexical_parts, axis=0))
        label = torch.from_numpy(np.asarray(list(targets), dtype=np.int64))
        return (sentence_feat, lexical_feat), label

    def __get_data(self, filename, shuffle=False):
        """Wrap the dataset stored in ``filename`` in a ``DataLoader``."""
        return DataLoader(
            dataset=SemEvalDateset(filename, self.rel2id, self.word2id),
            batch_size=128,
            shuffle=shuffle,
            num_workers=2,
            collate_fn=self.__collate_fn
        )

    def get_train(self):
        """Shuffled loader over ``train.json``."""
        return self.__get_data('train.json', shuffle=True)

    def get_dev(self):
        """Unshuffled loader over ``test.json`` (same file as ``get_test``)."""
        return self.__get_data('test.json', shuffle=False)

    def get_test(self):
        """Unshuffled loader over ``test.json``."""
        return self.__get_data('test.json', shuffle=False)


class processor(object):
    """Convert the raw, tag-annotated text files into JSON-lines files."""

    def __init__(self):
        pass

    def search_entity(self, sentence):
        """Tokenize a sentence annotated with ``<e1>``/``<e2>`` tags.

        Returns:
            ``(e1, e2, subj_start, subj_end, obj_start, obj_end, tokens)`` where
            ``e1``/``e2`` are the raw entity strings, the four indices are
            inclusive token offsets into ``tokens``, and ``tokens`` is the
            tokenized sentence with the tags removed.
        """
        e1 = re.findall(r'<e1>(.*)</e1>', sentence)[0]
        e2 = re.findall(r'<e2>(.*)</e2>', sentence)[0]

        # Surround the tags with spaces so they never stick to a neighbouring word.
        sentence = sentence.replace('<e1>' + e1 + '</e1>', ' <e1> ' + e1 + ' </e1> ', 1)
        sentence = sentence.replace('<e2>' + e2 + '</e2>', ' <e2> ' + e2 + ' </e2> ', 1)

        # The tokenizer splits the tags into pieces; glue them back together.
        joined = ' '.join(word_tokenize(sentence))
        repairs = (
            ('< e1 >', '<e1>'),
            ('< e2 >', '<e2>'),
            ('< /e1 >', '</e1>'),
            ('< /e2 >', '</e2>'),
        )
        for broken, fixed in repairs:
            joined = joined.replace(broken, fixed)
        tokens = joined.split()

        for tag in ('<e1>', '<e2>', '</e1>', '</e2>'):
            assert tag in tokens

        subj_start = subj_end = obj_start = obj_end = 0
        pure_sentence = []
        for word in tokens:
            if word == '<e1>':
                subj_start = len(pure_sentence)
            elif word == '</e1>':
                subj_end = len(pure_sentence) - 1
            elif word == '<e2>':
                obj_start = len(pure_sentence)
            elif word == '</e2>':
                obj_end = len(pure_sentence) - 1
            else:
                pure_sentence.append(word)
        return e1, e2, subj_start, subj_end, obj_start, obj_end, pure_sentence

    def convert(self, path_src, path_des):
        """Rewrite ``path_src`` as one JSON object per line in ``path_des``.

        The source is consumed in groups of four lines: ``id<TAB>sentence``,
        the relation, a comment line, and a fourth line that is not used.
        """
        with open(path_src, 'r', encoding='utf-8') as fr:
            data = fr.readlines()

        with open(path_des, 'w', encoding='utf-8') as fw:
            for start in range(0, len(data), 4):
                id_s, raw_sentence = data[start].strip().split('\t')
                # drop the first and last character of the sentence field
                # (presumably the surrounding quotes - verify against the data)
                parsed = self.search_entity(raw_sentence[1:-1])
                e1, e2, subj_start, subj_end, obj_start, obj_end, tokens = parsed
                meta1 = {
                    'id': id_s,
                    'relation': data[start + 1].strip(),
                    'head': e1,
                    'tail': e2,
                    'subj_start': subj_start,
                    'subj_end': subj_end,
                    'obj_start': obj_start,
                    'obj_end': obj_end,
                    'sentence': tokens,
                    # skip the first 8 characters of the comment line
                    'comment': data[start + 2].strip()[8:],
                }
                fw.write(json.dumps(meta1, ensure_ascii=False))
                fw.write('\n')

class VocabGenerator(object):
    """Collect the set of words used in the train and test JSON-lines files."""

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path

    def get_vocab(self):
        """Return a dict mapping every word of both files to ``1``.

        Words are taken from the ``sentence`` field of each JSON line; the
        train file is read first, then the test file.
        """
        vocab = {}
        for path in (self.train_path, self.test_path):
            with open(path, 'r', encoding='utf-8') as fr:
                for raw in fr:
                    record = json.loads(raw.strip())
                    vocab.update(dict.fromkeys(record['sentence'], 1))
        return vocab

if __name__ == '__main__':
    # Convert the raw train / test files to JSON lines, then collect the
    # vocabulary of the converted files.
    path_train = './data/TRAIN_FILE.TXT'
    path_test = './data/FULL_TEST.txt'
    processor1 = processor()
    for raw_path, json_path in (
        (path_train, './data/train.json'),
        (path_test, './data/test.json'),
    ):
        processor1.convert(raw_path, json_path)
    vocab = VocabGenerator('./data/train.json', './data/test.json').get_vocab()


In [4]:
# Load the full pre-trained embedding table and the relation <-> id mapping.
word2id, word_vec = WordEmbeddingLoader().load_embedding()
rel2id, id2rel, class_num = RelationLoader().get_relation()
# Bare expression: the notebook displays the mapping as the cell output.
rel2id

{'Other': 0,
 'Cause-Effect(e1,e2)': 1,
 'Cause-Effect(e2,e1)': 2,
 'Component-Whole(e1,e2)': 3,
 'Component-Whole(e2,e1)': 4,
 'Content-Container(e1,e2)': 5,
 'Content-Container(e2,e1)': 6,
 'Entity-Destination(e1,e2)': 7,
 'Entity-Destination(e2,e1)': 8,
 'Entity-Origin(e1,e2)': 9,
 'Entity-Origin(e2,e1)': 10,
 'Instrument-Agency(e1,e2)': 11,
 'Instrument-Agency(e2,e1)': 12,
 'Member-Collection(e1,e2)': 13,
 'Member-Collection(e2,e1)': 14,
 'Message-Topic(e1,e2)': 15,
 'Message-Topic(e2,e1)': 16,
 'Product-Producer(e1,e2)': 17,
 'Product-Producer(e2,e1)': 18}

In [7]:
# Bare expression: the notebook displays the embedding matrix size as the cell output.
word_vec.size()

torch.Size([246123, 50])

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init


class CNN(nn.Module):
    """Convolutional relation classifier combining sentence and lexical features.

    The sentence branch embeds words and two relative-position sequences,
    applies one convolution, masks padded positions, max-pools over the
    sequence and projects to ``hidden_size``.  The lexical branch embeds six
    word ids.  Both are concatenated and fed to a linear output layer.

    Args:
        word_vec: float tensor of pre-trained word vectors (one row per word).
        class_num: number of output classes.
    """

    def __init__(self, word_vec, class_num):
        super().__init__()
        self.word_vec = word_vec
        self.class_num = class_num

        # fixed hyper-parameters
        self.max_len = 96
        self.word_dim = 50
        self.pos_dim = 5
        self.pos_dis = 20

        self.dropout_value = 0.5
        self.filter_num = 200
        self.window = 3
        self.hidden_size = 100

        # width of one token representation: word + two position embeddings
        self.dim = self.word_dim + 2 * self.pos_dim

        pos_vocab_size = 2 * self.pos_dis + 3
        self.word_embedding = nn.Embedding.from_pretrained(embeddings=self.word_vec, freeze=False)
        self.pos1_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=self.pos_dim)
        self.pos2_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=self.pos_dim)

        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=self.filter_num,
            kernel_size=(self.window, self.dim),
            stride=(1, 1),
            bias=False,
            padding=(1, 0),
            padding_mode='zeros'
        )
        self.maxpool = nn.MaxPool2d((self.max_len, 1))
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(self.dropout_value)
        self.linear = nn.Linear(in_features=self.filter_num, out_features=self.hidden_size, bias=False)
        self.dense = nn.Linear(
            in_features=self.hidden_size + 6 * self.word_dim,
            out_features=self.class_num,
            bias=False,
        )

        # Xavier-normal initialisation for everything except the pre-trained word table.
        for layer in (self.pos1_embedding, self.pos2_embedding, self.conv, self.linear, self.dense):
            init.xavier_normal_(layer.weight)

    def encoder_layer(self, token, pos1, pos2):
        """Concatenate word and position embeddings along the last dimension."""
        pieces = [
            self.word_embedding(token),
            self.pos1_embedding(pos1),
            self.pos2_embedding(pos2),
        ]
        return torch.cat(tensors=pieces, dim=-1)

    def conv_layer(self, emb, mask):
        """Convolve the embedded sentence and set padded positions to ``-inf``.

        Positions where ``mask`` equals 0 are filled with ``-inf`` so that they
        can never win the subsequent max-pooling.  The result has a trailing
        singleton dimension.
        """
        feature_map = self.conv(emb.unsqueeze(dim=1))
        feature_map = feature_map.view(-1, self.filter_num, self.max_len)

        is_pad = mask.unsqueeze(dim=1).expand(-1, self.filter_num, -1).eq(0)
        feature_map = feature_map.masked_fill_(is_pad, float('-inf'))
        return feature_map.unsqueeze(dim=-1)

    def single_maxpool_layer(self, conv):
        """Max-pool over the sequence axis and flatten to ``(-1, filter_num)``."""
        return self.maxpool(conv).view(-1, self.filter_num)

    def forward(self, data):
        """Compute class logits.

        Args:
            data: pair ``(sentence_feat, lexical_feat)``; ``sentence_feat`` is
                indexed as ``[:, k, :]`` for k = 0..3 (word ids, position ids 1,
                position ids 2, mask) and ``lexical_feat`` is viewed as
                ``(-1, 6)`` word ids.
        """
        sentence_feat, lexical_feat = data[0], data[1]
        token, pos1, pos2, mask = (
            sentence_feat[:, row, :].view(-1, self.max_len) for row in range(4)
        )

        # lexical branch
        lexical_emb = self.word_embedding(lexical_feat.view(-1, 6))
        lexical_emb = lexical_emb.view(-1, self.word_dim * 6)

        # sentence branch
        emb = self.dropout(self.encoder_layer(token, pos1, pos2))
        pool = self.single_maxpool_layer(self.conv_layer(emb, mask))
        sentence_feature = self.dropout(self.tanh(self.linear(pool)))

        features = torch.cat((lexical_emb, sentence_feature), 1)
        return self.dense(features)


In [8]:
class Eval(object):
    """Run a model over a data loader and score its predictions."""

    def __init__(self):
        pass

    def evaluate(self, model, criterion, data_loader):
        """Evaluate ``model`` on every batch of ``data_loader``.

        The predicted class is the best-scoring class among indices 1 and
        above; when that best score is negative the prediction is class 0.

        Returns:
            ``(f1, eval_loss, predict_label)``: the value returned by
            ``semeval_scorer``, the per-sample average loss, and the int64
            array of predicted label ids.
        """
        predictions = []
        references = []
        loss_sum = 0.0
        with torch.no_grad():
            model.eval()
            for data, label in data_loader:
                batch = (data[0], data[1])

                scores = model(batch)
                # weight the batch loss by the batch size
                loss_sum += criterion(scores, label).item() * scores.shape[0]

                # best class among indices 1..; shift back to absolute ids
                best, pred = torch.max(scores[:, 1:], dim=1)
                best = best.cpu().detach().numpy().reshape((-1, 1))
                pred = (pred + 1).cpu().detach().numpy().reshape((-1, 1))

                # negative best score -> class 0
                pred[best < 0] = 0

                predictions.append(pred)
                references.append(label.cpu().detach().numpy().reshape((-1, 1)))

        predict_label = np.concatenate(predictions, axis=0).reshape(-1).astype(np.int64)
        true_label = np.concatenate(references, axis=0).reshape(-1).astype(np.int64)
        eval_loss = loss_sum / predict_label.shape[0]

        f1 = semeval_scorer(predict_label, true_label)
        return f1, eval_loss, predict_label



In [11]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

def print_result(predict_label, id2rel, start_idx=8001):
    """Write one ``<id>\\t<relation>`` line per prediction.

    The output goes to ``script/predicted_result.txt``; ids count up from
    ``start_idx`` and label ids are translated through ``id2rel``.
    """
    with open('script/predicted_result.txt', 'w', encoding='utf-8') as fw:
        for sample_id, label_id in enumerate(predict_label, start=start_idx):
            relation = id2rel[int(label_id)]
            fw.write('{}\t{}\n'.format(sample_id, relation))


def train(model, criterion, loader):
    """Train ``model`` for 100 epochs, keeping the checkpoint with the best dev F1.

    Args:
        model: network to optimise; called as ``model((sent_feat, lex_feat))``.
        criterion: loss taking ``(logits, label)``.
        loader: sequence ``(train_loader, dev_loader, <ignored>)``.

    After every epoch the model is evaluated on the train and dev loaders, and
    its state dict is written to ``./model/model.pkl`` whenever the dev F1
    improves on the best value seen so far.
    """
    train_loader, dev_loader, _ = loader
    print(loader)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

    print(model)
    trainable = ((n, p) for n, p in model.named_parameters() if p.requires_grad)
    for name, param in trainable:
        print('%s :  %s' % (name, str(param.data.shape)))
    print('--------------------------------------')
    print('start to train the model ...')

    eval_tool = Eval()
    best_f1 = -float('inf')
    for epoch in range(1, 101):
        for data, label in train_loader:
            # evaluation switches the model to eval mode, so re-enable training mode
            model.train()
            batch = (data[0], data[1])

            optimizer.zero_grad()
            loss = criterion(model(batch), label)
            loss.backward()
            optimizer.step()

        _, train_loss, _ = eval_tool.evaluate(model, criterion, train_loader)
        f1, dev_loss, _ = eval_tool.evaluate(model, criterion, dev_loader)

        print('[%03d] train_loss: %.3f | dev_loss: %.3f | micro f1 on dev: %.4f' % (epoch, train_loss, dev_loss, f1), end=' ')
        if not f1 > best_f1:
            continue

        # new best dev score: remember it and save a checkpoint
        best_f1 = f1
        model_dir = os.path.join("./model")
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        torch.save(model.state_dict(), os.path.join(model_dir, 'model.pkl'))


def test(model, criterion, loader, model_dir="./model"):
    """Load the saved checkpoint and evaluate it on the test loader.

    Args:
        model: network whose weights are replaced by the saved state dict.
        criterion: loss taking ``(logits, label)``.
        loader: sequence ``(<ignored>, <ignored>, test_loader)``.
        model_dir: directory containing ``model.pkl``.  Defaults to
            ``"./model"``, the directory ``train`` saves to.  (It used to be
            read from an undefined name, which raised ``NameError``.)

    Returns:
        The array of predicted label ids returned by ``Eval.evaluate``.
    """
    print('--------------------------------------')
    print('start test ...')
    _, _, test_loader = loader
    model.load_state_dict(torch.load(os.path.join(model_dir, 'model.pkl')))
    eval_tool = Eval()
    f1, test_loss, predict_label = eval_tool.evaluate(model, criterion, test_loader)
    print('test_loss: %.3f | micro f1 on test:  %.4f' % (test_loss, f1))
    return predict_label


if __name__ == '__main__':
    # Resources: vocabulary, pre-trained embeddings and relation mapping.
    vocab = VocabGenerator('data/train.json', 'data/test.json').get_vocab()
    word2id, word_vec = WordEmbeddingLoader().load_embedding()
    rel2id, id2rel, class_num = RelationLoader().get_relation()

    # Data loaders; `loader` is rebound to the [train, dev, test] list that
    # train() and test() expect.
    loader = SemEvalDataLoader(rel2id, word2id)
    train_loader = loader.get_train()
    dev_loader = loader.get_dev()
    test_loader = loader.get_test()
    loader = [train_loader, dev_loader, test_loader]
    print('finish!')

    print('--------------------------------------')
    model = CNN(word_vec=word_vec, class_num=class_num)
    criterion = nn.CrossEntropyLoss()

    # Train, evaluate the saved checkpoint, then dump the predictions.
    train(model, criterion, loader)
    predict_label = test(model, criterion, loader)
    print_result(predict_label, id2rel)


finish!
--------------------------------------
CNN()
(word_embedding): Embedding(246123, 50)
(pos1_embedding): Embedding(103, 5)
(pos2_embedding): Embedding(103, 5)
(conv): Conv2d(1, 200, kernel_size=(3, 60), stride=(1, 1), padding=(1, 0))
(maxpool): MaxPool2d(kernel_size=(100, 1), stride=(100, 1), padding=0, dilation=1, ceil_mode=False)
(tanh): Tanh()
(dropout): Dropout(p=0.5, inplace=False)
(linear): Linear(in_features=200, out_features=100, bias=True)
(dense): Linear(in_features=100, out_features=19, bias=True)
traning model parameters:
word_embedding.weight :  torch.Size([246123, 50])
pos1_embedding.weight :  torch.Size([103, 5])
pos2_embedding.weight :  torch.Size([103, 5])
conv.weight :  torch.Size([200, 1, 3, 60])
conv.bias :  torch.Size([200])
linear.weight :  torch.Size([100, 200])
linear.bias :  torch.Size([100])
dense.weight :  torch.Size([19, 100])
dense.bias :  torch.Size([19])
--------------------------------------
start to train the model ...
[001] train_loss: 2.446 | de