In [1]:
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import nltk

In [202]:
from keras import regularizers
from keras.layers import *
from keras.models import *
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from keras.models import load_model

In [None]:
# Load pretrained word embeddings stored in word2vec *text* format (binary=False).
# Later cells look up word_vectors['UNK'] for out-of-vocabulary tokens, so the
# file is assumed to contain an 'UNK' entry -- TODO confirm.
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('word2vec_300d.txt', binary=False)

In [166]:
def clear_puncts(data):
    """Remove entity tags, lowercase the text, and blank out digits/punctuation.

    Args:
        data: sentence string, possibly containing <e1>...</e1> / <e2>...</e2>
            markup.

    Returns:
        The lowercased string with the four entity tags deleted and every digit
        or listed punctuation character replaced by one space (runs of such
        characters therefore become runs of spaces).
    """
    # Tags are matched case-sensitively and removed before lowercasing.
    data = re.sub(r"</?e[12]>", "", data)
    # Raw string: the pattern no longer relies on invalid string escapes such as
    # "\<" or "\.", which Python reports as deprecated. The character set is
    # unchanged: digits and < > . ! / _ , ~ @ # $ & % ^ * : ? ( ) + - = " '
    data = re.sub(r"[0-9<>.!/_,~@#$&%^*:?()+\-=\"']", " ", data.lower())
    return data

def tokenize(data):
    """Split a string into a list of tokens using NLTK's word_tokenize."""
    return word_tokenize(data)

def clear_stopwords(data):
    """Return the tokens of `data` that are not English stopwords.

    Args:
        data: iterable of token strings.

    Returns:
        A new list holding, in their original order, the tokens that do not
        appear in NLTK's English stopword list.
    """
    # A set gives constant-time membership checks; the plain list returned by
    # NLTK would be scanned from the start for every token.
    english_stopwords = set(stopwords.words("english"))
    return [w for w in data if w not in english_stopwords]

def preprocess(data):
    """Clean a raw sentence with clear_puncts, then tokenize the result."""
    return tokenize(clear_puncts(data))

In [167]:
# Parse TEST_FILE.txt into {sentence_id: {'text': tokens, 'e1': str, 'e2': str}}.
# Each line is expected to look like "<id>\t<sentence with <e1>/<e2> tags>".
test_data = {}
with open("TEST_FILE.txt") as f:
    for line in f:
        l = line.split('\t')
        if len(l) < 2:
            # Blank or tab-less line (for example a trailing empty line): there
            # is no sentence field, so skip it instead of failing on l[1].
            continue
        idx = l[0]
        text = l[1]
        # "+ 4" steps over the 4-character opening tag "<e1>" / "<e2>".
        e1 = text[text.find('<e1>') + 4: text.find('</e1>')]
        e2 = text[text.find('<e2>') + 4: text.find('</e2>')]
        test_data[idx] = {'text': preprocess(text), 'e1': e1, 'e2': e2}

In [168]:
# Parse TRAIN_FILE.txt into {sentence_id: {...}}. Each example is read as four
# lines: the tab-separated "id<TAB>sentence" line, the relation label line, a
# third line (kept in `comment`, not used in this cell) and a fourth line that
# is read and discarded.
train_data = {}
with open("TRAIN_FILE.txt") as f:
    for i in range(8000):  # reads exactly 8000 four-line records
        line = f.readline()
        relation = f.readline().strip('\n')
        comment = f.readline()
        f.readline()
        l = line.split('\t')
        idx = l[0]
        text = l[1]
        train_data[idx] = {}
        e1_big = text.find('<e1>')
        e1_end = text.find('</e1>')
        e2_big = text.find('<e2>')
        e2_end = text.find('</e2>')
        # "+ 4" steps over the 4-character opening tag.
        e1 = text[e1_big + 4: e1_end]
        e2 = text[e2_big + 4: e2_end]
        train_data[idx]['text'] = preprocess(text)
        train_data[idx]['e1'] = e1
        train_data[idx]['e2'] = e2
        # Split a label such as "Name(e1,e2)" into name, first and second argument.
        # NOTE(review): for a label containing neither '(' nor ',', both find()
        # calls return -1, so all three slices evaluate to relation[:-1] (the
        # label minus its last character). The record displayed below shows
        # 'Othe' in all three fields, and later cells test rel == 'Othe', so
        # this must not be changed here without updating those cells.
        h = relation.find('(')
        t = relation.find(',')
        train_data[idx]['rel'] = relation[:h]
        train_data[idx]['head'] = relation[h + 1:t]
        train_data[idx]['tail'] = relation[t + 1:-1]

In [169]:
# Spot-check one parsed training record (id '2').
train_data['2']

{'e1': 'child',
 'e2': 'cradle',
 'head': 'Othe',
 'rel': 'Othe',
 'tail': 'Othe',
 'text': ['the',
  'child',
  'was',
  'carefully',
  'wrapped',
  'and',
  'bound',
  'into',
  'the',
  'cradle',
  'by',
  'means',
  'of',
  'a',
  'cord']}

In [170]:
# Spot-check one parsed test record (id '8002'); test records carry no label fields.
test_data['8002']

{'e1': 'company',
 'e2': 'chairs',
 'text': ['the', 'company', 'fabricates', 'plastic', 'chairs']}

In [182]:
# Directed relation labels. Relation number k owns two class ids: 2k when e1 is
# the first argument of the label and 2k + 1 when e2 is.
classes = {
    name: {'e1': 2 * k, 'e2': 2 * k + 1}
    for k, name in enumerate([
        'Cause-Effect', 'Instrument-Agency', 'Product-Producer',
        'Content-Container', 'Entity-Origin', 'Entity-Destination',
        'Component-Whole', 'Member-Collection', 'Message-Topic',
    ])
}
# The undirected class is keyed 'Othe' because that is the value the feature
# cells compare train_data[...]['rel'] against.
classes['Othe'] = 18

# Inverse mapping: class id -> label string written to the result files.
inf_classes = {
    class_id: name + ('(e1,e2)' if first == 'e1' else '(e2,e1)')
    for name, slots in classes.items() if name != 'Othe'
    for first, class_id in slots.items()
}
inf_classes[18] = 'Other'

# Logistic regression baseline (averaged word vectors)

In [183]:
def _mean_word_vector(tokens):
    """Average the word vectors of `tokens`.

    Tokens missing from `word_vectors` fall back to the 'UNK' vector. An empty
    token list yields an all-zero vector instead of raising ZeroDivisionError.
    """
    unk = word_vectors['UNK']
    if not tokens:
        return np.zeros_like(unk)
    # Starting from the int 0 makes the first "+=" allocate a fresh array, so
    # the vectors stored in word_vectors are never modified in place.
    total = 0
    for w in tokens:
        total += word_vectors[w] if w in word_vectors else unk
    total /= len(tokens)
    return total


# Sentence-level features for the logistic-regression baseline: one averaged
# word vector per sentence, plus an integer class id per training sentence.
train_X = []
train_Y = []
test_X = []
for d in train_data:
    rel = train_data[d]['rel']
    if rel == 'Othe':
        # 'Othe' is how the loader stores the undirected class (id 18).
        y = 18
    else:
        # Directed relation: the class id depends on which entity comes first.
        y = classes[rel][train_data[d]['head']]
    train_Y.append(y)
    train_X.append(_mean_word_vector(train_data[d]['text']))
for d in test_data:
    test_X.append(_mean_word_vector(test_data[d]['text']))

In [184]:
# Stack the per-sentence feature vectors into NumPy arrays for scikit-learn.
train_X, test_X = (np.array(rows) for rows in (train_X, test_X))

In [None]:
# Multinomial logistic regression over the averaged sentence vectors.
lg = LogisticRegression(C=1, multi_class='multinomial', solver='newton-cg')
lg.fit(train_X, train_Y)
# Accuracy on the training data itself (not a held-out estimate).
lg.score(train_X, train_Y)

In [82]:
# Predict the test set and write "<id>\t<label>" lines.
y_pred = lg.predict(test_X)
with open('wvlg_result.txt', 'w') as f:
    # test_X rows were built by iterating test_data, so zipping the dict's keys
    # with the predictions pairs every label with the id actually read from the
    # test file, instead of assuming ids are consecutive and start at 8001.
    for idx, y in zip(test_data, y_pred):
        f.write(str(idx) + '\t' + inf_classes[y] + '\n')

In [192]:
# Length, in tokens, of the longest training sentence (0 if there are none).
max_len = max((len(record['text']) for record in train_data.values()), default=0)

In [193]:
# Display the longest training sentence length; the BLSTM cells below use the
# same value (86) as a literal padding length.
max_len

86

In [45]:
# Vocabulary: every distinct token that appears in the training sentences.
V = {w for record in train_data.values() for w in record['text']}

In [46]:
# Number of distinct training tokens.
len(V)

18703

# BLSTM: bidirectional LSTM classifier

In [194]:
# Inputs for the recurrent models, built directly as NumPy arrays (what Keras
# documents for a single-input model) rather than nested Python lists:
#   train_x: (n_sentences, 86, 300) float32, zero-padded word vectors
#   train_y: (n_sentences, 19) float32, one-hot class targets
train_x = np.zeros((len(train_data), 86, 300), dtype='float32')
train_y = np.zeros((len(train_data), 19), dtype='float32')
for row, d in enumerate(train_data):
    rel = train_data[d]['rel']
    if rel == 'Othe':
        # 'Othe' is how the loader stores the undirected class (id 18).
        train_y[row, 18] = 1
    else:
        train_y[row, classes[rel][train_data[d]['head']]] = 1
    # Positions past the end of the sentence keep their zero padding; the
    # [:86] slice guards against a sentence longer than the fixed window.
    for pos, w in enumerate(train_data[d]['text'][:86]):
        train_x[row, pos] = word_vectors[w] if w in word_vectors else word_vectors['UNK']

In [195]:
# Hold out the first 800 training examples for validation; train on the rest.
X_val, X = train_x[:800], train_x[800:]
Y_val, Y = train_y[:800], train_y[800:]

In [196]:
# Test inputs for the recurrent models: (n_sentences, 86, 300) float32,
# zero-padded word vectors in test_data iteration order.
test_x = np.zeros((len(test_data), 86, 300), dtype='float32')
for row, d in enumerate(test_data):
    # The 86-token window was measured on the training set only, so a longer
    # test sentence is truncated to the length the models were built for
    # instead of producing an over-length row.
    for pos, w in enumerate(test_data[d]['text'][:86]):
        test_x[row, pos] = word_vectors[w] if w in word_vectors else word_vectors['UNK']

In [230]:
# build model
def BLSTM(hidden_size, seq_len=86, embed_dim=300, n_classes=19, dropout=0.3):
    """Build and compile a bidirectional-LSTM sentence classifier.

    Architecture: Bidirectional(LSTM) -> Dropout -> Dense(relu, L2 0.1)
    -> Dropout -> Dense(softmax).

    Args:
        hidden_size: units in each LSTM direction and in the hidden Dense layer.
        seq_len: number of time steps per input sequence (default 86).
        embed_dim: size of each input word vector (default 300).
        n_classes: number of softmax outputs (default 19).
        dropout: rate used by both Dropout layers (default 0.3).

    Returns:
        A Sequential model compiled with RMSprop, categorical cross-entropy
        and the accuracy metric.
    """
    model = Sequential()
    model.add(Bidirectional(
        LSTM(units=hidden_size, unit_forget_bias=True, implementation=2,
             activation='tanh', recurrent_activation='hard_sigmoid',
             kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal'),
        input_shape=(seq_len, embed_dim)))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_size, activation='relu', kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile('RMSprop', 'categorical_crossentropy', metrics=['accuracy'])

    return model
model = BLSTM(hidden_size=128)

In [231]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_20 (Bidirectio (None, 256)               439296    
_________________________________________________________________
dropout_34 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_35 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 19)                2451      
Total params: 474,643
Trainable params: 474,643
Non-trainable params: 0
_________________________________________________________________


In [198]:
# Training configuration for the BLSTM run.
save_path = 'BLSTM.h5'
batch_size = 256
nb_epoch = 20

# Save the full model whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath=save_path,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)
# Stop when val_acc has not improved for 4 epochs.
earlystopping = EarlyStopping(monitor='val_acc', mode='max', patience=4, verbose=1)
# Per-epoch metrics go to a fresh CSV file (append=False).
csv_logger = CSVLogger('%s-log.csv'%'BLSTM', separator=',', append=False)

history = model.fit(
    X, Y,
    batch_size=batch_size,
    epochs=nb_epoch,
    validation_data=(X_val, Y_val),
    callbacks=[checkpoint, earlystopping, csv_logger],
)

Train on 7200 samples, validate on 800 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 00020: early stopping


In [199]:
# Reload the checkpoint written by ModelCheckpoint and label the test set.
model = load_model('BLSTM.h5')
# predict_classes exists only on Sequential models and is absent from newer
# Keras releases; taking the argmax of the softmax output yields the same
# class ids and matches how the attBLSTM cells decode their predictions.
y_pred = np.argmax(model.predict(test_x), axis=-1)

In [200]:
# Write "<id>\t<label>" lines for the BLSTM predictions.
with open('BLSTM_result.txt', 'w') as f:
    # test_x rows follow test_data iteration order, so zipping the dict's keys
    # with the predictions writes the ids actually read from the test file
    # instead of assuming they are consecutive and start at 8001.
    for idx, y in zip(test_data, y_pred):
        f.write(str(idx) + '\t' + inf_classes[y] + '\n')

# attBLSTM: bidirectional LSTM with attention

In [212]:
def attention_3d_block(inputs):
    """Re-weight a (batch, time, features) tensor with learned attention over time.

    The tensor is transposed to (batch, features, time), passed through a
    softmax Dense layer whose width equals the number of time steps (so each
    feature channel gets weights over the time steps that sum to 1),
    transposed back, and multiplied element-wise with the input.

    Args:
        inputs: 3-D Keras tensor whose time dimension (axis 1) is statically
            known.

    Returns:
        A tensor with the same shape as `inputs`.

    NOTE(review): the layer names 'attention_vec' and 'attention_mul' are fixed,
    so this block can presumably be used only once per model -- confirm before
    reusing it.
    """
    # Read the number of time steps from the tensor instead of hard-coding 86,
    # so the block follows whatever sequence length the model is built with.
    time_steps = int(inputs.shape[1])
    a = Permute((2, 1))(inputs)
    a = Dense(time_steps, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
    return output_attention_mul

# build RNN model with attention:
# input -> dropout -> BiLSTM (full sequence) -> attention -> flatten -> dropout -> softmax
inputs = Input(shape=(86, 300))
drop1 = Dropout(0.3)(inputs)
lstm_out = Bidirectional(
    LSTM(
        units=128,
        unit_forget_bias=True,
        implementation=2,
        activation='tanh',
        recurrent_activation='hard_sigmoid',
        kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        return_sequences=True,  # attention needs the output at every time step
    ),
    name='bilstm',
)(drop1)
attention_mul = attention_3d_block(lstm_out)
attention_flatten = Flatten()(attention_mul)
drop2 = Dropout(0.3)(attention_flatten)
output = Dense(19, activation='softmax')(drop2)
model = Model(inputs=inputs, outputs=output)
model.compile('RMSprop', 'categorical_crossentropy', metrics=['accuracy'])

In [213]:
# Print the layer-by-layer output shapes and parameter counts of the attention model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 86, 300)      0                                            
__________________________________________________________________________________________________
dropout_32 (Dropout)            (None, 86, 300)      0           input_10[0][0]                   
__________________________________________________________________________________________________
bilstm (Bidirectional)          (None, 86, 256)      439296      dropout_32[0][0]                 
__________________________________________________________________________________________________
permute_4 (Permute)             (None, 256, 86)      0           bilstm[0][0]                     
__________________________________________________________________________________________________
dense_33 (

In [214]:
# Training configuration for the attention-BLSTM run.
save_path = 'attBLSTM.h5'
batch_size = 256
nb_epoch = 20

# Save the full model whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath=save_path,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)
# Stop when val_acc has not improved for 4 epochs.
earlystopping = EarlyStopping(monitor='val_acc', mode='max', patience=4, verbose=1)
# Per-epoch metrics go to a fresh CSV file (append=False).
csv_logger = CSVLogger('%s-log.csv'%'attBLSTM', separator=',', append=False)

history = model.fit(
    X, Y,
    batch_size=batch_size,
    epochs=nb_epoch,
    validation_data=(X_val, Y_val),
    callbacks=[checkpoint, earlystopping, csv_logger],
)

Train on 7200 samples, validate on 800 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [225]:
# Train the same attention model for up to nb_epoch more epochs, reusing the
# existing checkpoint and early-stopping callbacks and logging to a new CSV file.
csv_logger = CSVLogger('%s-log.csv'%'attBLSTM_cont', separator=',', append=False)
history = model.fit(
    X, Y,
    batch_size=batch_size,
    epochs=nb_epoch,
    validation_data=(X_val, Y_val),
    callbacks=[checkpoint, earlystopping, csv_logger],
)

Train on 7200 samples, validate on 800 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 00007: early stopping


In [226]:
# Reload the attention model from the file the ModelCheckpoint callback writes.
model_3 = load_model('attBLSTM.h5')

In [227]:
# Model outputs for every test sentence; the next cell takes the argmax of each row.
y_pred = model_3.predict(test_x)

In [228]:
# Decode each prediction row to the index of its largest entry (the class id).
result = [np.argmax(probs) for probs in y_pred]

In [229]:
# Write "<id>\t<label>" lines for the attention-BLSTM predictions.
with open('attBLSTM_result.txt', 'w') as f:
    # test_x rows follow test_data iteration order, so zipping the dict's keys
    # with the decoded classes writes the ids actually read from the test file
    # instead of assuming they are consecutive and start at 8001.
    for idx, y in zip(test_data, result):
        f.write(str(idx) + '\t' + inf_classes[y] + '\n')