In [1]:
from util import get_num_lines, get_pos2idx_idx2pos, index_sequence, get_vocab, embed_indexed_sequence, \
    get_word2idx_idx2word, get_embedding_matrix, write_predictions, get_performance_VUAverb_val, \
    get_performance_VUAverb_test, get_performance_VUA_test
from util import TextDatasetWithGloveElmoSuffix as TextDataset
from util import evaluate

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import csv
import h5py
import ast
import matplotlib.pyplot as plt

# Report the runtime environment so the notebook output records which
# PyTorch build ran and whether CUDA was visible.
print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
gpu_detected = torch.cuda.is_available()
print(gpu_detected)
# Follow the detection result instead of hard-coding True: the flag is
# handed to write_predictions later in the file, and claiming a GPU on a
# CPU-only machine contradicts the value printed just above.
using_GPU = gpu_detected

"""
1. Data pre-processing
"""
'''
1.1 VUA
get raw dataset as a list:
  Each element is a triple:
    a sentence: string
    a list of labels: one 0/1 int per token
    a list of pos: one POS tag per token
'''
def _read_vua_split(csv_path):
    """Read one VUA sequence CSV into a list of [sentence, label_seq, pos_seq].

    The first row is treated as a header and skipped. Column 2 is the
    whitespace-tokenised sentence; columns 3 and 4 are Python-literal lists
    (labels and POS tags) parsed with ast.literal_eval.

    Raises:
        ValueError: if a row's token, label and POS counts are not all equal.
    """
    examples = []
    with open(csv_path, encoding='latin-1') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        for row_number, row in enumerate(rows, start=1):
            sentence = row[2]
            pos_seq = ast.literal_eval(row[4])
            label_seq = ast.literal_eval(row[3])
            n_tokens = len(sentence.split())
            # Explicit check rather than assert, so it still runs under -O
            # and reports which row is malformed.
            if not (len(pos_seq) == len(label_seq) == n_tokens):
                raise ValueError(
                    '{}: data row {} has {} tokens, {} labels and {} POS tags'.format(
                        csv_path, row_number, n_tokens, len(label_seq), len(pos_seq)))
            # Kept as a list (not a tuple): the POS entry is replaced in place below.
            examples.append([sentence, label_seq, pos_seq])
    return examples


raw_train_vua = _read_vua_split('VUA_seq_formatted_train.csv')
raw_val_vua = _read_vua_split('VUA_seq_formatted_val.csv')

# Collect every POS tag seen in either split (training rows first).
pos_set = set()
for example in raw_train_vua:
    pos_set.update(example[2])
for example in raw_val_vua:
    pos_set.update(example[2])

# embed the pos tags
pos2idx, idx2pos = get_pos2idx_idx2pos(pos_set)

# Replace each POS tag sequence with its indexed form.
for example in raw_train_vua:
    example[2] = index_sequence(pos2idx, example[2])
for example in raw_val_vua:
    example[2] = index_sequence(pos2idx, example[2])
print('size of training set, validation set: ', len(raw_train_vua), len(raw_val_vua))


"""
2. Data preparation
"""
'''
2. 1
get vocabulary and glove embeddings in raw dataset 
'''
# Vocabulary: the set of words found in the training split.
vocab = get_vocab(raw_train_vua)
# Word <-> index lookup tables; indices 0 and 1 are <PAD> and <UNK>.
word2idx, idx2word = get_word2idx_idx2word(vocab)
# GloVe vectors as an nn.Embedding, loaded without normalization.
glove_embeddings = get_embedding_matrix(
    word2idx,
    idx2word,
    normalization=False,
)
# ELMo embedding files, one HDF5 file per split, opened read-only.
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# Sequence labeling uses no suffix embeddings.
suffix_embeddings = None

'''
2. 2
embed the datasets
'''


def _embed_split(raw_examples, elmo_file):
    """Map [sentence, label_seq, pos_seq] rows to [embedded_sentence, pos, labels] rows."""
    embedded_rows = []
    for row in raw_examples:
        sentence, label_seq, pos_seq = row[0], row[1], row[2]
        embedded_sentence = embed_indexed_sequence(
            sentence, pos_seq, word2idx,
            glove_embeddings, elmo_file, suffix_embeddings)
        # Note the swapped order: POS comes before labels in the output row.
        embedded_rows.append([embedded_sentence, pos_seq, label_seq])
    return embedded_rows


embedded_train_vua = _embed_split(raw_train_vua, elmos_train_vua)
embedded_val_vua = _embed_split(raw_val_vua, elmos_val_vua)


'''
2. 3
set up Dataloader for batching
'''
def _to_text_dataset(embedded_rows):
    """Split [embedded_sentence, pos, labels] rows into columns and wrap them in a TextDataset."""
    sentences = [row[0] for row in embedded_rows]
    pos_seqs = [row[1] for row in embedded_rows]
    label_seqs = [row[2] for row in embedded_rows]
    return TextDataset(sentences, pos_seqs, label_seqs)


# One dataset per split, built from the embedded rows.
train_dataset_vua = _to_text_dataset(embedded_train_vua)
val_dataset_vua = _to_text_dataset(embedded_val_vua)

# Batching hyperparameter shared by both loaders.
batch_size = 64
# Only the training loader shuffles; both use TextDataset.collate_fn to
# assemble batches.
train_dataloader_vua = DataLoader(
    dataset=train_dataset_vua,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=TextDataset.collate_fn,
)
val_dataloader_vua = DataLoader(
    dataset=val_dataset_vua,
    batch_size=batch_size,
    collate_fn=TextDataset.collate_fn,
)




# """
# 3.3
# plot the training process: losses for validation and training dataset
# """
# plt.figure(0)
# plt.title('Loss for VUA dataset')
# plt.xlabel('iteration (unit:200)')
# plt.ylabel('Loss')
# plt.plot(val_loss, 'g')
# plt.plot(train_loss, 'b')
# plt.legend(['Validation loss', 'Training loss'], loc='upper right')
# plt.show()

# plt.figure(1)
# plt.title('Validation F1 for VUA dataset')
# plt.xlabel('iteration (unit:200)')
# plt.ylabel('F1')
# for i in range(len(idx2pos)):
#     plt.plot([x[i] for x in val_f1s])
# plt.legend([idx2pos[i] for i in range(len(idx2pos))], loc='upper left')
# plt.show()

# plt.figure(2)
# plt.title('Training F1 for VUA dataset')
# plt.xlabel('iteration (unit:200)')
# plt.ylabel('F1')
# for i in range(len(idx2pos)):
#     plt.plot([x[i] for x in train_f1s])
# plt.legend([idx2pos[i] for i in range(len(idx2pos))], loc='upper left')
# plt.show()


"""
write the test prediction on the VUA-verb to a file: sequence prediction
read and extract to get a comparable performance on VUA-verb test set.
"""
def get_comparable_performance_test():
    """Write the model's VUA test sequence predictions to CSV, then report scores.

    Calls write_predictions on the test data, saves the returned rows to
    '../predictions/vua_seq_test_predictions_LSTMsequence_vua.csv', and then
    runs the VUA-verb and VUA test performance helpers.

    NOTE(review): raw_test_vua, test_dataloader_vua and RNNseq_model are
    module-level names that must be defined before this is called; they are
    not set up in the visible part of this notebook. The two performance
    helpers take no arguments, so they presumably read the predictions file
    written here; confirm against util.
    """
    result = write_predictions(
        raw_test_vua, test_dataloader_vua, RNNseq_model, using_GPU,
        '../data/VUAsequence/VUA_seq_formatted_test.csv')
    # The with-block closes the file even if writing raises. newline='' is
    # what the csv module requires so row terminators are not translated twice.
    with open('../predictions/vua_seq_test_predictions_LSTMsequence_vua.csv',
              'w', newline='') as f:
        csv.writer(f).writerows(result)

    get_performance_VUAverb_test()
    get_performance_VUA_test()


PyTorch version:
1.0.1.post2
GPU Detected:
False


  5%|▌         | 1075/20725 [00:00<00:01, 10745.07it/s]

size of training set, validation set:  6323 1550
vocab size:  13843


100%|██████████| 20725/20725 [00:00<00:00, 21099.60it/s]


Number of pre-trained word vectors loaded:  6594
Embeddings mean:  0.0002589632640592754
Embeddings stdev:  0.3578852415084839


OSError: Unable to open file (unable to open file: name = '../elmo/VUA_train.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [9]:
# Scratch cell: location of the pre-trained GloVe vectors and an (as yet
# empty) dict intended to hold them.
glove_path = "../glove/glove.840B.300d.txt"
glove_vectors = {}
# Pass the encoding explicitly so decoding does not depend on the platform's
# locale default, as the other open() calls in this file do.
# NOTE(review): assumes the GloVe text file is UTF-8; confirm.
with open(glove_path, encoding="utf-8") as glove_file:
    # Placeholder body: the file is opened but not parsed yet.
    print()




In [2]:
# Notebook display cell: echo the embedding layer to inspect its dimensions.
glove_embeddings

Embedding(13845, 300, padding_idx=0)

In [3]:
# Notebook display cell: echo the training rows ([sentence, label_seq, indexed pos_seq]).
raw_train_vua

[["Ca n't fail to be entertaining .",
  [0, 0, 0, 0, 0, 0, 0],
  [14, 7, 14, 10, 14, 8, 9]],
 ['How much was he going to tell her ?',
  [0, 0, 0, 0, 0, 0, 0, 0, 0],
  [7, 8, 14, 1, 14, 10, 14, 1, 9]],
 ['Up until that news hit the Committee , Don had won the day with his UK Vehicle Division proposals .',
  [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 2, 13, 14, 2, 3, 9, 3, 14, 14, 2, 13, 0, 8, 3, 3, 3, 13, 9]],
 ["Could go on to the rugby and go with them could n't he ?",
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [14, 14, 10, 0, 2, 13, 12, 14, 0, 1, 14, 7, 1, 9]],
 ['Finally , we went to the office and they gave us a cheque , which bounced .',
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0],
  [7, 9, 1, 14, 0, 2, 13, 12, 1, 14, 1, 2, 13, 9, 8, 14, 9]],
 ['It must be shown that the defendant intended ( which , again , includes both purpose and awareness of practical certainty ) to cause really serious injury to someone .',
  [0,
   0,
   0,
   1,
  

In [5]:
import pandas as pd

In [7]:
# Notebook display cell: preview the validation CSV as a DataFrame to inspect its columns.
pd.read_csv('VUA_seq_formatted_val.csv', encoding='latin-1')

Unnamed: 0,txt_id,sen_ix,sentence,label_seq,pos_seq,labeled_sentence,genre
0,acj-fragment01,148,Four alternative approaches have been describe...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['NUM', 'ADJ', 'NOUN', 'VERB', 'VERB', 'VERB',...",Four alternative M_approaches have been descri...,academic
1,ab9-fragment03,908,"I wanted to say , you see , that I know you th...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","['PRON', 'VERB', 'PART', 'VERB', 'PUNCT', 'PRO...","I wanted to say , you M_see , that I know you ...",fiction
2,kbw-fragment42,14929,The one with you chop the chop and then there ...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['DET', 'NOUN', 'ADP', 'PRON', 'VERB', 'DET', ...",The M_one M_with you chop the chop and then th...,conversation
3,b1g-fragment02,772,"Given that most GIS are rather dumb systems , ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['VERB', 'ADP', 'ADJ', 'PROPN', 'VERB', 'ADV',...","Given that most GIS are rather dumb systems , ...",academic
4,a1n-fragment18,350,Lacking a goal that might have altered its che...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","['VERB', 'DET', 'NOUN', 'ADJ', 'VERB', 'VERB',...",Lacking a goal that might have altered its M_c...,news
5,a80-fragment15,297,Equally intriguing will be the line-out perfor...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","['ADV', 'ADJ', 'VERB', 'VERB', 'DET', 'PART', ...",Equally intriguing will be the line-out perfor...,news
6,a9j-fragment01,34,"Thus , wherever one looks , one finds people e...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, ...","['ADV', 'PUNCT', 'ADV', 'PRON', 'VERB', 'PUNCT...","Thus , wherever one M_looks , one M_finds peop...",news
7,a1p-fragment03,70,"As for his colleagues , Mr Smith said : We 're...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['ADP', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'PROPN'...","As for his colleagues , Mr Smith said : We 're...",news
8,a6u-fragment02,279,The red and green of the Aztec necklace links ...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","['DET', 'ADJ', 'CCONJ', 'ADJ', 'ADP', 'DET', '...",The red and green of the Aztec necklace M_link...,academic
9,ac2-fragment06,1473,"Jesus wept ! he continued , hackles rising .","[0, 0, 0, 0, 0, 0, 1, 1, 0]","['PROPN', 'VERB', 'PUNCT', 'PRON', 'VERB', 'PU...","Jesus wept ! he continued , M_hackles M_rising .",fiction
