<a href="https://colab.research.google.com/github/LaTarn14/Thai-Coreference/blob/main/Coref_Thai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparing

In [None]:
!gdown 14k1PLN9MVszCK6zUIRcLzWLwEGdR0ABw #TNCembeddings-200dimentions

!gdown 1qSwQCmCZQ8SDzBP_cHUsZiIuAHIfg171 #for modeling 
!unzip e2e-coref-e2e.zip
!rm e2e-coref-e2e.zip

!gdown 1FHbKGKXFPOi6v09H6Me3OpuYY66aTXxZ #training data
!unzip traindevtest_all.zip
!rm traindevtest_all.zip

In [None]:
!pip install scikit-learn==0.21.1 tensorflow==1.15 numpy==1.19.5
!pip install gensim pyhocon pyhocon jsonlines
import time, json, collections, re, glob, jsonlines
import tensorflow as tf
import numpy as np
from collections import Counter
from sklearn.utils.linear_assignment_ import linear_assignment
from gensim.models import Word2Vec, KeyedVectors

In [None]:
#setup env
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
!bash setup_all.sh
import os, operator, random, math, threading, h5py, copy, six
import tensorflow_hub as hub
import torch, util, coref_ops, conll, metrics

# Evaluator

In [114]:
def shape(x, n):
  return x.get_shape()[n].value or tf.shape(x)[n]

In [115]:
def time_used(start_time):
  curr_time = time.time()
  used_time = curr_time-start_time
  m = used_time // 60
  s = used_time - 60 * m
  return "%d m %d s" % (m, s)

In [116]:
def f1(p_num, p_den, r_num, r_den, beta=1):
    p = 0 if p_den == 0 else p_num / float(p_den)
    r = 0 if r_den == 0 else r_num / float(r_den)
    return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)

class CorefEvaluator(object):
    def __init__(self):
        self.evaluators = [Evaluator(m) for m in (muc, b_cubed, ceafe)]

    def update(self, predicted, gold, mention_to_predicted, mention_to_gold):
        for e in self.evaluators:
            e.update(predicted, gold, mention_to_predicted, mention_to_gold)

    def get_f1(self):
        return sum(e.get_f1() for e in self.evaluators) / len(self.evaluators)

    def get_recall(self):
        return sum(e.get_recall() for e in self.evaluators) / len(self.evaluators)

    def get_precision(self):
        return sum(e.get_precision() for e in self.evaluators) / len(self.evaluators)

    def get_prf(self):
        return self.get_precision(), self.get_recall(), self.get_f1()

class Evaluator(object):
    def __init__(self, metric, beta=1):
        self.p_num = 0
        self.p_den = 0
        self.r_num = 0
        self.r_den = 0
        self.metric = metric
        self.beta = beta

    def update(self, predicted, gold, mention_to_predicted, mention_to_gold):
        if self.metric == ceafe:
            pn, pd, rn, rd = self.metric(predicted, gold)
        else:
            pn, pd = self.metric(predicted, mention_to_gold)
            rn, rd = self.metric(gold, mention_to_predicted)
        self.p_num += pn
        self.p_den += pd
        self.r_num += rn
        self.r_den += rd

    def get_f1(self):
        return f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta)

    def get_recall(self):
        return 0 if self.r_num == 0 else self.r_num / float(self.r_den)

    def get_precision(self):
        return 0 if self.p_num == 0 else self.p_num / float(self.p_den)

    def get_prf(self):
        return self.get_precision(), self.get_recall(), self.get_f1()

    def get_counts(self):
        return self.p_num, self.p_den, self.r_num, self.r_den


def evaluate_documents(documents, metric, beta=1):
    evaluator = Evaluator(metric, beta=beta)
    for document in documents:
        evaluator.update(document)
    return evaluator.get_precision(), evaluator.get_recall(), evaluator.get_f1()


def b_cubed(clusters, mention_to_gold):
    num, dem = 0, 0
    for c in clusters:
        if len(c) == 1:
            continue
        gold_counts = Counter()
        correct = 0
        for m in c:
            if m in mention_to_gold:
                gold_counts[tuple(mention_to_gold[m])] += 1
        for c2, count in gold_counts.items():
            if len(c2) != 1:
                correct += count * count
        num += correct / float(len(c))
        dem += len(c)

    return num, dem


def muc(clusters, mention_to_gold):
    tp, p = 0, 0
    for c in clusters:
        p += len(c) - 1
        tp += len(c)
        linked = set()
        for m in c:
            if m in mention_to_gold:
                linked.add(mention_to_gold[m])
            else:
                tp -= 1
        tp -= len(linked)
    return tp, p


def phi4(c1, c2):
    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))


def ceafe(clusters, gold_clusters):
    clusters = [c for c in clusters if len(c) != 1]
    scores = np.zeros((len(gold_clusters), len(clusters)))
    for i in range(len(gold_clusters)):
        for j in range(len(clusters)):
            scores[i, j] = phi4(gold_clusters[i], clusters[j])
    matching = linear_assignment(-scores)
    similarity = sum(scores[matching[:, 0], matching[:, 1]])
    return similarity, len(clusters), similarity, len(gold_clusters)

# e2e-coref Model


In [138]:
class CorefModel(object):
  def __init__(self, use_coarse_to_fine):
    tf.reset_default_graph()
    self.embedding_size = 200
    #Hyperparameters
    self.embedding_dropout_rate = 0.5 #The dropout rate for word embeddings
    self.hidden_dropout_rate = 0.2 #The dropout rate for the hidden layers of LSTM and FFNN
    self.max_ant = 10 # The maximum number of candidate antecedents we will give to each of the candidate mentions.
    self.lstm_size = 200 #The size of the LSTM layer
    self.ffnn_layer = 2 #The number of hidden layers used for the FFNN
    self.hidden_size_men = 150 #The size of the hidden layer for antecedents scores calculation
    self.hidden_size_ant = 150 #The size of the hidden layer for mention scores calculation
    self.max_span_width = 10
    self.learning_rate = 2e-4
    self.decay_rate = 0.999
    self.decay_frequency = 100
    self.coarse_to_fine = use_coarse_to_fine
    self.use_features = True
    self.feature_size = 20
    self.model_heads = True

In [139]:
  def add_placeholder(self):
    self.word_embeddings = tf.placeholder(tf.float32, shape=[None, None, self.embedding_size])
    self.sent_lengths = tf.placeholder(tf.int32, shape=[None])
    self.mention_starts = tf.placeholder(tf.int32, shape=[None])
    self.mention_ends = tf.placeholder(tf.int32, shape=[None])
    self.mention_cluster_ids = tf.placeholder(tf.int32, shape=[None])
    self.is_training = tf.placeholder(tf.bool, shape=[])
  CorefModel.add_placeholder = add_placeholder

In [140]:
  #antecedent scores calculation
  def get_slow_antecedent_scores(self, top_span_emb, top_antecedents, top_antecedent_emb, top_antecedent_offsets):
    k = util.shape(top_span_emb, 0)
    c = util.shape(top_antecedents, 1)

    feature_emb_list = []

    if self.use_features:
      antecedent_distance_buckets = self.bucket_distance(top_antecedent_offsets) # [k, c]
      antecedent_distance_emb = tf.gather(tf.get_variable("antecedent_distance_emb", [10, self.feature_size]), antecedent_distance_buckets) # [k, c]
      feature_emb_list.append(antecedent_distance_emb)

    feature_emb = tf.concat(feature_emb_list, 2) # [k, c, emb]
    feature_emb = tf.nn.dropout(feature_emb, self.dropout) # [k, c, emb]

    target_emb = tf.expand_dims(top_span_emb, 1) # [k, 1, emb]
    similarity_emb = top_antecedent_emb * target_emb # [k, c, emb]
    target_emb = tf.tile(target_emb, [1, c, 1]) # [k, c, emb]

    pair_emb = tf.concat([target_emb, top_antecedent_emb, similarity_emb, feature_emb], 2) # [k, c, emb]

    with tf.variable_scope("slow_antecedent_scores"):
      slow_antecedent_scores = util.ffnn(pair_emb, 2, self.hidden_size_ant, 1, self.dropout) # [k, c, 1]
    slow_antecedent_scores = tf.squeeze(slow_antecedent_scores, 2) # [k, c]
    return slow_antecedent_scores # [k, c]
  CorefModel.get_slow_antecedent_scores = get_slow_antecedent_scores

  def get_fast_antecedent_scores(self, top_span_emb):
    with tf.variable_scope("src_projection"):
      source_top_span_emb = tf.nn.dropout(util.projection(top_span_emb, util.shape(top_span_emb, -1)), self.dropout) # [k, emb]
    target_top_span_emb = tf.nn.dropout(top_span_emb, self.dropout) # [k, emb]
    return tf.matmul(source_top_span_emb, target_top_span_emb, transpose_b=True) # [k, k]
  CorefModel.get_fast_antecedent_scores = get_fast_antecedent_scores

In [141]:
  def bucket_distance(self, distances):
    logspace_idx = tf.to_int32(tf.floor(tf.log(tf.to_float(distances))/math.log(2))) + 3
    use_identity = tf.to_int32(distances <= 4)
    combined_idx = use_identity * distances + (1 - use_identity) * logspace_idx
    return tf.clip_by_value(combined_idx, 0, 9)
  CorefModel.bucket_distance = bucket_distance

In [142]:
def batch_gather(emb, indices):
  batch_size = shape(emb, 0)
  seqlen = shape(emb, 1)
  if len(emb.get_shape()) > 2:
    emb_size = shape(emb, 2)
  else:
    emb_size = 1
  flattened_emb = tf.reshape(emb, [batch_size * seqlen, emb_size])  # [batch_size * seqlen, emb]
  offset = tf.expand_dims(tf.range(batch_size) * seqlen, 1)  # [batch_size, 1]
  gathered = tf.gather(flattened_emb, indices + offset) # [batch_size, num_indices, emb]
  if len(emb.get_shape()) == 2:
    gathered = tf.squeeze(gathered, 2) # [batch_size, num_indices]
  return gathered

In [143]:
  def get_predictions_and_loss(self, word_embeddings, sent_lengths, mention_starts, mention_ends, mention_cluster_ids, is_training):
    self.lexical_dropout = 1 - (tf.to_float(is_training)*self.embedding_dropout_rate)
    self.dropout = 1 - (tf.to_float(is_training)*self.hidden_dropout_rate)

    num_sentences = tf.shape(word_embeddings)[0]
    max_sentence_length = tf.shape(word_embeddings)[1]

    text_emb = tf.concat([word_embeddings], 2)
    text_emb = tf.nn.dropout(text_emb, self.lexical_dropout)

    text_len_mask = tf.sequence_mask(sent_lengths, maxlen=max_sentence_length)
    text_len_mask = tf.reshape(text_len_mask, [num_sentences * max_sentence_length])
    
    text_outputs = self.encode_sentences(text_emb, sent_lengths, text_len_mask)
    # text_outputs = tf.nn.dropout(text_outputs, self.dropout)

    num_words = util.shape(text_outputs, 0)

    sentence_indices = tf.tile(tf.expand_dims(tf.range(num_sentences), 1), [1, max_sentence_length]) # [num_sentences, max_sentence_length]
    flattened_sentence_indices = self.flatten_emb_by_sentence(sentence_indices, text_len_mask) # [num_words]
    flattened_text_emb = self.flatten_emb_by_sentence(text_emb, text_len_mask) # [num_words]

    candidate_starts = tf.tile(tf.expand_dims(tf.range(num_words), 1), [1, self.max_span_width]) # [num_words, max_span_width]
    candidate_ends = candidate_starts + tf.expand_dims(tf.range(self.max_span_width), 0) # [num_words, max_span_width]

    candidate_start_sentence_indices = tf.gather(flattened_sentence_indices, candidate_starts) # [num_words, max_span_width]
    candidate_end_sentence_indices = tf.gather(flattened_sentence_indices, tf.minimum(candidate_ends, num_words - 1)) # [num_words, max_span_width]

    candidate_mask = tf.logical_and(candidate_ends < num_words, tf.equal(candidate_start_sentence_indices, candidate_end_sentence_indices)) # [num_words, max_span_width]
    flattened_candidate_mask = tf.reshape(candidate_mask, [-1]) # [num_words * max_span_width]
    candidate_starts = tf.boolean_mask(tf.reshape(candidate_starts, [-1]), flattened_candidate_mask) # [num_candidates]
    candidate_ends = tf.boolean_mask(tf.reshape(candidate_ends, [-1]), flattened_candidate_mask) # [num_candidates]
    
    candidate_sentence_indices = tf.boolean_mask(tf.reshape(candidate_start_sentence_indices, [-1]), flattened_candidate_mask) # [num_candidates]

    candidate_cluster_ids = self.get_candidate_labels(candidate_starts, candidate_ends, mention_starts, mention_ends, mention_cluster_ids) # [num_candidates]
    
    candidate_mention_emb = self.get_span_emb(flattened_text_emb, text_outputs, candidate_starts, candidate_ends) # [num_candidates, emb]
    candidate_mention_scores =  self.get_mention_scores(candidate_mention_emb) # [k, 1]
    candidate_mention_scores = tf.squeeze(candidate_mention_scores, 1) # [k]

    k = tf.to_int32(tf.floor(tf.to_float(tf.shape(text_outputs)[0]) * 0.4))

    if self.coarse_to_fine:

      predicted_mention_indices = coref_ops.extract_spans(tf.expand_dims(candidate_mention_scores, 0),
                                               tf.expand_dims(candidate_starts, 0),
                                               tf.expand_dims(candidate_ends, 0),
                                               tf.expand_dims(k, 0),
                                               util.shape(text_outputs, 0),
                                               True) # [1, k]
      predicted_mention_indices.set_shape([1, None])
      predicted_mention_indices = tf.squeeze(predicted_mention_indices, 0) # [k]
    
      mention_cluster_ids = tf.gather(candidate_cluster_ids, predicted_mention_indices)
      mention_starts_pred = tf.gather(candidate_starts, predicted_mention_indices) # [num_mentions]
      mention_ends_pred = tf.gather(candidate_ends, predicted_mention_indices) # [num_mentions]
      mention_emb = tf.gather(candidate_mention_emb, predicted_mention_indices) # [num_mentions, emb]
      mention_scores = tf.gather(candidate_mention_scores, predicted_mention_indices) # [num_mentions]
      top_span_speaker_ids = None

      mention_starts_emb = tf.gather(text_outputs, mention_starts_pred) # [num_mentions, emb]
      mention_ends_emb = tf.gather(text_outputs, mention_ends_pred) # [num_mentions, emb]

      c = tf.minimum(self.max_ant, k)
      top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets = self.coarse_to_fine_pruning(mention_emb, mention_scores, c)
                                                                                                                                                                                                                                          
      dummy_scores = tf.zeros([k, 1]) # [k, 1]

      for i in range(2):
       with tf.variable_scope("coref_layer", reuse=(i > 0)):
        top_antecedent_emb = tf.gather(mention_emb, top_antecedents) # [k, c, emb]
        top_antecedent_scores = top_fast_antecedent_scores + self.get_slow_antecedent_scores(mention_emb, top_antecedents, top_antecedent_emb, top_antecedent_offsets) # [k, c]
        top_antecedent_weights = tf.nn.softmax(tf.concat([dummy_scores, top_antecedent_scores], 1)) # [k, c + 1]
        top_antecedent_emb = tf.concat([tf.expand_dims(mention_emb, 1), top_antecedent_emb], 1) # [k, c + 1, emb]
        attended_span_emb = tf.reduce_sum(tf.expand_dims(top_antecedent_weights, 2) * top_antecedent_emb, 1) # [k, emb]
        with tf.variable_scope("f"):
          f = tf.sigmoid(util.projection(tf.concat([mention_emb, attended_span_emb], 1), util.shape(mention_emb, -1))) # [k, emb]
          mention_emb = f * attended_span_emb + (1 - f) * mention_emb # [k, emb]

      top_antecedent_scores = tf.concat([dummy_scores, top_antecedent_scores], 1) # [k, c + 1]
      top_antecedent_cluster_ids = tf.gather(mention_cluster_ids, top_antecedents) # [k, c]
      top_antecedent_cluster_ids += tf.to_int32(tf.log(tf.to_float(top_antecedents_mask))) # [k, c]
      same_cluster_indicator = tf.equal(top_antecedent_cluster_ids, tf.expand_dims(mention_cluster_ids, 1)) # [k, c]
      non_dummy_indicator = tf.expand_dims(mention_cluster_ids > 0, 1) # [k, 1]
      pairwise_labels = tf.logical_and(same_cluster_indicator, non_dummy_indicator) # [k, c]
      dummy_labels = tf.logical_not(tf.reduce_any(pairwise_labels, 1, keepdims=True)) # [k, 1]
      top_antecedent_labels = tf.concat([dummy_labels, pairwise_labels], 1) # [k, c + 1]
    
    else:
      
      predicted_mention_indices = coref_ops.extract_mentions(candidate_mention_scores, candidate_starts, candidate_ends, k) # ([k], [k])
      predicted_mention_indices.set_shape([None])
    
      mention_cluster_ids = tf.gather(candidate_cluster_ids, predicted_mention_indices)
      mention_starts_pred = tf.gather(candidate_starts, predicted_mention_indices) # [num_mentions]
      mention_ends_pred = tf.gather(candidate_ends, predicted_mention_indices) # [num_mentions]
      mention_emb = tf.gather(candidate_mention_emb, predicted_mention_indices) # [num_mentions, emb]
      mention_scores = tf.gather(candidate_mention_scores, predicted_mention_indices) # [num_mentions]

      mention_starts_emb = tf.gather(text_outputs, mention_starts_pred) # [num_mentions, emb]
      mention_ends_emb = tf.gather(text_outputs, mention_ends_pred) # [num_mentions, emb]

      top_antecedents, top_antecedent_labels, top_antecedents_len = coref_ops.antecedents(mention_starts_pred, 
                                                                              mention_ends_pred, 
                                                                              mention_starts, 
                                                                              mention_ends, 
                                                                              mention_cluster_ids, 
                                                                              self.max_ant) # ([num_mentions, max_ant], [num_mentions, max_ant + 1], [num_mentions]
      top_antecedents.set_shape([None, None])
      top_antecedent_labels.set_shape([None, None])
      top_antecedents_len.set_shape([None])

      top_antecedent_scores = self.get_antecedent_scores(mention_emb, mention_scores, top_antecedents, top_antecedents_len, mention_starts_pred, mention_ends_pred) # [num_mentions, max_ant + 1]
    
    loss = self.softmax_loss(top_antecedent_scores, top_antecedent_labels) # [k]
    loss = tf.reduce_sum(loss) # []

    return [candidate_starts, candidate_ends, candidate_mention_scores, mention_starts_pred, mention_ends_pred, top_antecedents, top_antecedent_scores], loss
  CorefModel.get_predictions_and_loss = get_predictions_and_loss

In [144]:
  def get_candidate_labels(self, candidate_starts, candidate_ends, labeled_starts, labeled_ends, labels):
    same_start = tf.equal(tf.expand_dims(labeled_starts, 1), tf.expand_dims(candidate_starts, 0)) # [num_labeled, num_candidates]
    same_end = tf.equal(tf.expand_dims(labeled_ends, 1), tf.expand_dims(candidate_ends, 0)) # [num_labeled, num_candidates]
    same_span = tf.logical_and(same_start, same_end) # [num_labeled, num_candidates]
    candidate_labels = tf.matmul(tf.expand_dims(labels, 0), tf.to_int32(same_span)) # [1, num_candidates]
    candidate_labels = tf.squeeze(candidate_labels, 0) # [num_candidates]
    return candidate_labels
  CorefModel.get_candidate_labels = get_candidate_labels   

In [145]:
  #antecedents pruning method
  def coarse_to_fine_pruning(self, top_span_emb, top_span_mention_scores, c):
    k = util.shape(top_span_emb, 0)
    top_span_range = tf.range(k) # [k]
    antecedent_offsets = tf.expand_dims(top_span_range, 1) - tf.expand_dims(top_span_range, 0) # [k, k]
    antecedents_mask = antecedent_offsets >= 1 # [k, k]
    fast_antecedent_scores = tf.expand_dims(top_span_mention_scores, 1) + tf.expand_dims(top_span_mention_scores, 0) # [k, k]
    fast_antecedent_scores += tf.log(tf.to_float(antecedents_mask)) # [k, k]
    fast_antecedent_scores += self.get_fast_antecedent_scores(top_span_emb) # [k, k]

    _, top_antecedents = tf.nn.top_k(fast_antecedent_scores, c, sorted=False) # [k, c]
    top_antecedents_mask = batch_gather(antecedents_mask, top_antecedents) # [k, c]
    top_fast_antecedent_scores = batch_gather(fast_antecedent_scores, top_antecedents) # [k, c]
    top_antecedent_offsets = batch_gather(antecedent_offsets, top_antecedents) # [k, c]
    return top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets
  CorefModel.coarse_to_fine_pruning = coarse_to_fine_pruning

In [146]:
  def get_antecedent_scores(self, mention_emb, mention_scores, antecedents, antecedents_len, mention_starts, mention_ends):
    num_mentions = util.shape(mention_emb, 0)
    max_antecedents = util.shape(antecedents, 1)

    feature_emb_list = []

    if self.use_features:
      target_indices = tf.range(num_mentions) # [num_mentions]
      mention_distance = tf.expand_dims(target_indices, 1) - antecedents # [num_mentions, max_ant]
      mention_distance_bins = coref_ops.distance_bins(mention_distance) # [num_mentions, max_ant]
      mention_distance_bins.set_shape([None, None])
      mention_distance_emb = tf.gather(tf.get_variable("mention_distance_emb", [10, self.feature_size]), mention_distance_bins) # [num_mentions, max_ant]
      feature_emb_list.append(mention_distance_emb)

    feature_emb = tf.concat(feature_emb_list, 2) # [num_mentions, max_ant, emb]
    feature_emb = tf.nn.dropout(feature_emb, self.dropout) # [num_mentions, max_ant, emb]

    antecedent_emb = tf.gather(mention_emb, antecedents) # [num_mentions, max_ant, emb]
    target_emb_tiled = tf.tile(tf.expand_dims(mention_emb, 1), [1, max_antecedents, 1]) # [num_mentions, max_ant, emb]
    similarity_emb = antecedent_emb * target_emb_tiled # [num_mentions, max_ant, emb]

    pair_emb = tf.concat([target_emb_tiled, antecedent_emb, similarity_emb, feature_emb], 2) # [num_mentions, max_ant, emb]

    with tf.variable_scope("iteration"):
      with tf.variable_scope("antecedent_scoring"):
        antecedent_scores = util.ffnn(pair_emb, 2, self.hidden_size_ant, 1, self.dropout) # [num_mentions, max_ant, 1]
    antecedent_scores = tf.squeeze(antecedent_scores, 2) # [num_mentions, max_ant]

    antecedent_mask = tf.log(tf.sequence_mask(antecedents_len, max_antecedents, dtype=tf.float32)) # [num_mentions, max_ant]
    antecedent_scores += antecedent_mask # [num_mentions, max_ant]

    antecedent_scores += tf.expand_dims(mention_scores, 1) + tf.gather(mention_scores, antecedents) # [num_mentions, max_ant]
    antecedent_scores = tf.concat([tf.zeros([util.shape(mention_scores, 0), 1]), antecedent_scores], 1) # [num_mentions, max_ant + 1]
    return antecedent_scores  # [num_mentions, max_ant + 1]
  CorefModel.get_antecedent_scores = get_antecedent_scores

In [147]:
  def encode_sentences(self, text_emb, text_len, text_len_mask):
    num_sentences = tf.shape(text_emb)[0]
    max_sentence_length = tf.shape(text_emb)[1]

    inputs = tf.transpose(text_emb, [1, 0, 2]) # [max_sentence_length, num_sentences, emb]
    
    current_inputs = text_emb
    
    with tf.variable_scope("fw_cell"):
      cell_fw = util.CustomLSTMCell(self.lstm_size, num_sentences, self.dropout)
      preprocessed_inputs_fw = cell_fw.preprocess_input(inputs)
    with tf.variable_scope("bw_cell"):
      cell_bw = util.CustomLSTMCell(self.lstm_size, num_sentences, self.dropout)
      preprocessed_inputs_bw = cell_bw.preprocess_input(inputs)
      preprocessed_inputs_bw = tf.reverse_sequence(preprocessed_inputs_bw,
                                                   seq_lengths=text_len,
                                                   seq_dim=0,
                                                   batch_dim=1)
    state_fw = tf.contrib.rnn.LSTMStateTuple(tf.tile(cell_fw.initial_state.c, [num_sentences, 1]), tf.tile(cell_fw.initial_state.h, [num_sentences, 1]))
    state_bw = tf.contrib.rnn.LSTMStateTuple(tf.tile(cell_bw.initial_state.c, [num_sentences, 1]), tf.tile(cell_bw.initial_state.h, [num_sentences, 1]))
    with tf.variable_scope("lstm"):
      with tf.variable_scope("fw_lstm"):
        fw_outputs, fw_states = tf.nn.dynamic_rnn(cell=cell_fw,
                                                  inputs=preprocessed_inputs_fw,
                                                  sequence_length=text_len,
                                                  initial_state=state_fw,
                                                  time_major=True)
      with tf.variable_scope("bw_lstm"):
        bw_outputs, bw_states = tf.nn.dynamic_rnn(cell=cell_bw,
                                                  inputs=preprocessed_inputs_bw,
                                                  sequence_length=text_len,
                                                  initial_state=state_bw,
                                                  time_major=True)

    bw_outputs = tf.reverse_sequence(bw_outputs,
                                     seq_lengths=text_len,
                                     seq_dim=0,
                                     batch_dim=1)

    text_outputs = tf.concat([fw_outputs, bw_outputs], 2)
    text_outputs = tf.transpose(text_outputs, [1, 0, 2]) # [num_sentences, max_sentence_length, emb]
    return self.flatten_emb_by_sentence(text_outputs, text_len_mask)
  CorefModel.encode_sentences = encode_sentences

In [148]:
  def get_span_emb(self, head_emb, context_outputs, span_starts, span_ends):
    span_emb_list = []

    span_start_emb = tf.gather(context_outputs, span_starts) # [k, emb]
    span_emb_list.append(span_start_emb)

    span_end_emb = tf.gather(context_outputs, span_ends) # [k, emb]
    span_emb_list.append(span_end_emb)

    span_width = 1 + span_ends - span_starts # [k]
    if self.use_features:
      span_width_index = span_width - 1 # [k]
      span_width_emb = tf.gather(tf.get_variable("span_width_embeddings", [self.max_span_width, self.feature_size]), span_width_index) # [k, emb]
      span_width_emb = tf.nn.dropout(span_width_emb, self.dropout)
      span_emb_list.append(span_width_emb)

    if self.model_heads:
      span_indices = tf.expand_dims(tf.range(self.max_span_width), 0) + tf.expand_dims(span_starts, 1) # [k, max_span_width]
      span_indices = tf.minimum(util.shape(context_outputs, 0) - 1, span_indices) # [k, max_span_width]
      span_text_emb = tf.gather(head_emb, span_indices) # [k, max_span_width, emb]
      with tf.variable_scope("head_scores"):
        self.head_scores = util.projection(context_outputs, 1) # [num_words, 1]
      span_head_scores = tf.gather(self.head_scores, span_indices) # [k, max_span_width, 1]
      span_mask = tf.expand_dims(tf.sequence_mask(span_width, self.max_span_width, dtype=tf.float32), 2) # [k, max_span_width, 1]
      span_head_scores += tf.log(span_mask) # [k, max_span_width, 1]
      span_attention = tf.nn.softmax(span_head_scores, 1) # [k, max_span_width, 1]
      span_head_emb = tf.reduce_sum(span_attention * span_text_emb, 1) # [k, emb]
      span_emb_list.append(span_head_emb)

    span_emb = tf.concat(span_emb_list, 1) # [k, emb]
    return span_emb # [k, emb]
  CorefModel.get_span_emb = get_span_emb

In [149]:
  def get_mention_scores(self, mention_emb):
    with tf.variable_scope("mention_scores"):
      return util.ffnn(mention_emb, 2, self.hidden_size_men, 1, self.dropout) # [num_mentions, 1]
  CorefModel.get_mention_scores = get_mention_scores

In [150]:
  def flatten_emb_by_sentence(self, emb, text_len_mask):
    num_sentences = tf.shape(emb)[0]
    max_sentence_length = tf.shape(emb)[1]

    emb_rank = len(emb.get_shape())
    if emb_rank  == 2:
      flattened_emb = tf.reshape(emb, [num_sentences * max_sentence_length])
    elif emb_rank == 3:
      flattened_emb = tf.reshape(emb, [num_sentences * max_sentence_length, util.shape(emb, 2)])
    else:
      raise ValueError("Unsupported rank: {}".format(emb_rank))
    return tf.boolean_mask(flattened_emb, tf.reshape(text_len_mask, [num_sentences * max_sentence_length]))
  CorefModel.flatten_emb_by_sentence = flatten_emb_by_sentence

In [151]:
  def softmax_loss(self, antecedent_scores, antecedent_labels):
    gold_scores = antecedent_scores + tf.log(tf.to_float(antecedent_labels)) # [num_mentions, max_ant + 1]
    marginalized_gold_scores = tf.reduce_logsumexp(gold_scores, [1]) # [num_mentions]
    log_norm = tf.reduce_logsumexp(antecedent_scores, [1]) # [num_mentions]
    return log_norm - marginalized_gold_scores # [num_mentions]
  CorefModel.softmax_loss = softmax_loss

In [152]:
  def build(self):
    #loads pre-trained word embeddings
    self.embedding_dict = KeyedVectors.load_word2vec_format("TNC_embeddings-200.bin", binary = True, unicode_errors = 'ignore')
    
    #create placeholders
    self.add_placeholder()

    #create tensorflow graph
    self.predictions, self.loss = self.get_predictions_and_loss(self.word_embeddings,
                                                                self.sent_lengths, 
                                                                self.mention_starts, 
                                                                self.mention_ends,
                                                                self.mention_cluster_ids, 
                                                                self.is_training)


    self.global_step = tf.Variable(0, name="global_step", trainable = False)
    self.reset_global_step = tf.assign(self.global_step, 0)
    learning_rate = tf.train.exponential_decay(self.learning_rate, 
                                               self.global_step,
                                               self.decay_frequency, 
                                               self.decay_rate,
                                               staircase=True)
    trainable_params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, trainable_params)
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    optimizers = {
      "adam" : tf.train.AdamOptimizer,
      "sgd" : tf.train.GradientDescentOptimizer
    }
    optimizer = optimizers['adam'](learning_rate)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainable_params), global_step = self.global_step)
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

  CorefModel.build = build

In [153]:
  def get_feed_dict_list(self, path, is_training):
    feed_dict_list = []
    for line in open(path):
      doc = json.loads(line)

      clusters = doc['clusters']
      gold_mentions = sorted([tuple(m) for cl in clusters for m in cl])
      gold_mention_map = {m:i for i,m in enumerate(gold_mentions)}
      cluster_ids = np.zeros(len(gold_mentions))
      for cid, cluster in enumerate(clusters):
        for mention in cluster:
          cluster_ids[gold_mention_map[tuple(mention)]] = cid + 1
      
      starts, ends = [], []
      if len(gold_mentions) > 0:
        starts, ends = zip(*gold_mentions)
      starts, ends = np.array(starts), np.array(ends)
      
      sentences = doc['sentences']
      sent_lengths = [len(sent) for sent in sentences]
      max_sent_length = max(sent_lengths)
      word_emb = np.zeros([len(sentences), max_sent_length, self.embedding_size])

      for i, sent in enumerate(sentences):
        for j, word in enumerate(sent):
          current_dim = 0
          try:
            word_emb[i,j] = self.embedding_dict[word]
          except:
            try:
              new_word = word.replace(' ','')
              word_emb[i,j] = self.embedding_dict[new_word]
            except:
              word_emb[i,j] = 0

      fd = {}
      fd[self.word_embeddings] = word_emb
      fd[self.sent_lengths] = np.array(sent_lengths)
      fd[self.mention_starts] = starts
      fd[self.mention_ends] = ends
      fd[self.mention_cluster_ids] = cluster_ids
      fd[self.is_training] = is_training
      feed_dict_list.append(tuple((fd,clusters)))

    return feed_dict_list
  CorefModel.get_feed_dict_list = get_feed_dict_list

In [154]:
  def get_predicted_clusters(self, mention_starts, mention_ends, predicted_antecedents):
    mention_to_predicted = {}
    predicted_clusters = []

    for i, predicted_index in enumerate(predicted_antecedents):
      if predicted_index < 0:
        continue
      assert i > predicted_index
      predicted_antecedent = (int(mention_starts[predicted_index]), int(mention_ends[predicted_index]))
      if predicted_antecedent in mention_to_predicted:
        predicted_cluster = mention_to_predicted[predicted_antecedent]
      else:
        predicted_cluster = len(predicted_clusters)
        predicted_clusters.append([predicted_antecedent])
        mention_to_predicted[predicted_antecedent] = predicted_cluster

      mention = (int(mention_starts[i]), int(mention_ends[i]))
      predicted_clusters[predicted_cluster].append(mention)
      mention_to_predicted[mention] = predicted_cluster

    predicted_clusters = [tuple(pc) for pc in predicted_clusters]
    mention_to_predicted = {m: predicted_clusters[i] for m, i in mention_to_predicted.items()}

    return predicted_clusters, mention_to_predicted

  CorefModel.get_predicted_clusters = get_predicted_clusters

In [155]:
  def evaluate_coref(self, mention_starts, mention_ends, predicted_antecedents, gold_clusters, evaluator):
    gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
    mention_to_gold = {}
    for gc in gold_clusters:
      for mention in gc:
        mention_to_gold[mention] = gc
    predicted_clusters, mention_to_predicted = self.get_predicted_clusters(mention_starts, mention_ends,predicted_antecedents)
    evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
  CorefModel.evaluate_coref = evaluate_coref

In [156]:
  def eval(self, path):
    eval_fd_list = self.get_feed_dict_list(path, False)
    coref_evaluator = CorefEvaluator()

    for fd, clusters in eval_fd_list:
      candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, mention_pair_scores = self.sess.run(self.predictions, fd)

      predicted_antecedents = []
      for i, index in enumerate(np.argmax(mention_pair_scores, axis = 1) - 1):
        if index < 0:
          predicted_antecedents.append(-1)
        else:
          predicted_antecedents.append(top_antecedents[i, index])

      self.evaluate_coref(top_span_starts, top_span_ends, predicted_antecedents, clusters, coref_evaluator)

    p, r, f = coref_evaluator.get_prf()
    print("Average F1 (py): {:.2f}%".format(f * 100))
    print("Average precision (py): {:.2f}%".format(p * 100))
    print("Average recall (py): {:.2f}%".format(r * 100))
  CorefModel.eval = eval

In [157]:
  def train(self, train_path, dev_path, test_path, epochs):
     train_fd_list = self.get_feed_dict_list(train_path, True)
     start_time = time.time()
     for epoch in range(epochs):
      print("Starting training epoch {}/{}".format(epoch+1,epochs))
      epoch_time = time.time()
      losses = []
      for i, (fd, _) in enumerate(train_fd_list):
        _,loss = self.sess.run([self.train_op,self.loss], feed_dict=fd)
        losses.append(loss)
        if i>0 and i%200 == 0:
          print("[{}]: loss:{:.2f}".format(i,sum(losses[i-200:])/200.0))
      print("Average epoch loss:{}".format(sum(losses)/len(losses)))
      print("Time used for epoch {}: {}".format(epoch+1, time_used(epoch_time)))
      dev_time = time.time()
      print("Evaluating on dev set after epoch {}/{}:".format(epoch+1,epochs))
      self.eval(dev_path)
      print("Time used for evaluate on dev set: {}".format(time_used(dev_time)))

     print("Training finished!")
     print("Time used for training: {}".format(time_used(start_time)))

     print("Evaluating on test set:")
     test_time = time.time()
     self.eval(test_path)
     print("Time used for evaluate on test set: {}".format(time_used(test_time)))
  CorefModel.train = train

# Train

In [None]:
# w/max antecedents = 10
if __name__ == '__main__':
  source = 'all'
  train_path = 'train_{}.jsonl'.format(source)
  dev_path = 'dev_{}.jsonl'.format(source)
  test_path = 'test_{}.jsonl'.format(source)
  model = CorefModel(use_coarse_to_fine=False)
  model.build()
  epochs = 25
  model.train(train_path, dev_path, test_path, epochs)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Starting training epoch 1/25
[200]: loss:89.84
[400]: loss:8.49
[600]: loss:14.75
Average epoch loss:34.16908417249547
Time used for epoch 1: 12 m 49 s
Evaluating on dev set after epoch 1/25:




Average F1 (py): 0.00%
Average precision (py): 0.00%
Average recall (py): 0.00%
Time used for evaluate on dev set: 0 m 10 s
Starting training epoch 2/25
[200]: loss:99.06
[400]: loss:19.53
[600]: loss:23.59
Average epoch loss:44.61408736193991
Time used for epoch 2: 12 m 10 s
Evaluating on dev set after epoch 2/25:




Average F1 (py): 6.40%
Average precision (py): 74.84%
Average recall (py): 3.36%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 3/25
[200]: loss:113.14
[400]: loss:22.80
[600]: loss:29.88
Average epoch loss:52.14069853245397
Time used for epoch 3: 11 m 56 s
Evaluating on dev set after epoch 3/25:




Average F1 (py): 7.67%
Average precision (py): 70.91%
Average recall (py): 4.07%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 4/25
[200]: loss:124.47
[400]: loss:24.10
[600]: loss:33.78
Average epoch loss:57.25084083256649
Time used for epoch 4: 11 m 49 s
Evaluating on dev set after epoch 4/25:




Average F1 (py): 7.04%
Average precision (py): 68.71%
Average recall (py): 3.73%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 5/25
[200]: loss:132.94
[400]: loss:26.29
[600]: loss:36.11
Average epoch loss:60.82252651632104
Time used for epoch 5: 11 m 49 s
Evaluating on dev set after epoch 5/25:




Average F1 (py): 6.92%
Average precision (py): 70.31%
Average recall (py): 3.66%
Time used for evaluate on dev set: 0 m 8 s
Starting training epoch 6/25
[200]: loss:136.26
[400]: loss:27.53
[600]: loss:36.37
Average epoch loss:61.99413625472816
Time used for epoch 6: 11 m 43 s
Evaluating on dev set after epoch 6/25:




Average F1 (py): 6.81%
Average precision (py): 71.96%
Average recall (py): 3.59%
Time used for evaluate on dev set: 0 m 8 s
Starting training epoch 7/25
[200]: loss:141.52
[400]: loss:28.34
[600]: loss:36.44
Average epoch loss:63.861720935311006
Time used for epoch 7: 11 m 49 s
Evaluating on dev set after epoch 7/25:




Average F1 (py): 7.30%
Average precision (py): 62.60%
Average recall (py): 3.89%
Time used for evaluate on dev set: 0 m 8 s
Starting training epoch 8/25
[200]: loss:143.18
[400]: loss:27.78
[600]: loss:38.03
Average epoch loss:64.85404224692378
Time used for epoch 8: 12 m 5 s
Evaluating on dev set after epoch 8/25:




Average F1 (py): 7.94%
Average precision (py): 67.48%
Average recall (py): 4.23%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 9/25
[200]: loss:144.56
[400]: loss:28.68
[600]: loss:39.16
Average epoch loss:65.64677799528333
Time used for epoch 9: 11 m 51 s
Evaluating on dev set after epoch 9/25:




Average F1 (py): 6.88%
Average precision (py): 62.67%
Average recall (py): 3.65%
Time used for evaluate on dev set: 0 m 8 s
Starting training epoch 10/25
[200]: loss:147.62
[400]: loss:28.36
[600]: loss:38.09
Average epoch loss:66.05574828711254
Time used for epoch 10: 11 m 49 s
Evaluating on dev set after epoch 10/25:




Average F1 (py): 7.09%
Average precision (py): 67.03%
Average recall (py): 3.76%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 11/25
[200]: loss:152.22
[400]: loss:29.10
[600]: loss:39.42
Average epoch loss:67.92179841335457
Time used for epoch 11: 11 m 45 s
Evaluating on dev set after epoch 11/25:




Average F1 (py): 7.34%
Average precision (py): 66.62%
Average recall (py): 3.90%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 12/25
[200]: loss:152.82
[400]: loss:29.37
[600]: loss:38.20
Average epoch loss:67.70733222194116
Time used for epoch 12: 12 m 11 s
Evaluating on dev set after epoch 12/25:




Average F1 (py): 3.64%
Average precision (py): 54.05%
Average recall (py): 1.89%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 13/25
[200]: loss:155.58
[400]: loss:30.24
[600]: loss:38.65
Average epoch loss:68.73390620366824
Time used for epoch 13: 11 m 51 s
Evaluating on dev set after epoch 13/25:




Average F1 (py): 7.69%
Average precision (py): 57.45%
Average recall (py): 4.14%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 14/25
[200]: loss:160.06
[400]: loss:29.26
[600]: loss:39.10
Average epoch loss:69.92103970952657
Time used for epoch 14: 11 m 55 s
Evaluating on dev set after epoch 14/25:




Average F1 (py): 9.42%
Average precision (py): 60.17%
Average recall (py): 5.12%
Time used for evaluate on dev set: 0 m 9 s
Starting training epoch 15/25
[200]: loss:156.38
