# Relation Prediction in Argument Mining with Pre-trained Deep Bidirectional Transformers

Code for BA thesis.

In [None]:
# All imports
import os 
import sys
import random

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold


from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
from datetime import datetime



In [None]:
def load_local_data(filename, data):
    """Load the combined TSV corpus and split one sub-corpus into train/validation.

    Before splitting, the mean whitespace-token length of the ``org`` and
    ``response`` columns is printed per ``org_dataset`` value.

    Args:
        filename: Path to a tab-separated file containing at least the
            columns ``org_dataset``, ``org`` and ``response``.
        data: Sub-corpus selector, one of ``'node'``, ``'political'`` or
            ``'agreement'``.

    Returns:
        A ``(data_train, data_val)`` tuple of shuffled DataFrames.

    Raises:
        SystemExit: With code -1 (after printing ``'Invalid dataset'``) when
            ``data`` is not one of the supported selectors.
    """
    df = pd.read_csv(filename, sep='\t')
    by_source = df.groupby('org_dataset')
    print(by_source.org.apply(lambda x: x.str.split().str.len().mean()))
    print(by_source.response.apply(lambda x: x.str.split().str.len().mean()))

    # Split in training and validation data
    if data == 'node':
        # Training data: rows from 'debate_train' and 'procon'.
        # Validation data: rows from 'debate_test'.
        # (An earlier exclusion-based selection was immediately overwritten
        # by this one and has been removed as dead code.)
        dataset = df.loc[df['org_dataset'].isin(['debate_test', 'debate_train', 'procon'])]
        dataset = dataset.sample(frac=1)
        is_validation = dataset['org_dataset'].isin(['debate_test'])
        data_train = dataset.loc[~is_validation]
        data_val = dataset.loc[is_validation]
    elif data == 'political':
        dataset = df.loc[df['org_dataset'].isin(['political'])]
        dataset = dataset.sample(frac=1)
        # The last 200 shuffled rows are held out for validation.
        data_train = dataset.iloc[:-200]
        data_val = dataset.iloc[-200:]
    elif data == 'agreement':
        dataset = df.loc[df['org_dataset'].isin(['agreement'])]
        # Rows with any missing value are dropped for this corpus only.
        dataset = dataset.sample(frac=1).dropna()
        # The last 2000 shuffled rows are held out for validation.
        data_train = dataset.iloc[:-2000]
        data_val = dataset.iloc[-2000:]
    else:
        print('Invalid dataset')
        sys.exit(-1)
    return data_train, data_val

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Build the classification graph: TF-Hub BERT encoder plus a linear softmax head.

    Args:
        is_predicting: When true, only predictions are returned and the
            encoder/dropout are built in inference mode. Any other mode is
            treated as training (so dropout is also active when the caller
            is evaluating).
        input_ids: Token id tensor fed to the hub module's "tokens" signature.
        input_mask: Attention mask tensor for the same signature.
        segment_ids: Segment id tensor for the same signature.
        labels: Integer class ids, one-hot encoded with depth ``num_labels``.
        num_labels: Number of output classes.

    Returns:
        ``(predicted_labels, log_probs)`` when ``is_predicting`` is true,
        otherwise ``(loss, predicted_labels, log_probs)`` where ``loss`` is
        the mean per-example cross-entropy.
    """
    is_training = not is_predicting

    # The encoder is loaded from TF-Hub. (An alternative that is not used
    # here would be to construct modeling.BertModel from BERT_CONFIG and
    # read its pooled output.)
    module_tags = {"train"} if is_training else set()
    encoder = hub.Module(BERT_MODEL_HUB, tags=module_tags, trainable=True)
    encoder_outputs = encoder(
        inputs={
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
        },
        signature="tokens",
        as_dict=True)

    # "pooled_output" is the sentence-level representation;
    # "sequence_outputs" would be the token-level one.
    pooled = encoder_outputs["pooled_output"]
    hidden_size = pooled.shape[-1].value

    # Task-specific linear layer that is trained on top of the encoder.
    head_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02, seed=0))
    head_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # Keep 90% of the activations; fixed op seed for repeatability.
            pooled = tf.nn.dropout(pooled, keep_prob=0.9, seed=1)

        logits = tf.nn.bias_add(
            tf.matmul(pooled, head_weights, transpose_b=True), head_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        predicted_labels = tf.squeeze(
            tf.argmax(log_probs, axis=-1, output_type=tf.int32))

        # Prediction only needs the labels and their log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # Train/eval: cross-entropy between the one-hot labels and log-probs.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        return (tf.reduce_mean(per_example_loss), predicted_labels, log_probs)

# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    """Return a ``model_fn`` closure usable with ``tf.estimator.Estimator``.

    Args:
        num_labels: Number of output classes passed on to ``create_model``.
        learning_rate: Learning rate handed to ``optimization.create_optimizer``.
        num_train_steps: Total number of training steps for the optimizer schedule.
        num_warmup_steps: Number of warmup steps for the optimizer schedule.

    Returns:
        A function ``model_fn(features, labels, mode, params)`` that returns a
        ``tf.estimator.EstimatorSpec`` for TRAIN, EVAL and PREDICT modes.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Build the graph for ``mode`` and wrap it in an ``EstimatorSpec``."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        # Labels travel inside the feature dict; the `labels` argument is unused.
        label_ids = features["label_ids"]

        # PREDICT
        if mode == tf.estimator.ModeKeys.PREDICT:
            (predicted_labels, log_probs) = create_model(
                True, input_ids, input_mask, segment_ids, label_ids, num_labels)

            # NOTE: the 'probabilities' entry holds log-probabilities.
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN / EVAL share the same loss graph.
        (loss, predicted_labels, log_probs) = create_model(
            False, input_ids, input_mask, segment_ids, label_ids, num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # EVAL: previously this path fell through and returned None, which
        # is not a valid model_fn result. Report the loss instead.
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss)

    # Return the actual model function in the closure
    return model_fn

In [None]:
# All parameters
# Global configuration for the notebook: data selection, label encoding,
# BERT resources, seeding, hyperparameters and the Estimator RunConfig.

os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Deactivate GPU for testing stability

# cross_val switches between the single train/test run (False) and the
# 10-fold cross-validation cells (True).
cross_val = False
data_path = '../data/complete_data.tsv'
dataset = 'node'  # One of 'agreement', 'node' and 'political'
# Which text columns are fed to BERT: both (sentence pair), only org, or only response.
use_org = True
use_resp = True
# Mapping from label strings to integer class ids, per sub-corpus.
# For 'political', attack and support share id 0 (binary relation vs. unrelated).
convert_dicts = {'agreement': {"agreement": 0, "disagreement": 1, "unrelated": 2},
                'node': {"attack": 0, "support": 1, "unrelated": 2},
                'political': {"attack": 0, "support": 0, "unrelated": 1}}
                #'political': {"attack": 0, "support": 1, "unrelated": 1}}
convert_dict = convert_dicts[dataset]

ORG_COLUMN = 'org'
RESP_COLUMN = 'response'
LABEL_COLUMN = 'label'
# label_list holds the integer class ids handed to the BERT feature converter;
# its length is also used as the number of output classes of the model.
# NOTE(review): only two ids are listed while convert_dicts['node'] can map
# 'unrelated' to 2 - confirm the selected data contains no 'unrelated' rows.
label_list = [0, 1] # [0, 1, 2]
BERT_VOCAB= './uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = './uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG_PATH = './uncased_L-12_H-768_A-12/bert_config.json'
# BERT_CONFIG is only referenced by the disabled modeling.BertModel code path.
BERT_CONFIG = modeling.BertConfig.from_json_file(BERT_CONFIG_PATH)
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Fix the seeds
random.seed(2314)
np.random.seed(1234) 
tf.set_random_seed(4321) 
# https://github.com/jkschin/tensorflow/blob/81ebecec7f1952be31d6dd102efd60be5bde968d/tensorflow/docs_src/programmers_guide/non_determinism.md
# Results are still not perfectly deterministic because of the GPU.
# Results are not even deterministic on CPU (something is missing?)

# Do not allow parallelism on CPU (makes it deterministic?)
session_conf = tf.ConfigProto(
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
      device_count={'CPU':1}
    ) # Only use one CPU thread!
# Does not work, still not deterministic

# Maximum sequence length passed to the BERT feature converter and input functions.
MAX_SEQ_LENGTH = 128

# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 5000
SAVE_SUMMARY_STEPS = 100
# The timestamp makes the output directory unique per notebook run.
OUTPUT_DIR = 'BERT_RUN' + str(datetime.now()) + dataset

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
#run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    session_config=session_conf, # Only one thread!
    tf_random_seed=3241) # Maybe? use different seed for every Fold in Crossval e.g. seed+fold_number


In [None]:
# Create the WordPiece tokenizer from the local vocabulary file.
# The first call is given the lower-casing flag and the checkpoint path;
# presumably it verifies that the two are consistent (confirm in bert.tokenization).
tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
# do_lower_case=True goes with the "uncased" model files configured above.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [None]:
# Load the configured sub-corpus and encode its labels as integers.
train_df, test_df = load_local_data(data_path, dataset)

label_mapping = {'label': convert_dict}
train_df = train_df.replace(label_mapping)
test_df = test_df.replace(label_mapping)

# Shuffle both splits (training first, then test).
train = train_df.sample(frac=1)
test = test_df.sample(frac=1)


def _to_input_examples(frame, text_a_column, text_b_column):
    """Turn every row of `frame` into a run_classifier.InputExample.

    `text_b_column` may be None for single-sentence input. The guid is unused
    in this notebook and therefore always None.
    """
    def _row_to_example(row):
        return run_classifier.InputExample(
            guid=None,
            text_a=row[text_a_column],
            text_b=None if text_b_column is None else row[text_b_column],
            label=row[LABEL_COLUMN])

    return frame.apply(_row_to_example, axis=1)


# Choose the text columns: org + response, only org, or only response.
if use_org and use_resp:
    text_a_column, text_b_column = ORG_COLUMN, RESP_COLUMN
elif use_org:
    text_a_column, text_b_column = ORG_COLUMN, None
else:
    text_a_column, text_b_column = RESP_COLUMN, None

train_InputExamples = _to_input_examples(train, text_a_column, text_b_column)
test_InputExamples = _to_input_examples(test, text_a_column, text_b_column)

# Convert the examples to InputFeatures that BERT understands
# (sequences are limited to MAX_SEQ_LENGTH).
train_features = run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
if not cross_val:
    # Single train/test run: derive the schedule lengths from the data size.
    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": BATCH_SIZE})

    # Input function for training; no batches are dropped (CPU/GPU run).
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

    print('Beginning Training!')
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)

    # Predict on the held-out split.
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    predictions = estimator.predict(input_fn=test_input_fn)
    pred_label = [single_prediction['labels'] for single_prediction in predictions]

    print("Confusion Matrix:")
    gold_labels = test['label'].values.astype(int)
    conf_mat = confusion_matrix(gold_labels, np.array(pred_label))
    print(conf_mat)

    print("Classification Report:")
    # Three distinct gold labels -> three-way report, otherwise attack/support.
    if len(set(test['label'])) == 3:
        report_names = ["attack", "support", "unrelated"]
    else:
        report_names = ["attack", "support"]
    print(classification_report(gold_labels, pred_label, target_names=report_names))

In [None]:
if not cross_val:
    # Attach the predictions and a 0/1 correctness flag to the test frame.
    test['predictions'] = pred_label
    test['correctness'] = (test['label'] == test['predictions']).astype(int)

    # Per-topic counts of every (label, prediction) combination, with totals.
    rels = pd.crosstab(
        test['topic'],
        [test['label'], test['predictions']],
        margins=True,
        colnames=['label', 'prediction'])
    # Per-topic share of wrong (0) and correct (1) predictions.
    rels2 = pd.crosstab(test['topic'], test['correctness'], normalize='index')

In [None]:
def run_training(train_features, test_features, test, train, target_names):
    """Train a fresh estimator on one split and score it on the matching test split.

    Args:
        train_features: BERT input features of the training examples.
        test_features: BERT input features of the test examples.
        test: DataFrame of the test rows; its ``label`` column is the gold standard.
        train: DataFrame of the training rows (only used to print label counts).
        target_names: Display names of the classes in the reports. Class ids
            are assumed to be ``0 .. len(target_names) - 1``, matching the
            integer encoding produced by ``convert_dict``.

    Returns:
        A tuple ``(class_rep, acc)``: the classification report as a dict and
        the accuracy on the test split.
    """
    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)
    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    print('Beginning Training!')
    print('test labels', test['label'].value_counts())
    print('train labels', train['label'].value_counts())
    current_time = datetime.now()
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": BATCH_SIZE})
    estimator = estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    # Predictions arrive one batch at a time (yield_single_examples=False).
    predictions = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
    print(predictions)
    # atleast_1d guards against a batch whose labels were squeezed to a
    # scalar; the result is kept as an integer array instead of floats.
    pred_label = np.concatenate(
        [np.atleast_1d(batch['labels']) for batch in predictions]).astype(int)
    print(pred_label)

    gold_labels = test['label'].values.astype(int)
    # Passing explicit labels to both reports keeps them valid even when a
    # class is absent from this split.
    report_labels = list(range(len(target_names)))
    class_rep = classification_report(
        gold_labels, pred_label, labels=report_labels,
        target_names=target_names, output_dict=True)
    acc = accuracy_score(gold_labels, pred_label)
    print(classification_report(
        gold_labels, pred_label, labels=report_labels, target_names=target_names))
    print("Accuracy:",  acc)
    # The measured span covers training, prediction and scoring.
    print("Training took time ", datetime.now() - current_time)
    return (class_rep, acc)

def cross_validate(data, n_splits=10, target_names=None):
    """Run stratified k-fold cross-validation over ``data``.

    For every fold the estimator output directory is wiped, the fold's rows
    are converted to BERT features (always as org/response sentence pairs)
    and ``run_training`` is called.

    Args:
        data: DataFrame with the text columns and an integer ``label`` column
            used for stratification.
        n_splits: Number of folds. Defaults to 10.
        target_names: Class display names passed to ``run_training``.
            Defaults to ``["relation", "unrelated"]``.

    Returns:
        A list with one ``(classification_report_dict, accuracy)`` tuple per fold.
    """
    if target_names is None:
        target_names = ["relation", "unrelated"]

    def _to_input_examples(frame):
        # guid is unused in this notebook and therefore always None.
        return frame.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[ORG_COLUMN],
                                                  text_b=x[RESP_COLUMN],
                                                  label=x[LABEL_COLUMN]),
            axis=1)

    results = []
    skf = StratifiedKFold(n_splits=n_splits)
    for train_idx, val_idx in skf.split(data, data['label']):
        # Start every fold from an empty model directory so no checkpoint of
        # a previous fold is reused. A missing directory is fine; any other
        # deletion failure is no longer silently swallowed.
        if tf.gfile.Exists(OUTPUT_DIR):
            tf.gfile.DeleteRecursively(OUTPUT_DIR)
        tf.gfile.MakeDirs(OUTPUT_DIR)

        train = data.iloc[train_idx]
        test = data.iloc[val_idx]
        print(train.shape, test.shape)

        train_InputExamples = _to_input_examples(train)
        test_InputExamples = _to_input_examples(test)

        # Convert our train and test features to InputFeatures that BERT understands.
        train_features = run_classifier.convert_examples_to_features(
            train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        test_features = run_classifier.convert_examples_to_features(
            test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        results.append(
            run_training(train_features, test_features, test, train, target_names))
    return results

In [None]:
if cross_val:
    # Cross-validate over the union of both splits. pd.concat is used because
    # DataFrame.append is deprecated and no longer exists in pandas 2.0;
    # like append, it keeps the original row index.
    result = cross_validate(pd.concat([train_df, test_df]))

In [None]:
if cross_val:
    #print(result)
    # Every fold result is (classification_report dict, accuracy); only the
    # report dicts are aggregated here.
    res = [fold_result[0] for fold_result in result]
    # One concat instead of repeated DataFrame.append calls (deprecated,
    # removed in pandas 2.0, and quadratic in the number of folds).
    result_df = pd.concat([pd.DataFrame(report) for report in res])
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    # Rows with the same metric name are grouped across folds and summarised
    # by mean, max and min.
    print(result_df.groupby(level=0).agg([np.mean, np.max, np.min]).transpose())

In [None]:
#print(estimator.get_variable_names())