# Predicting Movie Review Sentiment with BERT on TF Hub

In [0]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

In [0]:
!pip install bert-tensorflow

In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
# Set the output directory for saving model files.
# Optionally, set a GCP bucket location.

OUTPUT_DIR = 'out'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}

if USE_BUCKET:
  # Redirect the output to the bucket and authenticate the Colab user for it.
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Nothing to clear: the directory did not exist yet. Any other failure
    # (e.g. permissions) is left to propagate instead of being hidden.
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/train_bodies.csv
!wget https://github.com/Dragonet95/utils/raw/master/train_stances.csv

In [0]:
import pandas as pd
import pickle
import os
from os import listdir
from os.path import join, dirname
import re


_COL_1 = ['Headline', 'Body ID', 'Stance']
_COL_2 = ['Body ID', 'articleBody']
_COL_3 = ['Headline', 'Body ID', 'Stance', 'articleBody']

# Stance name -> class id (kept as strings, matching the string label lists used later).
_STANCE_TO_LABEL = {'agree': "1", 'disagree': "2", 'discuss': "0", 'unrelated': "-1"}
# Number of shuffled rows that go to the training split; the rest become the test split.
_STANCE_TRAIN_ROWS = 40000
# Fixed seed so the shuffle (and therefore the split) is the same on every run.
_STANCE_SHUFFLE_SEED = 42

# header=0 consumes the first line of each file as the header row and names=
# replaces it, so the header never shows up as a data row that has to be sliced off.
link = pd.read_csv("./train_stances.csv", index_col=None, header=0, names=_COL_1, sep=',')
bodies = pd.read_csv("./train_bodies.csv", index_col=None, header=0, names=_COL_2, sep=',')

# Map the stance names in one column-level assignment; chained indexing such as
# link.Stance[mask] = ... may write to a temporary copy instead of the frame.
link['Stance'] = link['Stance'].replace(_STANCE_TO_LABEL)

# Attach the article body to every (headline, stance) row via the shared 'Body ID'.
# No further row is dropped here: both header rows are already gone, so slicing
# the merged frame again would discard a real example.
complete = link.merge(bodies, on='Body ID')
complete.info()

complete.to_csv("stance.csv", sep=',', encoding='utf-8', index=False)

# Shuffle all rows, then renumber the index from 0.
complete = complete.sample(frac=1, random_state=_STANCE_SHUFFLE_SEED).reset_index(drop=True)
complete.info()

train_df = complete[:_STANCE_TRAIN_ROWS]
test_df = complete[_STANCE_TRAIN_ROWS:]

# Stored as a two-item list: [train_df, test_df].
with open("stance.pickle", "wb") as f:
    pickle.dump([train_df, test_df], f)

In [0]:
import pandas as pd
import pickle
import os
from os import listdir
from os.path import join, dirname
import re


# Column names applied to the three tab-separated split files read below
# (this rebinds the _COL_1 name).
_COL_1 = ['id', 'label', 'statement','subject','speaker','speakertitle','state','aff','c1','c2','c3','c4','c5']


train = pd.read_csv("./train.tsv", index_col=False, names=_COL_1, sep='\t')
test = pd.read_csv("./test.tsv", index_col=False, names=_COL_1, sep='\t')
valid = pd.read_csv("./valid.tsv", index_col=False, names=_COL_1, sep='\t')


train.info()
test.info()
valid.info()

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (rows in order train, test, valid; original index labels kept).
part = pd.concat([train, test])
complete = pd.concat([part, valid])
complete.info()

# Two-column view (statement text + label) that is exported to CSV.
complete2 = complete[['statement','label']]
complete2.info()
print(complete2)


complete2.to_csv("politifactfull.csv", sep='\t', encoding='utf-8')

# Positional split of the full (all-column) frame: first 10000 rows train, rest test.
train_df = complete[:10000]
test_df = complete[10000:]

# Stored as a two-item list: [train_df, test_df].
with open("politifactfull.pickle", "wb") as f:
    pickle.dump([train_df, test_df], f)

In [0]:
!pip install pandas --upgrade

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/testtask2t.pickle

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/testtask2eng.pickle

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/task2treng2000.pickle

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/stancetask2.csv

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/politi.pickle

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/train.tsv
!wget https://github.com/Dragonet95/utils/raw/master/test.tsv
!wget https://github.com/Dragonet95/utils/raw/master/valid.tsv

# Data

First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).

In [0]:
from tensorflow import keras
import os
import re
import pickle
import pandas

# Load the pre-split train/test frames from a local pickle (nothing is downloaded here).
def download_and_load_datasets(force_download=False, pickle_path="stance.pickle"):
  """Load the pickled train/test pair.

  Args:
    force_download: Unused; kept so existing calls keep working.
    pickle_path: Path of the pickle file to read. Defaults to "stance.pickle".

  Returns:
    Tuple `(train_df, test_df)` unpacked from the two-item pickled sequence.

  Raises:
    FileNotFoundError: If `pickle_path` does not exist.
    ValueError: If the pickled object does not unpack into exactly two items.
  """
  # NOTE(security): unpickling can execute arbitrary code; only read trusted files.
  with open(pickle_path, 'rb') as f:
    train_df, test_df = pickle.load(f)

  return train_df, test_df


In [0]:
from tensorflow import keras
import os
import re
import pickle
import pandas

# Read the train/test pair from a local pickle; despite the name, no download happens.
def download_and_load_datasets(force_download=False, pickle_path="task2treng2000.pickle"):
  """Return the `(train_df, test_df)` pair stored in a pickle file.

  Args:
    force_download: Ignored; present only for call compatibility.
    pickle_path: File to unpickle. Defaults to "task2treng2000.pickle".

  Returns:
    Tuple `(train_df, test_df)` taken from the two-item pickled sequence.

  Raises:
    FileNotFoundError: If `pickle_path` is missing.
    ValueError: If the payload does not contain exactly two items.
  """
  # NOTE(security): pickle.load runs code embedded in the file; use trusted files only.
  with open(pickle_path, 'rb') as f:
    train_df, test_df = pickle.load(f)

  return train_df, test_df

In [0]:
from tensorflow import keras
import os
import re
import pickle
import pandas

# Load a previously pickled [train, test] pair from disk (no network access involved).
def download_and_load_datasets(force_download=False, pickle_path="testtask2t.pickle"):
  """Unpickle and return the stored train/test pair.

  Args:
    force_download: Not used by the body; retained for backward compatibility.
    pickle_path: Pickle file to open. Defaults to "testtask2t.pickle".

  Returns:
    Tuple `(train_df, test_df)` in the order they were pickled.

  Raises:
    FileNotFoundError: If `pickle_path` cannot be found.
    ValueError: If the pickled sequence does not hold exactly two items.
  """
  # NOTE(security): only unpickle files from a trusted source; loading runs code.
  with open(pickle_path, 'rb') as f:
    train_df, test_df = pickle.load(f)

  return train_df, test_df


In [0]:
from tensorflow import keras
import os
import re
import pickle
import pandas

# Fetch the train/test pair from a pickle on local disk; the function never downloads.
def download_and_load_datasets(force_download=False, pickle_path="politi.pickle"):
  """Read a pickled two-item sequence and return it as `(train_df, test_df)`.

  Args:
    force_download: Accepted but unused, so older call sites still work.
    pickle_path: Location of the pickle. Defaults to "politi.pickle".

  Returns:
    Tuple `(train_df, test_df)`.

  Raises:
    FileNotFoundError: If there is no file at `pickle_path`.
    ValueError: If the unpickled object is not a two-item sequence.
  """
  # NOTE(security): a pickle can run arbitrary code when loaded; trust the source first.
  with open(pickle_path, 'rb') as f:
    train_df, test_df = pickle.load(f)

  return train_df, test_df

In [0]:
from tensorflow import keras
import os
import re
import pickle
import pandas

# Restore the train/test pair from a local pickle file (the name is historical: no download).
def download_and_load_datasets(force_download=False, pickle_path="politifactfull.pickle"):
  """Load `(train_df, test_df)` from a pickle file.

  Args:
    force_download: Unused placeholder kept for compatibility with existing calls.
    pickle_path: Pickle file to read. Defaults to "politifactfull.pickle".

  Returns:
    Tuple `(train_df, test_df)` unpacked from the pickled two-item sequence.

  Raises:
    FileNotFoundError: If `pickle_path` does not exist.
    ValueError: If the pickled object does not unpack into two items.
  """
  # NOTE(security): loading a pickle executes code from the file; use trusted files only.
  with open(pickle_path, 'rb') as f:
    train_df, test_df = pickle.load(f)

  return train_df, test_df

In [0]:
# Calls whichever `download_and_load_datasets` definition was executed most recently;
# this notebook defines several variants that read different pickle files.
train, test = download_and_load_datasets()

In [0]:
test.info()

In [0]:
# Column names applied to stancetask2.csv (this rebinds the _COL_1 name).
_COL_1 = ['claim', 'claimtext', 'page', 'body']
# The file is read as tab-separated; note that this rebinds `test`.
test = pd.read_csv("./stancetask2.csv", index_col=None, names=_COL_1, sep='\t')

# Keep the rows from position 395 onward.
# NOTE(review): the reason for the 395 offset is not visible here; confirm.
totest = test[395:]

print(totest['page'])

In [0]:
# `totest` was taken as a slice of another frame; work on an independent copy so
# the column assignments below cannot trigger SettingWithCopyWarning.
totest = totest.copy()

# (regex, replacement) pairs applied to 'body' in this exact order. Raw strings
# are used for \d and \W so Python does not warn about invalid escape sequences;
# the resulting patterns are unchanged.
_TOTEST_CLEANUP_STEPS = [
    ('\n', ''),     # remove newlines
    (r'\d', ''),    # remove digits
    (' +', ' '),    # collapse runs of spaces
    (r'\W+', ' '),  # replace runs of non-word characters with one space
    # strip a trailing run of characters outside the listed ranges
    ('[^\u0621-\u064A\u0660-\u0669\u066E-\u06D5\u06EE\u06EF\u06FA-\u06FC\u06FF\u06F0-\u06F9a-z-A-Z]+$', ''),
]

for _pattern, _replacement in _TOTEST_CLEANUP_STEPS:
  # astype(str) after every step, as before, so non-string cells become text.
  totest['body'] = totest['body'].replace(_pattern, _replacement, regex=True).astype(str)

In [0]:

# (regex, replacement) pairs applied to a 'body' column in this exact order.
# Raw strings are used for \d and \W so Python does not warn about invalid
# escape sequences; the resulting patterns are unchanged.
_BODY_CLEANUP_STEPS = [
    ('\n', ''),     # remove newlines
    (r'\d', ''),    # remove digits
    (' +', ' '),    # collapse runs of spaces
    (r'\W+', ' '),  # replace runs of non-word characters with one space
]


def _clean_body(bodies):
  """Return `bodies` (a Series) with every cleanup step applied in order.

  Each step is followed by astype(str), so non-string cells are converted to text.
  """
  for pattern, replacement in _BODY_CLEANUP_STEPS:
    bodies = bodies.replace(pattern, replacement, regex=True).astype(str)
  return bodies


train['body'] = _clean_body(train['body'])
test['body'] = _clean_body(test['body'])

In [0]:
# Matches a run of characters at the end of a value that fall outside the listed
# code-point ranges, the ASCII letters and '-'.
_TRAILING_RUN = '[^\u0621-\u064A\u0660-\u0669\u066E-\u06D5\u06EE\u06EF\u06FA-\u06FC\u06FF\u06F0-\u06F9a-z-A-Z]+$'


def _strip_trailing_run(frame):
  """Remove the trailing run from every cell of `frame` and convert all cells to str."""
  stripped = frame.replace(_TRAILING_RUN, '', regex=True)
  return stripped.astype(str)


train = _strip_trailing_run(train)
#train['claimtext'] = train['claimtext'].replace('[\u0621-\u064A0-9 ]+$', '',regex=True).astype(str)
test = _strip_trailing_run(test)

In [0]:
!pip install googletrans

In [0]:
from googletrans import Translator

In [0]:
#translator = Translator()
# Replace every 'claimtext' value in `train` with the `.text` of its translation
# to English (dest='en'). A new Translator is constructed for each row.
train['claimtext'] = train['claimtext'].map(lambda x: Translator().translate(text=x, dest='en').text)

In [0]:
#translator = Translator()
# Replace every 'claimtext' value in `test` with the `.text` of its translation
# to English (dest='en'). A new Translator is constructed for each row.
test['claimtext'] = test['claimtext'].map(lambda x: Translator().translate(text=x, dest='en').text)


In [0]:
import time

# Only this many leading characters of each body are sent for translation.
_MAX_BODY_CHARS = 2000


def _translate_bodies(frame, max_chars=_MAX_BODY_CHARS):
  """Translate the 'body' column of `frame` to English, in place.

  Args:
    frame: DataFrame with a string 'body' column; it is modified row by row,
      so rows already processed keep their translation if a later request fails.
    max_chars: Number of leading characters of each body that are translated.
  """
  for index_label, row_series in frame.iterrows():
    snippet = row_series['body'][:max_chars]
    # A new Translator is built for every request, as in the original loops.
    frame.at[index_label, 'body'] = Translator().translate(text=snippet, dest='en').text


_translate_bodies(train)
_translate_bodies(test)

In [0]:
# Persist the current `train` and `test` frames as a two-item list: [train, test].
with open("task2treng2000.pickle", "wb") as f:
    pickle.dump([train, test], f)

In [0]:
# Shuffle the rows: sampling len(frame) rows returns every row in random order.
# No random_state is passed, so the order differs between runs.
train = train.sample(len(train))
test = test.sample(len(test))

For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively)

In [0]:
# Column configuration for frames that carry 'claimtext', 'body' and 'label' columns.
# Run only the configuration cell that matches the loaded dataset; the last one run wins.
HEAD_COLUMN = 'claimtext'
BODY_COLUMN = 'body'
LABEL_COLUMN = 'label'
# label_list enumerates every label value, as strings; this setup has four classes.
label_list = ["-1","0", "1","2"]

In [0]:
# Column configuration for frames that carry 'Headline', 'articleBody' and 'Stance' columns.
# Run only the configuration cell that matches the loaded dataset; the last one run wins.
HEAD_COLUMN = 'Headline'
BODY_COLUMN = 'articleBody'
LABEL_COLUMN = 'Stance'
# label_list enumerates every label value, as strings; this setup has four classes.
label_list = ["-1","0", "1","2"]

In [0]:
# Single-text configuration for frames that carry 'text' and 'label' columns.
# BODY_COLUMN is not set here, so any value assigned by another cell stays in effect.
HEAD_COLUMN = 'text'
LABEL_COLUMN = 'label'
# label_list enumerates every label value, as strings; this setup has two classes.
label_list = ["0","1"]

In [0]:
# Single-text configuration for frames that carry 'statement' and 'label' columns.
# BODY_COLUMN is not set here, so any value assigned by another cell stays in effect.
HEAD_COLUMN = 'statement'
LABEL_COLUMN = 'label'
# label_list enumerates every label value, as strings; this setup has six classes.
label_list = ["true","barely-true","half-true","pants-fire","false","mostly-true"]

# Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [0]:
# Wrap every row in a BERT InputExample for sentence-pair input:
# HEAD_COLUMN -> text_a, BODY_COLUMN -> text_b, LABEL_COLUMN -> label.
def _pair_example(row):
  """Build a two-text InputExample from one DataFrame row."""
  return bert.run_classifier.InputExample(
      guid=None,  # globally unique ID for bookkeeping, unused in this example
      text_a=row[HEAD_COLUMN],
      text_b=row[BODY_COLUMN],
      label=row[LABEL_COLUMN])


train_InputExamples = train.apply(_pair_example, axis=1)
test_InputExamples = test.apply(_pair_example, axis=1)

In [0]:
# Wrap every row in a BERT InputExample for single-text input:
# HEAD_COLUMN -> text_a, no text_b, LABEL_COLUMN -> label.
def _single_example(row):
  """Build a one-text InputExample (text_b is None) from one DataFrame row."""
  return bert.run_classifier.InputExample(
      guid=None,  # globally unique ID for bookkeeping, unused in this example
      text_a=row[HEAD_COLUMN],
      text_b=None,
      label=row[LABEL_COLUMN])


train_InputExamples = train.apply(_single_example, axis=1)
test_InputExamples = test.apply(_single_example, axis=1)

In [0]:
# TF Hub handle of the BERT module to fine-tune. The handle name below refers to the
# multilingual *cased* model (L-12, H-768, A-12), not an uncased one.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
#BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-24_H-1024_A-16/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of the Hub module.

  The module named by BERT_MODEL_HUB is loaded in a throw-away graph and its
  "tokenization_info" signature is evaluated once to obtain both values.
  """
  scratch_graph = tf.Graph()
  with scratch_graph.as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  tokenizer_kwargs = dict(vocab_file=vocab_path, do_lower_case=lower_case)
  return bert.tokenization.FullTokenizer(**tokenizer_kwargs)

tokenizer = create_tokenizer_from_hub_module()

In [0]:
# We'll set sequences to be at most 128 tokens long.
# The same MAX_SEQ_LENGTH is also passed to the input-function builders later on.
MAX_SEQ_LENGTH = 128
# Convert our train and test InputExamples to InputFeatures that BERT understands,
# using the label_list and tokenizer defined in earlier cells.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model on top of the BERT TF Hub module.

  Args:
    is_predicting: True when building the graph for prediction; the loss is
      then not computed and dropout is skipped.
    input_ids: Token id tensor fed to the module's "tokens" signature.
    input_mask: Attention mask tensor fed to the same signature.
    segment_ids: Segment id tensor fed to the same signature.
    labels: Integer class ids; used only when `is_predicting` is False.
    num_labels: Number of output classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
    `(loss, predicted_labels, log_probs)`.
  """

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # The single new layer that is trained for our classification task.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but it must not randomise predictions,
    # so it is skipped when predicting. Evaluation still goes through dropout:
    # this function cannot tell TRAIN from EVAL with its current arguments.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one id per example. No squeeze:
    # squeezing would turn a batch of one into a scalar and break the batch
    # dimension expected downstream.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute the cross-entropy loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


In [0]:
from sklearn.metrics import precision_score
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: Number of output classes, forwarded to `create_model`.
    learning_rate: Learning rate handed to the BERT optimizer.
    num_train_steps: Total number of training steps, handed to the optimizer.
    num_warmup_steps: Number of warmup steps, handed to the optimizer.

  Returns:
    A function `model_fn(features, labels, mode, params)` that returns an
    `EstimatorSpec` for the requested mode.
  """
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the graph for one mode and returns the matching `EstimatorSpec`."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    # PREDICT: no loss, only the per-example outputs.
    if mode == tf.estimator.ModeKeys.PREDICT:
      (predicted_labels, log_probs) = create_model(
        True, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          # The key is kept for existing consumers; the values are
          # log-probabilities (log_softmax output), not probabilities.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss computation.
    (loss, predicted_labels, log_probs) = create_model(
      False, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer (and its slot variables) is only needed for training,
      # so it is no longer created in evaluation graphs.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: metrics are only built here, where they are actually reported.
    # NOTE(review): every metric except accuracy is a binary metric (inputs are
    # cast to bool), while label ids here can take more than two values; the
    # numbers are only meaningful for two-class runs. Confirm before relying on them.
    eval_metrics = {
        "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
        "precision": tf.metrics.precision(label_ids, predicted_labels),
        "recall": tf.metrics.recall(label_ids, predicted_labels),
        "true_positives": tf.metrics.true_positives(label_ids, predicted_labels),
        "true_negatives": tf.metrics.true_negatives(label_ids, predicted_labels),
        "false_positives": tf.metrics.false_positives(label_ids, predicted_labels),
        "false_negatives": tf.metrics.false_negatives(label_ids, predicted_labels),
    }
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn


In [0]:
# Training hyperparameters; the train and warmup step counts are derived from them in a later cell.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) checkpoints and summaries are written.
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 100

In [0]:
# Total training steps = (number of training examples / batch size) * epochs, truncated to an int.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Warmup covers the first WARMUP_PROPORTION of those steps.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
# Manual override: fixes the number of training steps at 190 regardless of dataset size.
# NOTE(review): running this cell replaces the epoch-based value; confirm that 190 is intended.
num_train_steps = 190
# Recompute the warmup steps for the overridden step count.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
# Specify the output directory and how often (in steps) summaries and checkpoints are saved.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [0]:
# Build the model function for the current label_list and step counts.
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

# Wrap it in an Estimator that reads and writes checkpoints under run_config.model_dir.
# The batch size is passed through `params`.
# NOTE(review): presumably read by the input functions built with input_fn_builder; verify.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})


In [0]:
# Create an input function for training. drop_remainder = True for using TPUs;
# here it is False, so a final partial batch is kept.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [0]:
# Fine-tune for num_train_steps steps and report the wall-clock duration.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(
    input_fn=train_input_fn,
    max_steps=num_train_steps)
elapsed = datetime.now() - current_time
print("Training took time ", elapsed)

Beginning Training!


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0623 18:24:04.432298 139940367767424 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
W0623 18:24:06.919957 139940367767424 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


Training took time  0:25:06.317314


In [0]:
# Input function over the test features, built with is_training=False and
# drop_remainder=False so that no example is dropped.
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [0]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

In [0]:
def getPrediction(in_sentences):
  """Predict a label for every input with the fine-tuned `estimator`.

  Args:
    in_sentences: Iterable whose items are either a single string (used as
      text_a, with no text_b) or a `(text_a, text_b)` pair.

  Returns:
    List of `(item, probabilities, label)` tuples in input order, where
    `probabilities` is the value the estimator emits under the
    'probabilities' key and `label` is the `label_list` entry at the
    predicted class index.
  """
  # Materialise once: the inputs are walked twice (to build examples and to zip).
  in_sentences = list(in_sentences)
  if not in_sentences:
    return []

  # Any valid label works as a placeholder; it must come from label_list because
  # convert_examples_to_features is given label_list to map labels.
  dummy_label = label_list[0]
  input_examples = []
  for item in in_sentences:
    if isinstance(item, str):
      text_a, text_b = item, None
    else:
      text_a, text_b = item
    input_examples.append(run_classifier.InputExample(
        guid="", text_a=text_a, text_b=text_b, label=dummy_label))

  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  # Decode with the same label_list used for encoding, so the two cannot drift apart.
  return [(sentence, prediction['probabilities'], label_list[prediction['labels']])
          for sentence, prediction in zip(in_sentences, predictions)]

In [0]:
predictions = getPrediction(test_df.statement)

In [0]:
import re
import csv
# Tag written in the last column of every output row.
_RUN_TAG = "TheEarthIsFlat2Bext"
_SUBMISSION_PATH = 'out2B2.tsv'

# Each prediction is a (sentence, probabilities, label) tuple; take the label
# directly instead of searching the tuple's string representation, which breaks
# as soon as the text itself contains the searched marker.
newlist = [int(label) for _sentence, _probabilities, label in predictions]

# newline='' is the csv-module recommendation so row terminators are not translated.
with open(_SUBMISSION_PATH, 'w', newline='') as f:
  writer = csv.writer(f, delimiter='\t')
  # Rows of `totest` and the predicted labels are paired by position.
  for claim, page, label in zip(totest['claim'], totest['page'], newlist):
    print('%s\t%s\t%s\t%s' % (claim, page, label, _RUN_TAG))
    writer.writerow([claim, page, label, _RUN_TAG])