<a href="https://colab.research.google.com/github/MHDBST/BERT_examples/blob/master/Joint__MaskLM_document_TPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import pandas as pd
!pip install bert-tensorflow

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

In [0]:
import sys


# import python modules defined by BERT
from bert import modeling
# import optimization
# import run_classifier
from bert import run_classifier_with_tfhub
# import tokenization

# import tfhub 
import tensorflow_hub as hub



In [3]:
BUCKET = 'bert_example' #@param {type:"string"}
assert BUCKET, 'Must specify an existing GCS bucket name'
OUTPUT_DIR = 'gs://{}/aug19_models/joint/multitask/last3layers'.format(BUCKET)
# V2_augment/doc_level/v2/smallBERT-docLevel-seq512/experiment1'.format(BUCKET)
tf.gfile.MakeDirs(OUTPUT_DIR)

print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
# !gsutil ls $BERT_PRETRAINED_DIR





***** Model output directory: gs://bert_example/aug19_models/joint/multitask/last3layers *****
***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12 *****


In [0]:
# import modeling
# import run_classifier


# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import tensorflow as tf


def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  pre_tvars = tf.trainable_variables()
#   print('all trainable variables: >>',pre_tvars)
  tvars = pre_tvars
  tvars = [item for item in pre_tvars if not '/layer_0/'  in item.name and not '/layer_1/'  in item.name and not '/layer_2/'  in item.name
          and not '/layer_3/'  in item.name and not '/layer_4/'  in item.name and not '/layer_5/'  in item.name and not '/layer_6/'  in item.name 
          and not '/layer_7/'  in item.name and not '/layer_8/'  in item.name ]
           #and not '/layer_9/' in item.name]
#   and not '/layer_10/'  in item.name 
#           and not '/layer_11/'  in item.name ]
  print('excluded trainable variables: >>',tvars)
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op


class AdamWeightDecayOptimizer(tf.train.Optimizer):
  """A basic Adam optimizer that includes "correct" L2 weight decay."""

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="AdamWeightDecayOptimizer"):
    """Constructs a AdamWeightDecayOptimizer."""
    super(AdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self.exclude_from_weight_decay = exclude_from_weight_decay

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class."""
    assignments = []
    for (grad, param) in grads_and_vars:
      if grad is None or param is None:
        continue

      param_name = self._get_variable_name(param.name)

      m = tf.get_variable(
          name=param_name + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=param_name + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())

      # Standard Adam update.
      next_m = (
          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
      next_v = (
          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                    tf.square(grad)))

      update = next_m / (tf.sqrt(next_v) + self.epsilon)

      # Just adding the square of the weights to the loss function is *not*
      # the correct way of using L2 regularization/weight decay with Adam,
      # since that will interact with the m and v parameters in strange ways.
      #
      # Instead we want ot decay the weights in a manner that doesn't interact
      # with the m/v parameters. This is equivalent to adding the square
      # of the weights to the loss with plain (non-momentum) SGD.
      if self._do_use_weight_decay(param_name):
        update += self.weight_decay_rate * param

      update_with_lr = self.learning_rate * update

      next_param = param - update_with_lr

      assignments.extend(
          [param.assign(next_param),
           m.assign(next_m),
           v.assign(next_v)])
    return tf.group(*assignments, name=name)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name."""
    m = re.match("^(.*):\\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name



In [0]:
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 10.0  ## Activate if ** is Not ACTIVATED
MAX_SEQ_LENGTH = 512
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 50
SAVE_SUMMARY_STEPS = 20



In [0]:
import pandas as pd

def fix_doc_tgt(doc,index,df):
  new_doc = doc.replace('.[TGT]',' .\n[TGT] ')
  doc_length = len(new_doc.split('\n'))
  if doc_length == 16 or not pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
    return new_doc

  new_doc = new_doc.replace('?[TGT]',' ?\n[TGT] ')
  doc_length = len(new_doc.split('\n'))
  if doc_length == 16 or  not pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
    return new_doc
  new_doc = new_doc.replace( '[TGT][TGT]', '[TGT] \n [TGT]' )
  doc_length = len(new_doc.split('\n'))
  if doc_length == 16 or  not pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
    return new_doc

  new_doc = new_doc.replace( '. USA TODAY', '. USA TODAY Sports \n' )
  doc_length = len(new_doc.split('\n'))
  if doc_length == 16 or  not pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
    return new_doc
  
  new_doc = new_doc.replace('![TGT]','!\n[TGT]' )
  doc_length = len(new_doc.split('\n'))
  return new_doc

data_pref = 'gs://bert_example/data_aug19/masked_lm/mask_lm_combined_shuffled_3Dec_7Dec_aug19_reindex_%s.csv'

data_train = pd.read_csv(tf.gfile.GFile(data_pref % str('train')), encoding='latin-1')
data_dev   = pd.read_csv(tf.gfile.GFile(data_pref % 'dev'), encoding='latin-1')
data_test  = pd.read_csv(tf.gfile.GFile(data_pref % 'random_test'), encoding='latin-1')
data_test_fixed= pd.read_csv(tf.gfile.GFile(data_pref % 'fixed_test'), encoding='latin-1')





# Load all files from a directory in a DataFrame.
def load_directory_data(df):
#   print('df length>>',len(df['DOCUMENT']))
  data = {}
  df['DOCUMENT'] = df['DOCUMENT'].str.replace('\[TGT\]','tgt')
  data["sentence"] = df['DOCUMENT']
  data["label"] =df["LABEL"]
  data['doc_id'] = df["DOCUMENT_INDEX"]
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(df,index = None):
  df_new = load_directory_data(df[:index])
#   print(df_new)
  true_df = df_new[df_new['label'] == True]
  false_df = df_new[df_new['label'] == False]
#   print('true_df>>>',len(true_df))
#   true_df["polarity"] = 1
#   false_df["polarity"] = 0
  return pd.concat([true_df, false_df]).sample(frac=1).reset_index(drop=True)


train = load_dataset(data_train)
# train_augment = load_dataset(data_augment)
# train = pd.concat([train, train_augment]).sample(frac=1).reset_index(drop=True)
dev = load_dataset(data_dev)
test = load_dataset(data_test)
test_fixed = load_dataset(data_test_fixed)

# print('train set length: %d,dev set length: %d, test set lenght: %d,  fixed test: %d'%
#       (len(train),len(dev),len(test),len(test_fixed)))
    
    

    




In [7]:


## if pl is true, append  paragraph leve data and label to the output
## if dl is true, append document leve data and label to the output
#### Both pl and dl can not be false
def load_paragraphs_documents(df,pl=True,dl=True,ent=False,column= 'MASKED_DOCUMENT'):
    if not dl and not pl:
      print('both document level label and paragaph level label is false, choose one of them a True')
      return 
    labels = []
    texts = []
    doc_ids = []
    uniq_ents = []
    num_doc = 0
    index = -1
    for doc in df[column]:
      index += 1
      docs = doc.split('\n')
      doc_length = len(docs)

      if pd.isnull(df['Paragraph0'].iloc[index]):
      # add documents with no paragraph labels as one document and its label to the input data dataframe
        if dl:
          labels.append(df['TRUE_SENTIMENT'].iloc[index])
          texts.append(doc)
          doc_ids.append(df['DOCUMENT_INDEX'].iloc[index])
          if ent:
            uniq_ents.append(df['Unique_Entities'].iloc[index])
        num_doc +=1
        
        continue
      try:
        if  doc_length != 16 and pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
         
          doc = fix_doc_tgt(doc,index,df)
          docs = doc.split('\n')
          
          doc_length = len(docs)
          if  doc_length != 16 and pd.notnull(df['Paragraph%s'%str(doc_length)].iloc[index]):
            # print(doc)
            if column == 'summary':
              pass
            else:
              print('error on document %d'% df['DOCUMENT_INDEX'].iloc[index])
              print('document length is %s'%str(doc_length))
              continue

      except Exception as e:
        print('err is %s'%str(e))
        print('this document has %d paragraphs %d' %(doc_length,df['DOCUMENT_INDEX'].iloc[index]))

      if pl:
        for i in range(doc_length):
          doc_ids.append(df['DOCUMENT_INDEX'].iloc[index])
          if ent:
            uniq_ents.append(df['Unique_Entities'].iloc[index])
          
          texts.append(docs[i])
          label_i = df['Paragraph%d'%i].iloc[index]
          labels.append(label_i)
        ### increase the effect of documents by adding the whole document per each paragraph :D
          if dl:
          ## add the document text and its label to the input data after adding each paragraph and their labels
            labels.append(df['TRUE_SENTIMENT'].iloc[index])
            texts.append(doc)
            doc_ids.append(df['DOCUMENT_INDEX'].iloc[index])
            if ent:
              uniq_ents.append(df['Unique_Entities'].iloc[index])
              uniq_ents.append(df['Unique_Entities'].iloc[index])
      else:
         if dl:
          ## add the document text and its label to the input data after adding each paragraph and their labels
            labels.append(df['TRUE_SENTIMENT'].iloc[index])
            texts.append(doc)
            doc_ids.append(df['DOCUMENT_INDEX'].iloc[index])
            if ent:
              
              uniq_ents.append(df['Unique_Entities'].iloc[index])
         
    print('number of one-paragraph docs: %d'%num_doc)
    return(texts,labels,doc_ids,uniq_ents)
 

data_pref = 'gs://bert_example/data_aug19/all_data_combined_shuffled_3Dec_7Dec_aug19_reindex_%s.csv'
train_df = pd.read_csv(tf.gfile.GFile(data_pref % str('train')), encoding='latin-1')
dev_df = pd.read_csv(tf.gfile.GFile(data_pref % str('dev')), encoding='latin-1')
dev_ent_df = pd.read_csv(tf.gfile.GFile('gs://bert_example/data_aug19/dev_entities_v5.csv'), encoding='latin-1')

# dev_df = pd.read_csv(tf.gfile.GFile('gs://bert_example/data_aug19/abstractive_pgn_summary_dev_comma.csv'), encoding='latin-1')
test_random_df= pd.read_csv(tf.gfile.GFile(data_pref % str('random_test')), encoding='latin-1')
test_fixed_df= pd.read_csv(tf.gfile.GFile(data_pref % str('fixed_test')), encoding='latin-1')


# Load all files from a directory in a DataFrame.
def load_file(df,dl=True,pl=True,ent=False,column= 'MASKED_DOCUMENT'):
  data = {}
  (texts,labels,doc_ids,uniq_ent ) = load_paragraphs_documents(df,dl=dl,pl=pl,column= column,ent=ent)
  data["sentence"] = texts
  data["sentiment"] =labels
  data["doc_id"] = doc_ids
  if ent:
    data["uniq_ent"] = uniq_ent
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(df,index = None,dl=True,pl=True,column= 'MASKED_DOCUMENT',ent=False):
  df_new = load_file(df,dl,pl,column= column,ent=ent)
  pos_df = df_new[df_new['sentiment'] == 'Positive']
  neg_df = df_new[df_new['sentiment'] == 'Negative']
  neu_df = df_new[df_new['sentiment'] == 'Neutral']
  pos_df["polarity"] = 1
  neg_df["polarity"] = -1
  neu_df["polarity"] = 0
  return pd.concat([pos_df, neg_df,neu_df]).sample(frac=1).reset_index(drop=True)
# train_all,dev_par,dev_doc,test_par,test_doc,test_fixed_par,test_fixed_doc= [],[],[],[],[],[],[]
### train should consist both paragraph level and document level labels
print('processing train set')

train_doc = load_dataset(train_df,dl=True,pl=False)
### two dev set, to test the model with paragraph level dev and document level dev
print('processing dev set')
# dev_doc = load_dataset(dev_df,pl=pl,dl=dl)
dev_ent = load_dataset(dev_ent_df,pl=False,dl=True,ent=True)

print('processing random test set')
test_doc = load_dataset(test_random_df,pl=False,dl=True)
# ### two fixed tests like above
print('processing fixed test set')
test_fixed_doc = load_dataset(test_fixed_df,pl=False,dl=True)

print('Number of train inputs: %d\n \\
        Number of document level dev inputs: %d\n  \\
        Number of document level fixed test inputs: %d \n \\
       Number of document level random test inputs: %d'
      %(len(train_doc),len(dev_ent),len(test_fixed_doc),len(test_doc)))


processing train set
number of one-paragraph docs: 252


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


processing dev set
number of one-paragraph docs: 45
processing random test set
number of one-paragraph docs: 52
processing fixed test set
number of one-paragraph docs: 114
Number of train inputs: 3355
 \        Number of document level dev inputs: 578
  \        Number of document level fixed test inputs: 827 
 \       Number of document level random test inputs: 579


In [8]:
print(len(train))
print(len(train_doc))
# print(set(train['doc_id']))
# print(set(train['doc_id']) == set(train_doc['doc_id']))

45944
3355


In [74]:
print(len(dev_ent))

578


In [9]:
train_joint = pd.merge(left=train,right=train_doc, left_on='doc_id', right_on='doc_id',suffixes=('','_y'))[['doc_id','label','sentence','sentiment', 'polarity']]
dev_joint = pd.merge(left=dev,right=dev_ent, left_on='doc_id', right_on='doc_id',suffixes=('','_y'))[['doc_id','label','sentence','sentiment', 'polarity']]
random_test_joint = pd.merge(left=test,right=test_doc, left_on='doc_id', right_on='doc_id',suffixes=('','_y'))[['doc_id','label','sentence','sentiment', 'polarity']]
fixed_test_joint = pd.merge(left=test_fixed,right=test_fixed_doc, left_on='doc_id', right_on='doc_id',suffixes=('','_y'))[['doc_id','label','sentence','sentiment', 'polarity']]
print(list(train_joint))
print(len(train_joint),len(dev_joint),len(random_test_joint),len(fixed_test_joint))

['doc_id', 'label', 'sentence', 'sentiment', 'polarity']
45944 7187 8130 12604


In [34]:
train_joint['joint_label'] = train_joint['label'].astype(str)+'_'+train_joint['polarity'].astype(str)

dev_joint['joint_label'] = dev_joint['label'].astype(str)+'_'+dev_joint['polarity'].astype(str)

random_test_joint['joint_label'] = random_test_joint['label'].astype(str)+'_'+random_test_joint['polarity'].astype(str)

fixed_test_joint['joint_label'] = fixed_test_joint['label'].astype(str)+'_'+fixed_test_joint['polarity'].astype(str)

print(list(train_joint))
print(list(set(train_joint['joint_label'])))

DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'joint_label'
label_list = list(set(train_joint['joint_label']))
use_tpu = True
len(set(train_joint[LABEL_COLUMN]))

['doc_id', 'label', 'sentence', 'sentiment', 'polarity', 'joint_label']
['False_1', 'False_0', 'False_-1', 'True_-1', 'True_1', 'True_0']


6

In [0]:
from bert import tokenization
from bert import run_classifier

path = 'gs://bert_example/bert/uncased_L-12_H-768_A-12/vocab_tgt.txt'
f_in = tf.gfile.GFile('gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/vocab.txt')
f_out = tf.gfile.GFile(path,'w')
lines = f_in.readlines()


lines[1] = 'tgt\n'
for line in lines:
  f_out.write(line)
f_out.close()

VOCAB_FILE = os.path.join('gs://bert_example/bert/uncased_L-12_H-768_A-12', 'vocab_tgt.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

In [0]:

train_joint_InputExamples = train_joint.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

dev_joint_InputExamples = dev_joint.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

# dev_features = run_classifier.convert_examples_to_features(dev_joint_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

test_joint_InputExamples = random_test_joint.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

# test_joint_features = run_classifier.convert_examples_to_features(test_joint_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


test_joint_InputExamples_fixed = fixed_test_joint.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)
# test_joint_features_fixed = run_classifier.convert_examples_to_features(test_joint_InputExamples_fixed, label_list, MAX_SEQ_LENGTH, tokenizer)



# ## These two lines should be activated if ** is not activated
num_train_steps = int(len(train_joint_InputExamples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Setup TPU related config
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
NUM_TPU_CORES = 8
# ITERATIONS_PER_LOOP = 100 # I don't know what it is doing just decrease it to smaller value
ITERATIONS_PER_LOOP = int(len(train_joint_InputExamples) / TRAIN_BATCH_SIZE) ## set as the number of iterations in each epoch 




In [14]:
import pickle
tf.logging.set_verbosity(tf.logging.INFO)
try:
  train_joint_features = pickle.load(open('train_joint_features_large','rb'))
except Exception as e: 
    print('can not load train features, creating train features: %s'%str(e))
#     train_features = pickle.load(tf.gfile.GFile('gs://bert_example/mask_lm/models/V2_augment/doc_leve/last_2/smallBERT-docLevel-seq512/data/train_features_large.dms', "rb"))


  
    train_joint_features = run_classifier.convert_examples_to_features(train_joint_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    with open('train_joint_features_large','wb') as f:
      pickle.dump(train_joint_features,f)
try:

  dev_joint_features = pickle.load(open('dev_joint_features_large','rb'))
except Exception as e:

   
    print('can not load from GC, creating dev features')
    dev_joint_features = run_classifier.convert_examples_to_features(dev_joint_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    with open('dev_joint_features_large','wb') as f:
      pickle.dump(dev_joint_features,f)
print(len(train_joint_features))
print(len(dev_joint_features))


W1124 21:34:08.753768 140629443692416 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/bert/run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

I1124 21:34:08.755482 140629443692416 run_classifier.py:774] Writing example 0 of 45944
I1124 21:34:08.781603 140629443692416 run_classifier.py:461] *** Example ***
I1124 21:34:08.783005 140629443692416 run_classifier.py:462] guid: None
I1124 21:34:08.785175 140629443692416 run_classifier.py:464] tokens: [CLS] catalan regional police officers walk through a street covered with ballots for the banned independence referendum thrown by people outside people ' s party ( pp ) regional headquarters during a protest in barcelona spain october 3 2017 . reuters / jon na ##z ##ca madrid ( reuters ) - leaders of spain a s industrial ##ised northeastern region of catalonia said the regional population had voted for independence in a ballot on sunday that the central government said

can not load train features, creating train features: [Errno 2] No such file or directory: 'train_joint_features_large'


I1124 21:39:24.036262 140629443692416 run_classifier.py:774] Writing example 10000 of 45944
I1124 21:41:38.132019 140629443692416 run_classifier.py:774] Writing example 20000 of 45944
I1124 21:43:22.792792 140629443692416 run_classifier.py:774] Writing example 30000 of 45944
I1124 21:44:38.843560 140629443692416 run_classifier.py:774] Writing example 40000 of 45944
I1124 21:46:46.213128 140629443692416 run_classifier.py:774] Writing example 0 of 7187
I1124 21:46:46.232873 140629443692416 run_classifier.py:461] *** Example ***
I1124 21:46:46.234083 140629443692416 run_classifier.py:462] guid: None
I1124 21:46:46.235904 140629443692416 run_classifier.py:464] tokens: [CLS] ' i came i saw i self ##ied ' : how ins ##tagram transformed the way we experience art en ##lar ##ge this image to ##ggle capt ##ion brendan sm ##ial ##owski / af ##p / get ##ty images brendan sm ##ial ##owski / af ##p / get ##ty images you are suspended in an endless dark chamber as thousands of red green yellow and bl

can not load from GC, creating dev features
45944
7187


In [0]:
train_label = train_joint['joint_label']
test_label = random_test_joint['joint_label']
test_fixed_label = fixed_test_joint['joint_label']
dev_label = dev_joint['joint_label']

In [16]:
len(set(test_fixed_label))

6

In [0]:
# train.head(500).to_csv(open('train_500.csv','w'),encoding='latin-1')
data_train = ''

In [0]:
reset_selective -f '\brandom_test_joint\b'

In [23]:
# reset_selective -f data_train
# who_ls
import sys
ipython_vars =[]# ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']


sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)


[('data_test', 117534962),
 ('test', 117469922),
 ('dev_joint', 71121300),
 ('data_dev', 70042071),
 ('dev', 69984575),
 ('train_df', 60479996),
 ('train_doc', 28013976),
 ('test_fixed_df', 16047828),
 ('test_random_df', 10887664),
 ('dev_ent_df', 9664744),
 ('dev_df', 9655496),
 ('test_fixed_doc', 7478148),
 ('test_doc', 5052140),
 ('dev_ent', 4447104),
 ('train_label', 2732054),
 ('train_joint_InputExamples', 2205344),
 ('test_joint_InputExamples_fixed', 605024),
 ('test_label', 483662),
 ('train_joint_features', 406504),
 ('test_joint_InputExamples', 390272),
 ('dev_joint_InputExamples', 345008),
 ('lines', 253640),
 ('test_fixed_label', 113468),
 ('dev_label', 64715),
 ('dev_joint_features', 61440),
 ('auth_info', 1048),
 ('AdamWeightDecayOptimizer', 904),
 ('Out', 280),
 ('In', 272),
 ('f', 144),
 ('label_list', 144),
 ('data_pref', 123),
 ('create_optimizer', 120),
 ('fix_doc_tgt', 120),
 ('load_dataset', 120),
 ('load_directory_data', 120),
 ('load_file', 120),
 ('load_paragraph

In [17]:
import gc
gc.collect()

116

In [18]:
True_ng = (len(dev_joint[dev_joint['joint_label']=='True_-1']))
True_nu = (len(dev_joint[dev_joint['joint_label']=='True_0']))
True_po = (len(dev_joint[dev_joint['joint_label']=='True_1']))
False_ng = (len(dev_joint[dev_joint['joint_label']=='False_-1']))
False_nu = (len(dev_joint[dev_joint['joint_label']=='False_0']))
False_po = (len(dev_joint[dev_joint['joint_label']=='False_1']))
print(label_list)



['False_1', 'False_0', 'False_-1', 'True_-1', 'True_1', 'True_0']


In [0]:


# class_weights_arr = [neg_w/(neg_w+pos_w),pos_w/(neg_w+pos_w)]
# class_weights_arr = [0.6,0.4]

# sent_proj = [[0,0,1],[0,1,0],[1,0,0],[1,0,0],[0,0,1],[0,1,0]]
# mask_proj = [[1,0],[1,0],[1,0],[0,1],[0,1],[0,1]]
trainable = True
# print(class_weights_arr)
# class_weights_arr = [0.5,0.5]
def create_model(bert_config,is_training, input_ids, input_mask, segment_ids, labels,
                 num_labels_mask,num_labels_sent,
                 use_one_hot_embeddings):#, bert_hub_module_handle):
  """Creates a classification model."""
    
  tags = set()
  if is_training:
    tags.add("train")
  bert_module = hub.Module(BERT_MODEL_HUB, tags=tags, trainable=trainable)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use
  # bert_outputs["sequence_output"] instead.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value


  #### mask lm task
  # print('number of labels mask: ' , num_labels_mask)

  # output_weights_mask = tf.get_variable(
  #     "output_weights", [num_labels_mask, hidden_size],
  #     initializer=tf.truncated_normal_initializer(stddev=0.02))
  # output_bias_mask = tf.get_variable(
  #     "output_bias", [num_labels_mask], initializer=tf.zeros_initializer())

  output_weights_mask = tf.get_variable(
      "output_weights", [num_labels_mask, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias_mask = tf.get_variable(
      "output_bias", [num_labels_mask], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits_mask = tf.matmul(output_layer, output_weights_mask, transpose_b=True)
    logits_mask = tf.nn.bias_add(logits_mask, output_bias_mask)
    probabilities_mask = tf.nn.softmax(logits_mask, axis=-1)
    log_probs_mask = tf.nn.log_softmax(logits_mask, axis=-1)
    print(labels.get_shape())
    # print(labels_mask.get_shape())
    # mask_proj_arr = tf.constant(mask_proj)
    mask_proj_arr = tf.constant([1,0,1,0,1,0,0,1,0,1,0,1], shape=[num_labels_mask*num_labels_sent, num_labels_mask], dtype=tf.float32)

    print(mask_proj_arr.get_shape())
    one_hot_labels_mask = tf.one_hot(labels, depth=num_labels_mask*num_labels_sent, dtype=tf.float32)
    one_hot_labels_mask = tf.matmul(one_hot_labels_mask,mask_proj_arr)
    print(one_hot_labels_mask.get_shape())

    per_example_loss_mask = -tf.reduce_sum(one_hot_labels_mask * log_probs_mask, axis=-1)
    loss_mask = tf.reduce_mean(per_example_loss_mask)





    #### sentiment task
    # print('number of labels sent: ' , num_labels_sent)

    # output_weights_sent = tf.get_variable(
    #     "output_weights", [num_labels_sent, hidden_size],
    #     initializer=tf.truncated_normal_initializer(stddev=0.02))
    # output_bias_sent = tf.get_variable(
    #     "output_bias", [num_labels_sent], initializer=tf.zeros_initializer())

    output_weights_sent = tf.get_variable(
        "output_weights", [num_labels_sent, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias_sent = tf.get_variable(
        "output_bias", [num_labels_sent], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
      if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits_sent = tf.matmul(output_layer, output_weights_sent, transpose_b=True)
    logits_sent = tf.nn.bias_add(logits_sent, output_bias_sent)
    probabilities_sent = tf.nn.softmax(logits_sent, axis=-1)
    log_probs_sent = tf.nn.log_softmax(logits_sent, axis=-1)

    # one_hot_labels_sent = tf.one_hot(labels_sent, depth=num_labels_sent, dtype=tf.float32)
    sent_proj_arr = tf.constant([0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0], shape=[num_labels_mask*num_labels_sent, num_labels_sent], dtype=tf.float32)
    one_hot_labels_sent = tf.one_hot(labels, depth=num_labels_mask*num_labels_sent, dtype=tf.float32)
    one_hot_labels_sent = tf.matmul(one_hot_labels_sent,sent_proj_arr)
    print(one_hot_labels_sent.get_shape())

    per_example_loss_sent = -tf.reduce_sum(one_hot_labels_sent * log_probs_sent, axis=-1)
    loss_sent = tf.reduce_mean(per_example_loss_sent)

# ValueError: Shape must be rank 2 but is rank 1 for 'loss/MatMul_1' (op: 'MatMul') with input shapes: [2], [6,2].
    total_loss = loss_sent + loss_mask
    total_per_example_loss = per_example_loss_sent + per_example_loss_mask

       
    return (total_loss,  per_example_loss_mask,per_example_loss_sent, logits_sent,logits_mask, probabilities_sent,probabilities_mask)


def model_fn_builder(bert_config,
                     num_labels_mask,num_labels_sent,
                     init_checkpoint, learning_rate, num_train_steps,
                     num_warmup_steps, use_tpu,use_one_hot_embeddings):# bert_hub_module_handle):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    print('label_ids ar: ', label_ids)
    print('label ids shape: , ' ,label_ids.get_shape())

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss_mask,per_example_loss_sent, logits_sent,logits_mask, probabilities_sent,probabilities_mask) =\
    create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,\
        num_labels_mask,num_labels_sent,
        use_one_hot_embeddings)

    
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)


    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
     
      train_op = create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss_sent, label_ids_sent, logits_sent, is_real_example):
        predictions_sent = tf.argmax(logits_sent, axis=-1, output_type=tf.int32)
        accuracy_sent = tf.metrics.accuracy(label_ids_sent, predictions_sent,weights=is_real_example)
        loss_sent = tf.metrics.mean(per_example_loss_sent,weights=is_real_example)
        return {
            "eval_accuracy": accuracy_sent,
            "eval_loss": loss_sent,
        }

      eval_metrics = (metric_fn, [per_example_loss_sent, label_ids_sent, logits_sent])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.PREDICT:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, predictions={"probabilities": probabilities_sent},scaffold_fn=scaffold_fn)
    else:
      raise ValueError(
          "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode))

    return output_spec

  return model_fn

In [0]:
# Force TF Hub writes to the GS bucket we provide.
## These two lines should be activated if ** is not activated
num_train_steps = int(len(train_joint_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Setup TPU related config
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
NUM_TPU_CORES = 8
# ITERATIONS_PER_LOOP = 100 # I don't know what it is doing just decrease it to smaller value
ITERATIONS_PER_LOOP = int(len(train_joint_InputExamples) / TRAIN_BATCH_SIZE) ## set as the number of iterations in each epoch 




os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR
### Activate it if ** part is not activated 
model_fn = model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels_mask=2,
    num_labels_sent=3,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True
#   bert_hub_module_handle=BERT_MODEL_HUB
)

# estimator = tf.contrib.tpu.TPUEstimator(
#   use_tpu=True,
#   model_fn=model_fn,
#   config=get_run_config(OUTPUT_DIR),
#   train_batch_size=TRAIN_BATCH_SIZE,
#   eval_batch_size=EVAL_BATCH_SIZE,
#   predict_batch_size=PREDICT_BATCH_SIZE, 
# )
# #####################################################################
## No Error
run_config = tf.contrib.tpu.RunConfig(
    keep_checkpoint_max=15,
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
# model_fn = run_classifier.model_fn_builder(
#     bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
#     num_labels=len(label_list),
#     init_checkpoint=INIT_CHECKPOINT,
#     learning_rate=LEARNING_RATE,
#     num_train_steps=num_train_steps,
#     num_warmup_steps=num_warmup_steps,
#     use_tpu=use_tpu,
#     use_one_hot_embeddings=True)

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

# estimator_from_tfhub._export_to_tpu = False


In [0]:
# Train the model
# tf.logging.set_verbosity(tf.logging.FATAL) #DEBUG,ERROR,FATAL,INFO,WARN
def model_train(estimator,train_features=train_joint_features):
  # We'll set sequences to be at most 128 tokens long.

  print('***** Started training at {} *****'.format(datetime.datetime.now()))
  print('  Num examples = {}'.format(len(train_joint_features)))
  print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
  tf.logging.info("  Num steps = %d", num_train_steps)
  train_input_fn = run_classifier.input_fn_builder(
      features=train_joint_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  print('start running estimator')
#   estimator._export_to_tpu = False
  md = estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  print('***** Finished training at {} *****'.format(datetime.datetime.now()))
  return md



#Evaluation and Prediction 

In [0]:
def model_eval(estimator,eval_examples=None,eval_features=dev_joint_features):
  # Eval the model.
#   eval_examples = dev_InputExamples#processor.get_dev_examples(TASK_DATA_DIR)
#   eval_features = run_classifier.convert_examples_to_features(
#       eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
  print('  Num examples = {}'.format(len(eval_examples)))
  print('  Batch size = {}'.format(EVAL_BATCH_SIZE))

  # Eval will be slightly WRONG on the TPU because it will truncate
  # the last batch.
  eval_steps = int(len(eval_examples) / EVAL_BATCH_SIZE)
  eval_input_fn = run_classifier.input_fn_builder(
      features=eval_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=True)
  result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
  print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
  output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
#   with tf.gfile.GFile(output_eval_file, "w") as writer:
  print("***** Eval results *****")
  for key in sorted(result.keys()):
      print('  {} = {}'.format(key, str(result[key])))
#       writer.write("%s = %s\n" % (key, str(result[key])))
      
  return result

In [0]:
import numpy as np
from sklearn import metrics
# labels = [1,0]
# def model_predict(estimator,prediction_examples):
#   # Make predictions on a subset of eval examples
# #   prediction_examples = processor.get_dev_examples(TASK_DATA_DIR)[:PREDICT_BATCH_SIZE]
#   input_features = run_classifier.convert_examples_to_features(prediction_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
#   predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
#   predictions = estimator.predict(predict_input_fn)
#   return [(sentence, prediction['probabilities']) for sentence, prediction in zip(prediction_examples, predictions)]

def model_predict(estimator,input_features,input_examples,checkpoint_path=None):
  # Make predictions on a subset of eval examples
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  if checkpoint_path: 
    predictions = estimator.predict(predict_input_fn,checkpoint_path=checkpoint_path)
  else:
    predictions = estimator.predict(predict_input_fn)


  return [(sentence, prediction['probabilities']) for sentence, prediction in zip(input_examples, predictions)]



In [0]:
tf.logging.set_verbosity(tf.logging.FATAL)
model_train(estimator,train_features=train_joint_features)

***** Started training at 2019-11-25 00:13:19.236149 *****
  Num examples = 45944
  Batch size = 16
start running estimator
label_ids ar:  Tensor("InfeedQueue/dequeue:2", shape=(2,), dtype=int32, device=/device:TPU_REPLICATED_CORE:0)
label ids shape: ,  (2,)


E1125 00:14:29.805067 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1125 00:14:29.806912 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_mask) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1125 00:14:29.813230 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/segment_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1125 00:14:29.815362 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/mlm_positions) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1125 00:14:29.826102 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/bert/embeddings/word_embeddings) is not supported on the TPU. Execution will fail if this op is used in the grap

(2,)
(6, 2)
(2, 2)
(2, 3)
excluded trainable variables: >> [<tf.Variable 'module/bert/embeddings/word_embeddings:0' shape=(30522, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/token_type_embeddings:0' shape=(2, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/position_embeddings:0' shape=(512, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/LayerNorm/beta:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/embeddings/LayerNorm/gamma:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_9/attention/self/query/kernel:0' shape=(768, 768) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_9/attention/self/query/bias:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_9/attention/self/key/kernel:0' shape=(768, 768) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_9/attention/self/key/bias:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_9/attention/self/value/kernel:0' shape=(768, 7

In [90]:
# dev_ent['joint_label'] = dev_ent['sentiment']
dev_ent['joint_label'] = 'False_'+dev_ent['polarity'].astype(str)

dev_ent_InputExamples = dev_ent.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

dev_ent_features = run_classifier.convert_examples_to_features(dev_ent_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

pd = model_predict(estimator,dev_ent_features,dev_ent_InputExamples)
true_label = list(dev_ent['polarity']+1)
labels_val = []
for item in pd:
    labels_val.append(np.argmax(item[1]))

print(labels_val)
print(true_label)
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))


[2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 

In [0]:
train['joint_label'] = 'False_'+train['polarity'].astype(str)

train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

train_features = run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

pd = model_predict(estimator,train_features,train_InputExamples)
true_label = list(train['polarity']+1)
labels_val = []
for item in pd:
    labels_val.append(np.argmax(item[1]))

print(labels_val)
print(true_label)
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

In [73]:
for i in range(2871,5743,2871):
  print('evaluating epoch: %d'%(i/2871))
  pd = model_predict(estimator,dev_joint_features,dev_joint_InputExamples,checkpoint_path=OUTPUT_DIR+'/model.ckpt-%d'%(i))
  true_label = list(dev_joint['label'])
  labels_val = []
  for item in pd:
    labels_val.append(np.argmax(item[1]))
  print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
  print(metrics.classification_report(y_pred=labels_val,y_true = true_label))


evaluating epoch: 1
label_ids ar:  Tensor("InfeedQueue/dequeue:2", shape=(1,), dtype=int32, device=/device:TPU_REPLICATED_CORE:0)
label ids shape: ,  (1,)


E1124 23:33:54.967506 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:33:54.969516 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_mask) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:33:54.971601 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/segment_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:33:54.974431 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/mlm_positions) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:33:54.986263 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/bert/embeddings/word_embeddings) is not supported on the TPU. Execution will fail if this op is used in the grap

(1,)
(6, 2)
(1, 2)
(1, 3)


Exception tensorflow.python.framework.errors_impl.CancelledError: CancelledError() in <generator object predict at 0x7fe668dd11e0> ignored
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


[[   0 1020 1541]
 [   0 1750 2876]
 [   0    0    0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2561
           1       0.63      0.38      0.47      4626
           2       0.00      0.00      0.00         0

   micro avg       0.24      0.24      0.24      7187
   macro avg       0.21      0.13      0.16      7187
weighted avg       0.41      0.24      0.30      7187

evaluating epoch: 2
label_ids ar:  Tensor("InfeedQueue/dequeue:2", shape=(1,), dtype=int32, device=/device:TPU_REPLICATED_CORE:0)
label ids shape: ,  (1,)


E1124 23:34:52.494195 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:34:52.496404 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/input_mask) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:34:52.499064 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/segment_ids) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:34:52.502912 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/mlm_positions) is not supported on the TPU. Execution will fail if this op is used in the graph. 
E1124 23:34:52.511504 140629443692416 tpu.py:425] Operation of type Placeholder (module_apply_tokens/bert/embeddings/word_embeddings) is not supported on the TPU. Execution will fail if this op is used in the grap

(1,)
(6, 2)
(1, 2)
(1, 3)
[[   0 1097 1464]
 [   0 1832 2794]
 [   0    0    0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2561
           1       0.63      0.40      0.48      4626
           2       0.00      0.00      0.00         0

   micro avg       0.25      0.25      0.25      7187
   macro avg       0.21      0.13      0.16      7187
weighted avg       0.40      0.25      0.31      7187



Exception tensorflow.python.framework.errors_impl.CancelledError: CancelledError() in <generator object predict at 0x7fe65f269550> ignored


In [0]:
pd = model_predict(estimator,test_features_fixed,test_InputExamples_fixed)
true_label = list(test_fixed['label'])
labels_val = []
for item in pd:
    labels_val.append(np.argmax(item[1]))
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

In [0]:
# for i in range(1,13):
  
#   print('processing epoch: %d'%i)
#   pd = model_predict(estimator,dev_features,dev_InputExamples,checkpoint_path=OUTPUT_DIR+'/saved_models/model.ckpt-%d'%(i*3000))
#   true_label = list(dev['label'])
#   labels_val = []
#   for item in pd:
#       labels_val.append(np.argmax(item[1]))
#   print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
#   print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

In [0]:
import numpy as np
import pandas
pd = model_predict(estimator,dev_features,dev_InputExamples,checkpoint_path=OUTPUT_DIR+'/model.ckpt-%d'%(1392*3))
true_label = list(dev['label'])
labels_val = []
for item in pd:
      labels_val.append(np.argmax(item[1]))
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))
print(labels_val)
print(dev)
dev['predicted'] = pandas.Series(labels_val)
dev.to_csv('/content/dev_predicted.csv', encoding='utf8')


(1, 2)
[[ 969  259]
 [ 194 2058]]
              precision    recall  f1-score   support

       False       0.83      0.79      0.81      1228
        True       0.89      0.91      0.90      2252

   micro avg       0.87      0.87      0.87      3480
   macro avg       0.86      0.85      0.86      3480
weighted avg       0.87      0.87      0.87      3480

[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,

# New Section

In [0]:
# ## Run it if you want to train for a range of epochs and see the validation error and save the prediction on than
# ## ** Part Name **
# # mds = []
# # evs = []
# pds_dev = []
# pds_tr = []
# tf.logging.set_verbosity(tf.logging.FATAL) 

# for i in range(1,11):
#   print('----------------------- Starting Epoch %d-----------------------'%i)

#   NUM_TRAIN_EPOCHS = i
  
#   num_train_steps = int(len(train_InputExamples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
#   num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# #   model_fn = model_fn_builder(
# #   num_labels=len(label_list),
# #   learning_rate=LEARNING_RATE,
# #   num_train_steps=num_train_steps,
# #   num_warmup_steps=num_warmup_steps,
# #   use_tpu=True,
# #   bert_hub_module_handle=BERT_MODEL_HUB)

#   model_fn = model_fn_builder(
#   bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
#   num_labels=len(label_list),
#   init_checkpoint=INIT_CHECKPOINT,
#   learning_rate=LEARNING_RATE,
#   num_train_steps=num_train_steps,
#   num_warmup_steps=num_warmup_steps,
#   use_tpu=True,
#   use_one_hot_embeddings=True)
  
  
#   estimator_from_tfhub = tf.contrib.tpu.TPUEstimator(
#   use_tpu=True,
#   model_fn=model_fn,
#   config=run_config,
#   train_batch_size=TRAIN_BATCH_SIZE,
#   eval_batch_size=EVAL_BATCH_SIZE,
#   predict_batch_size=PREDICT_BATCH_SIZE)
  
#   model_train(estimator_from_tfhub)
# #   ev = model_eval(estimator_from_tfhub)

# #   print(' -------------------- Train Prediction --------------------')
# #   pd = model_predict(estimator_from_tfhub,train_features,train_InputExamples)
# #   true_label = list(train['label'])
# #   pds_tr.append(pd)
# #   labels_val = []
# #   for item in pd:
# #     labels_val.append(np.argmax(item[1]))
# #   print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
# #   print(metrics.classification_report(y_pred=labels_val,y_true = true_label))
  
  
#   print(' -------------------- Dev Prediction --------------------')
#   pd = model_predict(estimator_from_tfhub,dev_features[:3100],dev_InputExamples[:3100])
#   true_label = list(dev['label'][:3100])
#   pds_dev.append(pd)
#   labels_val = []
#   for item in pd:
#     labels_val.append(np.argmax(item[1]))
#   print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
#   print(metrics.classification_report(y_pred=labels_val,y_true = true_label))




# for tr,dv in zip(pds_tr,pds_dev):
#   labels_val = []
#   true_label = list(train['label'])
#   for item in tr:
#     labels_val.append(np.argmax(item[1]))
#   print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
#   print(metrics.classification_report(y_pred=labels_val,y_true = true_label))
  
#   labels_val = []
#   true_label = list(dev['label'][:3100])
#   for item in dv:
#     labels_val.append(np.argmax(item[1]))
#   print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
#   print(metrics.classification_report(y_pred=labels_val,y_true = true_label))


----------------------- Starting Epoch 1-----------------------
***** Started training at 2019-05-15 17:57:14.080193 *****
  Num examples = 122150
  Batch size = 8
start running estimator
(1, 2)
excluded trainable variables: >> [<tf.Variable 'module/bert/embeddings/word_embeddings:0' shape=(30522, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/token_type_embeddings:0' shape=(2, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/position_embeddings:0' shape=(512, 768) dtype=float32>, <tf.Variable 'module/bert/embeddings/LayerNorm/beta:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/embeddings/LayerNorm/gamma:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_10/attention/self/query/kernel:0' shape=(768, 768) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_10/attention/self/query/bias:0' shape=(768,) dtype=float32>, <tf.Variable 'module/bert/encoder/layer_10/attention/self/key/kernel:0' shape=(768, 768) dtype=float32>, <tf.Variable

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [0]:
# prd = model_predict(estimator,train_features,train_InputExamples)
# prd = [item for item in predictions]
len(prd)

(4, 2)


InvalidArgumentError: ignored

In [0]:
len(a)
# import numpy as np
# from sklearn import metrics

# labels = ["Negative","Neutral", "Positive"]
# labels_val = []
# for item in predictions:
#   labels_val.append(labels[np.argmax(item[1])])
# true_label = list(dev['sentiment'])
# print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
# print(metrics.classification_report(y_pred=labels_val,y_true = true_label))
# labels_val = []
# for item in predictions:
#   labels_val.append(np.argmax(item[1]))
# true_label = list(train['label'])
# print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
# print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

[[  0  48]
 [  0 152]]
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        48
        True       0.76      1.00      0.86       152

   micro avg       0.76      0.76      0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.58      0.76      0.66       200



In [0]:
predictions = model_predict(estimator_from_tfhub,test_InputExamples_fixed)
labels_val = []
for item in predictions:
  labels_val.append(labels[np.argmax(item[1])])
true_label = list(test_fixed['sentiment'])




NameError: ignored

In [0]:
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

In [0]:
# tf.logging.set_verbosity(tf.logging.DEBUG) #DEBUG,ERROR,FATAL,INFO,WARN
# predictions = model_predict(estimator_from_tfhub,test_InputExamples)
labels_val = []
for item in predictions:
  labels_val.append(labels[np.argmax(item[1])])
true_label = list(test['sentiment'])


In [0]:
# len(labels_val)
# len(test_InputExamples)
# len(predictions)
model_eval(estimator_from_tfhub)

***** Started evaluation at 2019-05-09 17:10:37.771525 *****
  Num examples = 3479
  Batch size = 8
(1, 2)


RuntimeError: ignored

In [0]:
print(metrics.confusion_matrix(y_pred=labels_val,y_true=true_label))
print(metrics.classification_report(y_pred=labels_val,y_true = true_label))

In [0]:
####seq 128, Small BERT, Batchsize 32 for train, 8 for dev and test
# 1)I0423 16:57:01.507206 140076901431168 basic_session_run_hooks.py:249] loss = 0.40919852, step = 46
# 2)I0423 18:08:31.116359 140500355729280 basic_session_run_hooks.py:249] loss = 0.7756149, step = 92
# 4)Loss for final step: 0.6891643.
# 5)Loss for final step: 0.9730371
# 6)Loss for final step: 0.4775733
# 7)Loss for final step: 1.2802429.
# 8)Loss for final step: 0.509207
# 9)loss = 0.29262337, step = 414
# 10)Loss for final step: 0.47267017


# 2)***** Eval results *****
#   eval_accuracy = 0.5857143
#   eval_loss = 0.8630275
#   global_step = 92
#   loss = 0.79767215
# 3)***** Eval results *****
#   eval_accuracy = 0.5857143
#   eval_loss = 0.8630275
#   global_step = 138
#   loss = 0.79767215
# 4)***** Eval results *****
#   eval_accuracy = 0.5857143
#   eval_loss = 0.86550355
#   global_step = 184
#   loss = 0.83540887
# 5)***** Eval results *****
#   eval_accuracy = 0.5964286
#   eval_loss = 0.88053304
#   global_step = 230
#   loss = 0.91362196
# 6)***** Eval results *****
#   eval_accuracy = 0.5857143
#   eval_loss = 0.90498805
#   global_step = 276
#   loss = 1.0806552
# 7)***** Eval results *****
#   eval_accuracy = 0.56785715
#   eval_loss = 0.92742974
#   global_step = 322
#   loss = 1.0180835
# 8)***** Eval results *****
#   eval_accuracy = 0.5607143
#   eval_loss = 0.9436406
#   global_step = 368
#   loss = 0.9721719
# 9)***** Eval results *****
#   eval_accuracy = 0.5607143
#   eval_loss = 0.9714315
#   global_step = 414
#   loss = 0.5798999
# 10)***** Eval results *****
#   eval_accuracy = 0.54285717
#   eval_loss = 1.0072392
#   global_step = 460
#   loss = 1.053762
  
#   DEV Info:
# 1)[[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284
# 2)[[  0   9  14]
#  [  0  25  72]
#  [  0  21 143]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.45      0.26      0.33        97
#     Positive       0.62      0.87      0.73       164

#    micro avg       0.59      0.59      0.59       284
#    macro avg       0.36      0.38      0.35       284
# weighted avg       0.52      0.59      0.53       284
# 3)[[  0   9  14]
#  [  0  25  72]
#  [  0  21 143]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.45      0.26      0.33        97
#     Positive       0.62      0.87      0.73       164

#    micro avg       0.59      0.59      0.59       284
#    macro avg       0.36      0.38      0.35       284
# weighted avg       0.52      0.59      0.53       284
# 4)[[  0   8  42]
#  [  0  16 100]
#  [  0  17 157]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        50
#      Neutral       0.39      0.14      0.20       116
#     Positive       0.53      0.90      0.66       174

#    micro avg       0.51      0.51      0.51       340
#    macro avg       0.31      0.35      0.29       340
# weighted avg       0.40      0.51      0.41       340
# 5)[[  0  17   6]
#  [  0  40  57]
#  [  0  35 129]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.43      0.41      0.42        97
#     Positive       0.67      0.79      0.72       164

#    micro avg       0.60      0.60      0.60       284
#    macro avg       0.37      0.40      0.38       284
# weighted avg       0.54      0.60      0.56       284
# 6)[[  0  19   4]
#  [  0  44  53]
#  [  0  42 122]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.42      0.45      0.44        97
#     Positive       0.68      0.74      0.71       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.37      0.40      0.38       284
# weighted avg       0.54      0.58      0.56       284
# 7)[[  0  18   5]
#  [  0  46  51]
#  [  0  47 117]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.41      0.47      0.44        97
#     Positive       0.68      0.71      0.69       164

#    micro avg       0.57      0.57      0.57       284
#    macro avg       0.36      0.40      0.38       284
# weighted avg       0.53      0.57      0.55       284
# 8)[[  0  18   5]
#  [  0  48  49]
#  [  0  53 111]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.40      0.49      0.44        97
#     Positive       0.67      0.68      0.67       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.36      0.39      0.37       284
# weighted avg       0.53      0.56      0.54       284
# 9)[[  0  18   5]
#  [  0  48  49]
#  [  0  54 110]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.40      0.49      0.44        97
#     Positive       0.67      0.67      0.67       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.36      0.39      0.37       284
# weighted avg       0.52      0.56      0.54       284
# 10)[[  0  18   5]
#  [  0  51  46]
#  [  0  62 102]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.39      0.53      0.45        97
#     Positive       0.67      0.62      0.64       164

#    micro avg       0.54      0.54      0.54       284
#    macro avg       0.35      0.38      0.36       284
# weighted avg       0.52      0.54      0.52       284


In [0]:
###### Seq:256, small BERT, batch size 32 for train, 8 for test and dev
# 3) {'loss': 0.79650426, 'eval_accuracy': 0.58214283, 'eval_loss': 0.86125755, 'global_step': 138}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# 4) {'loss': 0.73700047, 'eval_accuracy': 0.5928571, 'eval_loss': 0.8223035, 'global_step': 184}
# [[  0  11  12]
#  [  0  20  77]
#  [  0  17 147]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.42      0.21      0.28        97
#     Positive       0.62      0.90      0.73       164

#    micro avg       0.59      0.59      0.59       284
#    macro avg       0.35      0.37      0.34       284
# weighted avg       0.50      0.59      0.52       284

# 5) {'loss': 0.71594626, 'eval_accuracy': 0.5928571, 'eval_loss': 0.82239425, 'global_step': 230}
# [[  0  19   4]
#  [  0  51  46]
#  [  0  48 116]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.43      0.53      0.47        97
#     Positive       0.70      0.71      0.70       164

#    micro avg       0.59      0.59      0.59       284
#    macro avg       0.38      0.41      0.39       284
# weighted avg       0.55      0.59      0.57       284

# 6) {'loss': 0.7227276, 'eval_accuracy': 0.5857143, 'eval_loss': 0.8479867, 'global_step': 276}
# [[  0  21   2]
#  [  2  51  44]
#  [  1  49 114]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.42      0.53      0.47        97
#     Positive       0.71      0.70      0.70       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.38      0.41      0.39       284
# weighted avg       0.56      0.58      0.57       284

# 7){'loss': 0.83304703, 'eval_accuracy': 0.5642857, 'eval_loss': 0.90589315, 'global_step': 322}
# [[  0  19   4]
#  [  4  40  53]
#  [  3  42 119]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.40      0.41      0.40        97
#     Positive       0.68      0.73      0.70       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.36      0.38      0.37       284
# weighted avg       0.53      0.56      0.54       284

# 8){'loss': 0.84049994, 'eval_accuracy': 0.575, 'eval_loss': 0.93640614, 'global_step': 368}
# [[  2  18   3]
#  [  5  42  50]
#  [  2  44 118]]
#               precision    recall  f1-score   support

#     Negative       0.22      0.09      0.12        23
#      Neutral       0.40      0.43      0.42        97
#     Positive       0.69      0.72      0.70       164

#    micro avg       0.57      0.57      0.57       284
#    macro avg       0.44      0.41      0.42       284
# weighted avg       0.55      0.57      0.56       284

# 9){'loss': 0.8601472, 'eval_accuracy': 0.5642857, 'eval_loss': 0.95250976, 'global_step': 414}
# [[  0  18   5]
#  [  4  37  56]
#  [  1  41 122]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.39      0.38      0.38        97
#     Positive       0.67      0.74      0.70       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.35      0.38      0.36       284
# weighted avg       0.52      0.56      0.54       284


# 10){'loss': 0.9091368, 'eval_accuracy': 0.5535714, 'eval_loss': 0.9715489, 'global_step': 460}
# [[  0  19   4]
#  [  4  37  56]
#  [  1  44 119]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.37      0.38      0.38        97
#     Positive       0.66      0.73      0.69       164

#    micro avg       0.55      0.55      0.55       284
#    macro avg       0.34      0.37      0.36       284
# weighted avg       0.51      0.55      0.53       284

# 11){'loss': 0.9756337, 'eval_accuracy': 0.5714286, 'eval_loss': 1.0129306, 'global_step': 506}
# [[  3  16   4]
#  [  5  36  56]
#  [  1  41 122]]
#               precision    recall  f1-score   support

#     Negative       0.33      0.13      0.19        23
#      Neutral       0.39      0.37      0.38        97
#     Positive       0.67      0.74      0.71       164

#    micro avg       0.57      0.57      0.57       284
#    macro avg       0.46      0.42      0.42       284
# weighted avg       0.55      0.57      0.55       284

# 12){'loss': 0.9551747, 'eval_accuracy': 0.56785715, 'eval_loss': 1.0222387, 'global_step': 552}
# [[  3  17   3]
#  [  5  38  54]
#  [  2  43 119]]
#               precision    recall  f1-score   support

#     Negative       0.30      0.13      0.18        23
#      Neutral       0.39      0.39      0.39        97
#     Positive       0.68      0.73      0.70       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.45      0.42      0.42       284
# weighted avg       0.55      0.56      0.55       284

# 13){'loss': 1.0276382, 'eval_accuracy': 0.5714286, 'eval_loss': 1.0547612, 'global_step': 598}
# [[  3  17   3]
#  [  5  36  56]
#  [  1  41 122]]
#               precision    recall  f1-score   support

#     Negative       0.33      0.13      0.19        23
#      Neutral       0.38      0.37      0.38        97
#     Positive       0.67      0.74      0.71       164

#    micro avg       0.57      0.57      0.57       284
#    macro avg       0.46      0.42      0.42       284
# weighted avg       0.55      0.57      0.55       284

# 14){'loss': 1.0036505, 'eval_accuracy': 0.55714285, 'eval_loss': 1.0697843, 'global_step': 644}
# [[  3  17   3]
#  [  7  36  54]
#  [  2  44 118]]
#               precision    recall  f1-score   support

#     Negative       0.25      0.13      0.17        23
#      Neutral       0.37      0.37      0.37        97
#     Positive       0.67      0.72      0.70       164

#    micro avg       0.55      0.55      0.55       284
#    macro avg       0.43      0.41      0.41       284
# weighted avg       0.54      0.55      0.54       284

# 15){'loss': 1.0295725, 'eval_accuracy': 0.5642857, 'eval_loss': 1.0809377, 'global_step': 690}
# [[  3  17   3]
#  [  5  39  53]
#  [  1  46 117]]
#               precision    recall  f1-score   support

#     Negative       0.33      0.13      0.19        23
#      Neutral       0.38      0.40      0.39        97
#     Positive       0.68      0.71      0.69       164

#    micro avg       0.56      0.56      0.56       284
#    macro avg       0.46      0.42      0.42       284
# weighted avg       0.55      0.56      0.55       284

# 16){'loss': 1.0889318, 'eval_accuracy': 0.54642856, 'eval_loss': 1.1056138, 'global_step': 736}
# [[  3  17   3]
#  [  8  33  56]
#  [  2  44 118]]
#               precision    recall  f1-score   support

#     Negative       0.23      0.13      0.17        23
#      Neutral       0.35      0.34      0.35        97
#     Positive       0.67      0.72      0.69       164

#    micro avg       0.54      0.54      0.54       284
#    macro avg       0.42      0.40      0.40       284
# weighted avg       0.52      0.54      0.53       284

# 17){'loss': 1.1312778, 'eval_accuracy': 0.54642856, 'eval_loss': 1.1440269, 'global_step': 782}
# [[  3  17   3]
#  [  9  33  55]
#  [  2  44 118]]
#               precision    recall  f1-score   support

#     Negative       0.21      0.13      0.16        23
#      Neutral       0.35      0.34      0.35        97
#     Positive       0.67      0.72      0.69       164

#    micro avg       0.54      0.54      0.54       284
#    macro avg       0.41      0.40      0.40       284
# weighted avg       0.52      0.54      0.53       284

# 18){'loss': 1.1436831, 'eval_accuracy': 0.55, 'eval_loss': 1.1613237, 'global_step': 828}
# [[  3  16   4]
#  [  8  35  54]
#  [  2  45 117]]
#               precision    recall  f1-score   support

#     Negative       0.23      0.13      0.17        23
#      Neutral       0.36      0.36      0.36        97
#     Positive       0.67      0.71      0.69       164

#    micro avg       0.55      0.55      0.55       284
#    macro avg       0.42      0.40      0.41       284
# weighted avg       0.53      0.55      0.54       284

# 19){'loss': 1.1817628, 'eval_accuracy': 0.55, 'eval_loss': 1.1834545, 'global_step': 874}
# [[  3  16   4]
#  [  8  34  55]
#  [  1  45 118]]
#               precision    recall  f1-score   support

#     Negative       0.25      0.13      0.17        23
#      Neutral       0.36      0.35      0.35        97
#     Positive       0.67      0.72      0.69       164

#    micro avg       0.55      0.55      0.55       284
#    macro avg       0.42      0.40      0.41       284
# weighted avg       0.53      0.55      0.53       284

# 20){'loss': 1.1921012, 'eval_accuracy': 0.54642856, 'eval_loss': 1.1976833, 'global_step': 920}
# [[  3  16   4]
#  [  9  35  53]
#  [  2  47 115]]
#               precision    recall  f1-score   support

#     Negative       0.21      0.13      0.16        23
#      Neutral       0.36      0.36      0.36        97
#     Positive       0.67      0.70      0.68       164

#    micro avg       0.54      0.54      0.54       284
#    macro avg       0.41      0.40      0.40       284
# weighted avg       0.53      0.54      0.53       284

# 21){'loss': 1.2514663, 'eval_accuracy': 0.53571427, 'eval_loss': 1.2244278, 'global_step': 966}
# [[  3  16   4]
#  [ 11  32  54]
#  [  2  47 115]]
#               precision    recall  f1-score   support

#     Negative       0.19      0.13      0.15        23
#      Neutral       0.34      0.33      0.33        97
#     Positive       0.66      0.70      0.68       164

#    micro avg       0.53      0.53      0.53       284
#    macro avg       0.40      0.39      0.39       284
# weighted avg       0.51      0.53      0.52       284

# 22){'loss': 1.2630298, 'eval_accuracy': 0.5321429, 'eval_loss': 1.2527977, 'global_step': 1012}
# [[  5  14   4]
#  [ 14  32  51]
#  [  9  43 112]]
#               precision    recall  f1-score   support

#     Negative       0.18      0.22      0.20        23
#      Neutral       0.36      0.33      0.34        97
#     Positive       0.67      0.68      0.68       164

#    micro avg       0.52      0.52      0.52       284
#    macro avg       0.40      0.41      0.41       284
# weighted avg       0.52      0.52      0.52       284

# 23){'loss': 1.3330169, 'eval_accuracy': 0.53571427, 'eval_loss': 1.2815608, 'global_step': 1058}
# [[  6  13   4]
#  [ 14  31  52]
#  [  9  42 113]]
#               precision    recall  f1-score   support

#     Negative       0.21      0.26      0.23        23
#      Neutral       0.36      0.32      0.34        97
#     Positive       0.67      0.69      0.68       164

#    micro avg       0.53      0.53      0.53       284
#    macro avg       0.41      0.42      0.42       284
# weighted avg       0.53      0.53      0.53       284

# 24){'loss': 1.3111317, 'eval_accuracy': 0.5321429, 'eval_loss': 1.2898273, 'global_step': 1104}
# [[  5  14   4]
#  [ 14  32  51]
#  [  9  43 112]]
#               precision    recall  f1-score   support

#     Negative       0.18      0.22      0.20        23
#      Neutral       0.36      0.33      0.34        97
#     Positive       0.67      0.68      0.68       164

#    micro avg       0.52      0.52      0.52       284
#    macro avg       0.40      0.41      0.41       284
# weighted avg       0.52      0.52      0.52       284

In [0]:
## Larg BERT seq:256, batchsize 8 for train, dev , test
# {'loss': 1.0679293, 'eval_accuracy': 0.5955882, 'eval_loss': 0.8965841, 'global_step': 93}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# {'loss': 1.0362501, 'eval_accuracy': 0.5955882, 'eval_loss': 0.88540924, 'global_step': 186}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# {'loss': 1.0290864, 'eval_accuracy': 0.5955882, 'eval_loss': 0.8812368, 'global_step': 279}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# {'loss': 1.0271251, 'eval_accuracy': 0.5955882, 'eval_loss': 0.88323754, 'global_step': 372}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# {'loss': 1.02654, 'eval_accuracy': 0.5955882, 'eval_loss': 0.8804489, 'global_step': 465}
# [[  0   0  23]
#  [  0   0  97]
#  [  0   0 164]]
#               precision    recall  f1-score   support

#     Negative       0.00      0.00      0.00        23
#      Neutral       0.00      0.00      0.00        97
#     Positive       0.58      1.00      0.73       164

#    micro avg       0.58      0.58      0.58       284
#    macro avg       0.19      0.33      0.24       284
# weighted avg       0.33      0.58      0.42       284

# SAME FOR 10 EPOCHS!!!!!

In [0]:
predictions

In [0]:
len(data_train)

22284

In [0]:
len(data_train['DOCUMENT'])

22284