### Code to load amazon review data into a format compatible with keras

In [42]:

# import gzip
# import json
# import numpy as np
# data_path='/home/hani/Downloads/Software_5.json.gz'
# destination_dir="/home/hani/Data/amazon-review"
# f = gzip.open(data_path, 'rb')
# file_content = f.readline()
# counter=0
# while file_content:
#     try:
        
#         line = json.loads(file_content)
#         line["reviewText"]!="" and line["overall"]!=""
        
#         if np.random.binomial(2, 0.15, 1)[0]==0:#train
#             f2 = open(destination_dir+"/train/"+str(int(line["overall"]))+"/"+str(counter)+".txt" , "w")
#         else:
#             f2 = open(destination_dir+"/test/"+str(int(line["overall"]))+"/"+str(counter)+".txt" , "w")
    
#         f2.write(line["reviewText"])
#         f2.close()
#         file_content = f.readline()
#         counter+=1
#     except:
#         file_content = f.readline()
#         counter+=1
# f.close()


In [1]:
conf={
    'epochs':5,
    'init_lr':1e-5,
    'drop_out_rate':0.05,
    'n_classes': 5,
    'batch_size':32
}

### Load data into keras text processing

In [2]:
import numpy as np
import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE
# batch_size = 32
seed = 42
data_path="/home/hani/Data/amazon-review"

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_path+'/train',
    batch_size=conf['batch_size'],
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_path+'/train',
    batch_size=conf['batch_size'],
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_path+'/test',
    batch_size=conf['batch_size'])

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)



Found 9250 files belonging to 5 classes.
Using 7400 files for training.
Found 9250 files belonging to 5 classes.
Using 1850 files for validation.
Found 3554 files belonging to 5 classes.


In [3]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(3):
        print(f'Review: {text_batch.numpy()[i]}')
        label = label_batch.numpy()[i]
        print(f'Label : {label} ({class_names[label]})')

Review: b'NOT REALLY FREE. WAS ANOYED WITH THAT.'
Label : 0 (1)
Review: b'Works Great'
Label : 4 (5)
Review: b'The basic essentials are covered in this package providing word processing, spreadsheets and presentations. Installation from the USB stick is a simple task.\n\nCorel Home Office is a fine product for its price point. The applications look and operate much like their Microsoft Office [2007] counterparts and provide adequately comparable functionality - there are fewer transitions for presentations, fewer charts for spreadsheets, less complex word processing features, a slightly less streamlined and feature-filled interface. Core features and tools are included; most of what\'s lacking would be missed primarily by the power user.\n\nIt\'s a fine product for small-scale business or for anyone unwilling to shell out big bucks for productivity software - not a bad choice for students on a budget.\n\nTransitioning to and from a larger suite is easy enough: Corel Write\'s "Tools" me

In [46]:
# next(iter(train_ds))[1]

### Test Pre-processing and bert 

In [47]:
# !pip install -q tensorflow-text
# !pip install -q tf-models-official
# !pip install pydot
# !pip install graphviz

In [3]:
import os
# import shutil

import time
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

# import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [5]:
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'
tfhub_handle_encoder    = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [50]:
text_test = ['it is a great product.']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [ 101 2009 2003 1037 2307 4031 1012  102    0    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 0 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [51]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
# print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
# print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Sequence Outputs Shape:(1, 128, 512)


### Build a sentiment class using last layer of bert with dropout and one classifier softmax layer

In [52]:
class SentimentPredictor(tf.keras.Model):
    def __init__(self,drop_out_rate=0.1,n_classes=5,
                 tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1',
                 tfhub_handle_encoder    = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'):
        super(SentimentPredictor, self).__init__()
        self.n_classes=n_classes
        self.preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
        self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
        self.dropout_layer = tf.keras.layers.Dropout(drop_out_rate)
        self.out_layer = tf.keras.layers.Dense(n_classes, activation=None, name='classifier')

    def call(self, inp):
        self.encoder_inputs = self.preprocessing_layer(inp)
        self.bert_out = self.encoder(self.encoder_inputs)['pooled_output']
        x =self.dropout_layer(self.bert_out)
        self.final_output = self.out_layer(x)
        return self.final_output

In [53]:


classifier = SentimentPredictor(conf['drop_out_rate'],conf['n_classes'])
classifier(tf.constant(['it is a great product.']))

<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
array([[ 1.0428904, -0.7776122, -0.9397965,  1.1107899,  0.7062758]],
      dtype=float32)>

In [4]:

def loss_function(real, pred):
    loss_object    = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss_ = loss_object(real, pred)
    return tf.reduce_sum(loss_)



train_acc = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1)
train_loss = tf.keras.metrics.Mean(name='train_loss')

valid_acc = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1)
valid_loss = tf.keras.metrics.Mean(name='valid_loss')
# train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

init_lr = conf['init_lr']


epochs=conf['epochs']
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')



In [8]:
int(0.1*num_train_steps)

116

In [55]:
# class HaniSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
#       def __init__(self, warmup_steps=3, initial_lr=0.001):
#         super(HaniSchedule, self).__init__()

#         self.bvalid = np.inf
#         self.initial_lr= initial_lr
#         self.warmup_steps = warmup_steps
#         self.not_imp= 0
#         self.div=1
#       def check(self,cval):
#         if cval<self.bvalid:
#           self.bvalid=cval
#           self.not_imp= 0
#           print(self.bvalid)
#         else:
#           self.not_imp+=1
#           print(self.bvalid,".")
#           if self.not_imp>self.warmup_steps:
#             self.div+=1
#             self.not_imp=0
#             print(self.bvalid,"**")

#       def __call__(self, step):
#         return self.initial_lr /self.div

# lr_schedule = HaniSchedule(warmup_steps=num_train_steps*0.1,initial_lr=conf['init_lr'])
# optimizer=tf.keras.optimizers.RMSprop(learning_rate=lr_schedule,rho=0.7)

In [56]:
# m = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1)
# met = tf.keras.metrics.Mean('tmp')
# print(m([1, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]))
# print(m([0, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]))
# print(met(m.result()))
# print(m.result().numpy())
# m.update_state([0, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
# print(m.result().numpy())

# print(met(m))

In [57]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(classifier=classifier,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)

# # if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [58]:
train_step_signature = [
    tf.TensorSpec(shape=(None, ), dtype=tf.string),
    tf.TensorSpec(shape=(None, ), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    
    with tf.GradientTape() as tape:
        predictions = classifier(inp)
        loss = loss_function(tar, predictions)
        
    gradients = tape.gradient(loss, classifier.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, classifier.trainable_variables))
    
    train_loss(loss)
    train_acc(tar, predictions)
    
    
@tf.function(input_signature=train_step_signature)
def valid_step(inp, tar):
    predictions = classifier(inp)    
    
    valid_loss(loss_function(tar, predictions))
    valid_acc(tar, predictions)

def reset_stats():
    train_loss.reset_states()
    train_acc.reset_states()
    
    valid_loss.reset_states()
    valid_acc.reset_states()

In [59]:
# best_val_loss=np.inf

# for epoch in range(epochs):
#     start = time.time()

#     reset_stats()

#     for (batch, (inp, tar)) in enumerate(train_ds):
#         train_step(inp, tar)
#         if (batch+1) %100==0:
#             print('{}/{}'.format(batch+1,steps_per_epoch), end="\r")
        
#     print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
#           epoch + 1, train_loss.result(), train_acc.result()))
# #     s=[tf.reduce_sum(w).numpy() for w in classifier.weights  if w.shape!=[]  ]
# #     print(np.sum(s))
#     if (epoch + 1) % 1 == 0:
#         for (batch, (inp, tar)) in enumerate(val_ds):
#             valid_step(inp, tar)
#         lr_schedule.check(valid_loss.result())
#         print ('Epoch {} ValLoss {:.4f} ValAccuracy {:.4f}'.format(
#           epoch + 1,valid_loss.result() , valid_acc.result()))
        
#         if valid_loss.result()<best_val_loss:
#             best_val_loss = valid_loss.result()
#             ckpt_save_path = ckpt_manager.save()
#             print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
#                                                              ckpt_save_path))
#     print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [60]:
best_val_loss=np.inf

for epoch in range(epochs):
    start = time.time()

    reset_stats()

    for (batch, (inp, tar)) in enumerate(train_ds):
        train_step(inp, tar)
        if (batch+1) %100==0:
            print('{}/{}'.format(batch+1,steps_per_epoch), end="\r")
        
    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, train_loss.result(), train_acc.result()))
#     s=[tf.reduce_sum(w).numpy() for w in classifier.weights  if w.shape!=[]  ]
#     print(np.sum(s))
    if (epoch + 1) % 1 == 0:
        for (batch, (inp, tar)) in enumerate(val_ds):
            valid_step(inp, tar)
        
        print ('Epoch {} ValLoss {:.4f} ValAccuracy {:.4f}'.format(
          epoch + 1,valid_loss.result() , valid_acc.result()))
        
        if valid_loss.result()<best_val_loss:
            best_val_loss = valid_loss.result()
            ckpt_save_path = ckpt_manager.save()
            print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Loss 24.9421 Accuracy 0.7027


KeyboardInterrupt: 

### Inference

In [1]:
import numpy as np
import tensorflow as tf
import os
import shutil

import time 

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

class SentimentPredictor(tf.keras.Model):
    def __init__(self,drop_out_rate=0.1,n_classes=5,
                 tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1',
                 tfhub_handle_encoder    = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'):
        super(SentimentPredictor, self).__init__()
        self.n_classes=n_classes
        self.preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
        self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
        self.dropout_layer = tf.keras.layers.Dropout(drop_out_rate)
        self.out_layer = tf.keras.layers.Dense(n_classes, activation=None, name='classifier')

    def call(self, inp):
        self.encoder_inputs = self.preprocessing_layer(inp)
        self.bert_out = self.encoder(self.encoder_inputs)['pooled_output']
        x =self.dropout_layer(self.bert_out)
        self.final_output = self.out_layer(x)
        return self.final_output


classifier = SentimentPredictor()

optimizer = optimization.create_optimizer(init_lr=0.1,
                                          num_train_steps=10,
                                          num_warmup_steps=1,
                                          optimizer_type='adamw')

In [2]:
classifier

<__main__.SentimentPredictor at 0x7fe17d69fa58>

In [4]:

checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(classifier=classifier,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)


ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe0a0040ac8>

In [5]:

# for (batch, (inp, tar)) in enumerate(val_ds):
#         valid_step(inp, tar)
        
# print ('Epoch {} ValLoss {:.4f} ValAccuracy {:.4f}'.format(
#                           epoch + 1,valid_loss.result() , valid_acc.result()))

classifier(["The result is good"])

<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
array([[-1.1011397 , -1.5700848 ,  0.21170707,  0.35314897,  2.1451359 ]],
      dtype=float32)>

In [18]:
# # np.log(0.95)+np.log(0.1)

# y_true = [1, 2]
# y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
# # Using 'auto'/'sum_over_batch_size' reduction type.
# scce = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
# print(scce(y_true, y_pred).numpy())

# scce2 = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM)
# scce2(y_true, y_pred).numpy()


# valid_acc.reset_states()
# for (batch, (inp, tar)) in enumerate(val_ds):
#     valid_step(inp, tar)
# print ( valid_acc.result())

ckpt_manager.latest_checkpoint

'./checkpoints/train/ckpt-4'

In [None]:
loss_function= lambda real, pred: tf.reduce_sum(loss_object(real, pred))

### This code transforms the json reviews orginial data into a a new smaller file with fewer attributes for simplicity ( in order not to implement them all in  java objectMapper)

In [7]:
import json


def process(txt):
    txt = ''.join(txt.split('\n'))
    return txt.replace("\"","").replace("\'","")

lst_keep=[
        'overall',
        'reviewerID',
        'asin',
        'reviewText',
        'unixReviewTime']

dest_json=open("/home/hani/Desktop/tmp/reviews2.json",'w')
f=open("/home/hani/Desktop/tmp/reviews.json")

l = f.readline()
while l:
    
    res=json.loads(l)
    lst_k=list(res.keys()).copy()
    for k in lst_k:
        if k not in lst_keep:
            del res[k]
    txt= "\"overall\": {0}, \"reviewerID\": \"{1}\", \"asin\": \"{2}\", \"reviewText\": \"{3}\", \"unixReviewTime\": \"{4}\"".format(
                                                                                                                              res["overall"],
                                                                                                                              res["reviewerID"],
                                                                                                                              res["asin"],
                                                                                                                              process(res["reviewText"]),
                                                                                                                              res["unixReviewTime"])
    dest_json.write("{"+txt+"}\n")
    l = f.readline()
f.close()
dest_json.close()