In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [None]:
# Mount Google Drive so that files under /content/drive can be read and written.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Column labels applied to the CSV; header=None means the file's first row
# is treated as data rather than as a header.
cols = ["sentiment", "title", "text"]

# Load the Play Store app descriptions from Drive.
data = pd.read_csv("/content/drive/MyDrive/DATA/Playstore_app.csv",
                   names=cols,
                   header=None,
                   encoding="latin1",
                   engine="python")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,sentiment,title,text
0,1,Smart News,Read news on the go
1,1,NewsBreak,Reconnect with your neighborhood! Daily news a...
2,1,CNN,Stay informed on breaking news with the global...
3,1,Fox News,Read latest breaking news! watch current event...
4,1,Google News,Stay informed with latest world and local news...


In [None]:
# Remove the "title" column; later cells only read `data.text` and
# `data.sentiment`.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after "title" is already gone no
# longer raises KeyError.
data = data.drop(columns=["title"], errors="ignore")

In [None]:
# Preview the frame again after dropping the title column.
data.head()

Unnamed: 0,sentiment,text
0,1,Read news on the go
1,1,Reconnect with your neighborhood! Daily news a...
2,1,Stay informed on breaking news with the global...
3,1,Read latest breaking news! watch current event...
4,1,Stay informed with latest world and local news...


In [None]:
def clean_tweet(tweet):
    """Strip markup and noise from a piece of text.

    The text is first passed through BeautifulSoup to drop any HTML, then a
    sequence of regex substitutions replaces, in order: @mentions, http(s)
    URLs, every character that is not a letter or one of . ! ? ', and finally
    runs of spaces -- each with a single space.

    Args:
        tweet: raw text to clean.

    Returns:
        The cleaned string.
    """
    text = BeautifulSoup(tweet, "lxml").get_text()

    # Order matters: mentions and URLs must be removed before the generic
    # character filter breaks them apart, and spaces are collapsed last.
    patterns = (
        r"@[A-Za-z0-9]+",
        r"https?://[A-Za-z0-9./]+",
        r"[^a-zA-Z.!?']",
        r" +",
    )
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)
    return text

In [None]:
# Inspect the raw description column before cleaning.
data.text

0                                   Read news on the go
1     Reconnect with your neighborhood! Daily news a...
2     Stay informed on breaking news with the global...
3     Read latest breaking news! watch current event...
4     Stay informed with latest world and local news...
5     Stay informed. Get all your global news with A...
6     Get breaking news alerts,headlines and live st...
7     World news, sport, business and opinion. Read ...
8     Official BBC News app for international audiences
9     Watch the CBS News 24/7 live stream for breaki...
10    Relax your brain with this fun word puzzle gam...
11                     Solitaire Games- Card Games 2022
12    Roblox lets you create,share experiences and b...
13    The sweetest puzzle game! Switch,match and bla...
14     Welcome to the funniest cartoon puzzle adventure
15    The ultimate matching puzzle game with unique ...
16        Logic puzzle and Brain Games to Test your IQ!
17                 A beautiful and relaxing puzz

In [None]:
# Apply clean_tweet to every description, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Labels as a NumPy array, aligned row-for-row with data_clean.
data_labels = data["sentiment"].values

Tokenization

In [None]:
# Tokenizer class taken from the `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# BERT layer loaded from TF Hub with its weights frozen (trainable=False).
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# The vocabulary file path and the lower-casing flag are read from the loaded
# hub module so the tokenizer is built from the same assets as the layer.
vocab_file =   bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)                  

In [None]:
def encode_sentence(sent):
    """Tokenize `sent` and wrap the result in "[CLS]" ... "[SEP]" markers.

    Args:
        sent: text to tokenize with the module-level `tokenizer`.

    Returns:
        A new list of token strings starting with "[CLS]" and ending with
        "[SEP]".
    """
    wrapped = ["[CLS]"]
    wrapped.extend(tokenizer.tokenize(sent))
    wrapped.append("[SEP]")
    return wrapped

In [None]:
# Token lists (with [CLS]/[SEP]) for every cleaned description, in order.
data_inputs = list(map(encode_sentence, data_clean))

DATASET CREATION

In [None]:
def get_ids(tokens):
    """Map token strings to vocabulary ids using the module-level tokenizer.

    Args:
        tokens: sequence of token strings.

    Returns:
        Whatever `tokenizer.convert_tokens_to_ids` returns for `tokens`.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def get_mask(tokens):
    """Build an attention mask: 1 for every token except "[PAD]", which gets 0.

    Args:
        tokens: sequence of token strings.

    Returns:
        Integer NumPy array with one entry per token.
    """
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

def get_segments(tokens):
    """Assign a segment id (0 or 1) to each token.

    The id starts at 0 and flips after every "[SEP]" token; the "[SEP]" itself
    keeps the id of the segment it closes.

    Args:
        tokens: sequence of token strings.

    Returns:
        List of ints, one per token.
    """
    segment_ids = []
    segment = 0
    for token in tokens:
        segment_ids.append(segment)
        if token == "[SEP]":
            # Toggle between 0 and 1 for the tokens that follow.
            segment ^= 1
    return segment_ids


In [None]:
# Pair every token list with its label and its length.
data_with_len = [[tokens, label, len(tokens)]
                 for tokens, label in zip(data_inputs, data_labels)]

# Shuffle first, then sort by length: the sort is stable, so examples of equal
# length end up in random order while similar lengths stay adjacent.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda entry: entry[2])

# Keep only sequences longer than 7 tokens ([CLS] and [SEP] included) and turn
# each into ([ids, mask, segments], label).
sorted_all = [
    ([get_ids(tokens), get_mask(tokens), get_segments(tokens)], label)
    for tokens, label, length in data_with_len
    if length > 7
]

In [None]:
# Wrap the (inputs, label) pairs in a tf.data pipeline; a generator is used so
# that examples of different sequence lengths can be yielded one by one.
# Both the stacked inputs and the label are declared as int32.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
BATCH_SIZE = 32
# Batch the examples, padding the inputs to shape (3, longest sequence in the
# batch); labels are scalars and need no padding.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((3, None), ()))

In [None]:
# Number of batches overall, and how many of them (one tenth, rounded down)
# are held out for testing.
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10

# tf.data transformations return a new dataset instead of modifying the
# receiver, so the shuffled dataset must be captured. The examples were sorted
# by length, so without this shuffle the test split would always consist of
# the shortest sentences.
# reshuffle_each_iteration=False freezes the batch order so that take/skip
# select the same disjoint batches on every epoch (no train/test leakage).
all_shuffled = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

MODEL BUILDING


In [None]:
# Sanity check: run one sentence through the BERT layer and display its output.
my_sent = encode_sentence("Roses are red.")
# The layer takes [ids, mask, segments], each as an int32 tensor with a
# leading batch axis of size 1.
bert_layer([tf.expand_dims(tf.cast(featurize(my_sent), tf.int32), 0)
            for featurize in (get_ids, get_mask, get_segments)])

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.27935660e-01, -4.10335541e-01, -9.65755165e-01,
          9.07317996e-01,  8.12914073e-01, -1.74174517e-01,
          9.11234498e-01,  3.41952384e-01, -8.74521434e-01,
         -9.99989271e-01, -7.78410196e-01,  9.69385266e-01,
          9.86160576e-01,  6.36963367e-01,  9.48631287e-01,
         -7.51193345e-01, -4.58339781e-01, -7.08104551e-01,
          4.62098479e-01, -6.57927036e-01,  7.60414541e-01,
          9.99994755e-01, -3.96861315e-01,  3.44166130e-01,
          6.16488814e-01,  9.94400024e-01, -7.76633859e-01,
          9.38316524e-01,  9.59452212e-01,  7.32879341e-01,
         -6.93436682e-01,  2.93080747e-01, -9.93785441e-01,
         -1.64551854e-01, -9.67019618e-01, -9.95549619e-01,
          5.32935679e-01, -6.88060939e-01,  1.34714423e-02,
          2.98193675e-02, -9.18356538e-01,  4.20526356e-01,
          9.99988973e-01,  2.52676398e-01,  6.06235802e-01,
         -3.50750148e-01, -1.00000000e+00,  4.975

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
    """Convolutional text classifier fed by frozen BERT token embeddings.

    Three parallel 1-D convolutions (kernel sizes 2, 3 and 4) run over the
    per-token output of a non-trainable BERT layer. Each convolution is
    max-pooled over the sequence axis; the pooled vectors are concatenated and
    passed through a dense layer, dropout and a final classification layer.

    Args:
        nb_filters: number of filters in each convolution.
        FFN_units: number of units in the hidden dense layer.
        nb_classes: 2 yields a single sigmoid unit; any other value yields
            `nb_classes` softmax units.
        dropout_rate: dropout rate applied after the hidden dense layer.
        name: Keras model name.
    """

    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super().__init__(name=name)

        # Pre-trained BERT used purely as a feature extractor (weights frozen).
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        # One convolution per n-gram width; "valid" padding shortens the
        # sequence axis by kernel_size - 1.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: one probability from a sigmoid unit.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def embed_with_bert(self, all_tokens):
        """Return BERT's per-token embeddings for a batch of stacked inputs.

        Args:
            all_tokens: rank-3 integer tensor indexed as
                (batch, feature, position), where feature 0 holds token ids,
                1 the input mask and 2 the segment ids.

        Returns:
            The second output of the BERT layer; the first output is
            discarded.
        """
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: stacked ids/mask/segments tensor, see `embed_with_bert`.
            training: forwarded to the dropout layer. Optional, so the model
                can also be called without an explicit flag.

        Returns:
            Tensor of class probabilities: (batch_size, 1) in the binary case,
            (batch_size, nb_classes) otherwise.
        """
        x = self.embed_with_bert(inputs)  # (batch_size, seq_len, emb_dim)

        # Conv1D keeps channels last: (batch_size, steps, nb_filters).
        x_1 = self.bigram(x)    # (batch_size, seq_len-1, nb_filters)
        x_1 = self.pool(x_1)    # (batch_size, nb_filters)
        x_2 = self.trigram(x)   # (batch_size, seq_len-2, nb_filters)
        x_2 = self.pool(x_2)    # (batch_size, nb_filters)
        x_3 = self.fourgram(x)  # (batch_size, seq_len-3, nb_filters)
        x_3 = self.pool(x_3)    # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

TRAINING

In [None]:
# Model hyperparameters.
NB_FILTERS = 100    # filters per convolution
FFN_UNITS = 256     # units in the hidden dense layer
NB_CLASSES = 2      # 2 selects the sigmoid / binary cross-entropy setup

DROPOUT_RATE = 0.2

# NOTE: BATCH_SIZE was already set to the same value when the dataset was
# batched; re-assigning it here does not re-batch the data.
BATCH_SIZE = 32
NB_EPOCHS = 5       # number of passes over the training batches

In [None]:
# Build the classifier with the hyperparameters defined above.
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [None]:
# Binary setup (single sigmoid unit) uses binary cross-entropy and plain
# accuracy; any other class count uses the sparse categorical variants.
Dcnn.compile(
    optimizer="adam",
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    metrics=["accuracy" if NB_CLASSES == 2
             else "sparse_categorical_accuracy"],
)

In [None]:
# Directory on Drive where training checkpoints are stored.
checkpoint_path = "/content/drive/MyDrive/BERT_PROJECTS/trainingandtestdata/ckpt_bert_embedding/"

# Track the model under the "Dcnn" key so it can be saved and restored.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Manage checkpoints in checkpoint_path, keeping only the most recent one.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists; otherwise nothing is restored.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored!")

Latest Checkpoint restored!


In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint at the end of every epoch.

    Relies on the module-level `ckpt_manager` and `checkpoint_path`.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Save through the checkpoint manager and report where it was written."""
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [None]:
# Train on the training batches, checkpointing after each epoch.
Dcnn.fit(
    train_dataset,
    callbacks=[MyCustomCallback()],
    epochs=NB_EPOCHS,
)


Epoch 1/5
      1/Unknown - 16s 16s/step - loss: 1.3453e-05 - accuracy: 1.0000Checkpoint saved at /content/drive/MyDrive/BERT_PROJECTS/trainingandtestdata/ckpt_bert_embedding/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f25bd2bead0>

EVALUATION

In [None]:
# Score the model on the held-out batches, not on the data it was fitted on:
# metrics measured on the training set say nothing about generalization.
# NB_BATCHES_TEST is 0 when there are fewer than 10 batches in total, which
# leaves test_dataset empty; in that case fall back to the training data and
# say so explicitly, so the number is not mistaken for a test score.
if NB_BATCHES_TEST > 0:
    results = Dcnn.evaluate(test_dataset)
else:
    print("No held-out batches available; evaluating on the training data.")
    results = Dcnn.evaluate(train_dataset)
print(results)

[4.069913757120958e-06, 1.0]


In [None]:
def get_prediction(sentence):
    """Classify one description with the trained model and print the verdict.

    The sentence is tokenized, converted to the stacked ids/mask/segments
    tensor with a leading batch axis of size 1, and passed through `Dcnn` in
    inference mode. Assumes the binary setup, where the model emits a single
    probability.

    Args:
        sentence: text to classify.

    Returns:
        None. The raw model output and the predicted class are printed.
    """
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
        axis=0)
    inputs = tf.expand_dims(inputs, 0)

    output = Dcnn(inputs, training=False)

    # Threshold the probability explicitly. floor(output * 2) evaluates to 2
    # when the sigmoid saturates to exactly 1.0, which matched neither branch
    # and printed nothing.
    probability = float(output[0][0])
    sentiment = 1 if probability >= 0.5 else 0

    if sentiment == 0:
        print("Output of the model: {}\nPredicted app.: Not a news app".format(
            output))
    else:
        print("Output of the model: {}\nPredicted app: A news app".format(
            output))

In [None]:
# Try the classifier on a sample description.
get_prediction("Read news on the go.")


Output of the model: [[0.99754965]]
Predicted app: A news app
