In [None]:
!pip install tensorflow_hub

In [None]:
!pip install bert-tensorflow==1.0.1

In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
!pip install sentencepiece

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
# logging.basicConfig(level=logging.INFO)

In [2]:
import keras

In [3]:
# import bert
# from bert import optimization
# from bert import tokenization
import tokenization

In [3]:
from keras.utils.np_utils import to_categorical

In [3]:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [14]:
train = pd.read_json("github_prediction_data/embold_train.json").reset_index(drop=True)
test = pd.read_json("github_prediction_data/embold_test.json").reset_index(drop=True)

train = train[:1000]
test = test[:50]

In [15]:
test.shape

(50, 2)

In [16]:
train.shape

(1000, 3)

In [17]:
train.columns

Index(['title', 'body', 'label'], dtype='object')

In [18]:
train['Review'] = (train['title'].map(str) +' '+ train['body']).apply(lambda row: row.strip())
test['Review'] = (test['title'].map(str) +' '+ test['body']).apply(lambda row: row.strip())

In [19]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [20]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for index, text in enumerate(texts):
        text = tokenizer.tokenize(text)
        if index == 0:
            print(text)
        text = text[:max_len-2]
        if index == 0:
            print('text truncated:', text)
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [34]:
# https://www.kaggle.com/questions-and-answers/86510
# sequence outout - [batch_size, max_seq_length, 768]
# pooled output - [batch_size, 768] 

'''
the first token of output sequence is from the first of input ，i e. [CLS]. 
the [CLS] is regarded as the represition of the whole input sequence. u can read the original paper to understand it better.
https://stackoverflow.com/questions/63377198/what-is-the-difference-between-pulled-output-and-sequence-output-in-bert-layer
https://stackoverflow.com/questions/60293712/how-is-bert-layer-sequence-output-used
'''

def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(3, activation='softmax')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [46]:
def build_model_pooled_output(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    net = tf.keras.layers.Dense(64, activation='relu')(pooled_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(3, activation='softmax')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [26]:
from keras.utils.np_utils import to_categorical

In [28]:
max_len = 150
train_input = bert_encode(train.Review.values, tokenizer, max_len=max_len)
test_input = bert_encode(test.Review.values, tokenizer, max_len=max_len)
train_labels = to_categorical(train.label.values, num_classes=3)

['y', '-', 'zoom', 'piano', 'roll', 'a', 'y', '-', 'zoom', 'on', 'the', 'piano', 'roll', 'would', 'be', 'useful', '.']
text truncated: ['y', '-', 'zoom', 'piano', 'roll', 'a', 'y', '-', 'zoom', 'on', 'the', 'piano', 'roll', 'would', 'be', 'useful', '.']
['con', '##fi', '##g', 'question', 'path', '-', 'specific', 'environment', 'variables', 'issue', 'description', 'or', 'question', '\\', 'r', '\\', 'r', 'hey', '@', 'arte', '##mg', '##ovo', '##rov', '!', 'thanks', 'for', 'your', 'previous', 'help', 'with', 'the', 'module', 'alias', '##ing', 'in', 'my', 'le', '##rna', 'rep', '##o', '.', 'i', "'", 'm', 'still', 'trying', 'to', 'work', 'out', 'more', 'of', 'the', 'kin', '##ks', '.', '\\', 'r', '\\', 'r', 'is', 'there', 'any', 'way', 'to', 'set', 'up', 'the', 'en', '##v', 'variables', 'before', 'tests', 'in', 'each', 'file', 'are', 'run', '?', 'the', 'setup', 'requires', ':', '\\', 'r', 'tests', 'in', 'packages', '/', 'module', '-', 'a', 'need', 'my', '_', 'en', '##v', 'to', 'be', 'test', '\

In [35]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [36]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('bert_multi_class_classifier_github.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

In [37]:
train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1)

Epoch 1/3

Epoch 00001: val_accuracy improved from -inf to 0.62500, saving model to bert_multi_class_classifier_github.h5
Epoch 2/3

Epoch 00002: val_accuracy improved from 0.62500 to 0.71000, saving model to bert_multi_class_classifier_github.h5
Epoch 3/3

Epoch 00003: val_accuracy did not improve from 0.71000


In [47]:
model_pooled = build_model_pooled_output(bert_layer, max_len=max_len)
model_pooled.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [48]:
train_history = model_pooled.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1)

Epoch 1/3

Epoch 00001: val_accuracy did not improve from 0.71000
Epoch 2/3

Epoch 00002: val_accuracy improved from 0.71000 to 0.75500, saving model to bert_multi_class_classifier_github.h5
Epoch 3/3

Epoch 00003: val_accuracy did not improve from 0.75500


In [49]:
# works better with pooled output

In [42]:
a = np.array([[['123']]])

In [43]:
b = a[:, 0, :]

In [44]:
b

array([['123']], dtype='<U3')

In [45]:
b.shape

(1, 1)

In [4]:
# Language Model

In [5]:
df = pd.read_csv('kohls_data/kohlscatalog-poc-master.csv', index_col=0)

In [6]:
df.columns

Index(['productId', 'title', 'Category(Fashion, Beauty, Home)', 'description',
       'url'],
      dtype='object')

In [7]:
df = df[['title']]

In [8]:
df.columns

Index(['title'], dtype='object')

In [9]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import Model, Input
 
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
    return in_text

# source text
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

In [10]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        yhat_preds = model.predict(encoded, verbose=0)
        yhat = np.argmax(yhat_preds,axis=-1)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
    return in_text

In [9]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [10]:
encoded = tokenizer.texts_to_sequences([data])[0]

In [11]:
encoded

[2,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 2,
 14,
 15,
 1,
 16,
 17,
 18,
 1,
 3,
 19,
 20,
 21]

In [12]:
tokenizer.texts_to_sequences([data])

[[2,
  1,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  2,
  14,
  15,
  1,
  16,
  17,
  18,
  1,
  3,
  19,
  20,
  21]]

In [13]:
sequences = list()
for i in range(2, len(encoded)):
    sequence = encoded[i-2:i+1]
    sequences.append(sequence)

In [14]:
sequences

[[2, 1, 3],
 [1, 3, 4],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 13],
 [12, 13, 2],
 [13, 2, 14],
 [2, 14, 15],
 [14, 15, 1],
 [15, 1, 16],
 [1, 16, 17],
 [16, 17, 18],
 [17, 18, 1],
 [18, 1, 3],
 [1, 3, 19],
 [3, 19, 20],
 [19, 20, 21]]

In [15]:
max_length = max([len(seq) for seq in sequences])
max_length

3

In [16]:
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

In [17]:
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]], dtype=int32)

In [18]:
sequences = array(sequences)
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]], dtype=int32)

In [6]:
# # source text
# data = """ Jack and Jill went up the hill\n
# 		To fetch a pail of water\n
# 		Jack fell down and broke his crown\n
# 		And Jill came tumbling after\n """

# integer encode sequences of words
#https://stackoverflow.com/questions/51956000/what-does-keras-tokenizer-method-exactly-do
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# retrieve vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# encode 2 words -> 1 word
sequences = list()
for i in range(2, len(encoded)):
    sequence = encoded[i-2:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
# define model



word_in = Input(shape=(max_length-1,))
emb_word = Embedding(input_dim=vocab_size, output_dim=10,
                     input_length=max_length-1, mask_zero=True)(word_in)
main_lstm = LSTM(units=50, return_sequences=False,
                               recurrent_dropout=0.6)(emb_word)

outputs = Dense(vocab_size, activation='softmax')(main_lstm)
model = Model(word_in, outputs)

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, batch_size=32, epochs=15, validation_split=0.2, verbose=1)






'''
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, batch_size=32, epochs=500, verbose=2)
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, 'Jack and', 5))
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))
print(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))
print(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))
'''

Vocabulary Size: 22
Total Sequences: 23
Max Sequence Length: 3
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


"\nmodel = Sequential()\nmodel.add(Embedding(vocab_size, 10, input_length=max_length-1))\nmodel.add(LSTM(50))\nmodel.add(Dense(vocab_size, activation='softmax'))\nprint(model.summary())\n# compile network\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n# fit network\nmodel.fit(X, y, batch_size=32, epochs=500, verbose=2)\n# evaluate model\nprint(generate_seq(model, tokenizer, max_length-1, 'Jack and', 5))\nprint(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))\nprint(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))\nprint(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))\n"

In [10]:
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, 'Jack and', 1))
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 1))
print(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))
print(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))

And Jill jack


In [11]:
# Language Model with our data
data = list(df['title'])

# data = data[:200]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
encoded_list = tokenizer.texts_to_sequences(data)

In [12]:
# retrieve vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [13]:
print(vocab_size, len(data))

1892 1000


In [14]:
encoded_list[:5]

[[447, 859, 860, 85, 598, 106, 599, 861, 862, 863],
 [18, 27, 288, 202, 448, 449, 864, 95, 236, 119, 5],
 [1, 865, 866, 96, 107, 867, 868, 97, 355, 5],
 [148, 450, 237, 13],
 [15, 25, 26, 451, 203, 600, 172, 601, 452, 869, 5]]

In [15]:
data[:5]

['Buffalo Games 1000-pc. Vivid Collection Sky Roads Jigsaw Puzzle',
 'Girls Toddler Colosseum Crimson Oklahoma Sooners Scooter Plaid Button-Up Dress',
 "Women's Refried Apparel Navy New England Patriots Maxi Tank Dress",
 "Candie's® Shawl-Collar Blazer",
 "Juniors' Plus Size Lily Rose Lantern-Sleeve French Terry Shift Dress"]

In [16]:
data_reconstructed = tokenizer.sequences_to_texts(encoded_list)

In [17]:
data_reconstructed[:5]

['buffalo games 1000 pc vivid collection sky roads jigsaw puzzle',
 'girls toddler colosseum crimson oklahoma sooners scooter plaid button up dress',
 "women's refried apparel navy new england patriots maxi tank dress",
 "candie's® shawl collar blazer",
 "juniors' plus size lily rose lantern sleeve french terry shift dress"]

In [18]:
n_gram_length = 5
sequences = []
for encoded in encoded_list:
    for i in range(n_gram_length-1, len(encoded)):
        sequence = encoded[i-(n_gram_length-1):i+1]
        sequences.append(sequence)

In [19]:
len(sequences)

3774

In [20]:
sequences[:10]

[[447, 859, 860, 85, 598],
 [859, 860, 85, 598, 106],
 [860, 85, 598, 106, 599],
 [85, 598, 106, 599, 861],
 [598, 106, 599, 861, 862],
 [106, 599, 861, 862, 863],
 [18, 27, 288, 202, 448],
 [27, 288, 202, 448, 449],
 [288, 202, 448, 449, 864],
 [202, 448, 449, 864, 95]]

In [21]:
sequences = pad_sequences(sequences, maxlen=n_gram_length, padding='pre')

In [22]:
sequences[:5]

array([[447, 859, 860,  85, 598],
       [859, 860,  85, 598, 106],
       [860,  85, 598, 106, 599],
       [ 85, 598, 106, 599, 861],
       [598, 106, 599, 861, 862]], dtype=int32)

In [23]:
len(sequences)

3774

In [24]:
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [25]:
X[0]

array([447, 859, 860,  85], dtype=int32)

In [26]:
y[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [27]:
len(y[0])

1892

In [28]:
from keras.layers import Bidirectional

In [43]:
# # define model
# model = Sequential()
# model.add(Embedding(vocab_size, 10, input_length=n_gram_length-1))
# model.add(LSTM(50))
# model.add(Dense(vocab_size, activation='softmax'))
# print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 10)             18920     
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 1892)              96492     
Total params: 127,612
Trainable params: 127,612
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
vocab_size

1892

In [30]:
word_in = Input(shape=(n_gram_length-1,))
emb_word = Embedding(input_dim=vocab_size, output_dim=10,
                     input_length=n_gram_length-1, mask_zero=True)(word_in)
main_lstm = LSTM(units=50, return_sequences=False,
                               recurrent_dropout=0.6)(emb_word)

outputs = Dense(vocab_size, activation='softmax')(main_lstm)
model = Model(word_in, outputs)

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, batch_size=32, epochs=15, validation_split=0.2, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f356838d8e0>

In [31]:
model.fit(X, y, batch_size=32, epochs=250, validation_split=0.2, verbose=1)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x7f355c5f4ca0>

In [32]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        yhat_preds = model.predict(encoded, verbose=0)
        yhat = np.argmax(yhat_preds,axis=-1)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
    return in_text

In [33]:
# Add code to evaluate stuffs

In [34]:
#Lets try language model with BERT

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 72.9 MB/s eta 0:00:01
Collecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 107.9 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 110.3 MB/s eta 0:00:01
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: filelock, tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed filelock-3.0.12 huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.9.1
You should consider upgrading via the '/home/ubuntu/env

In [13]:
!pip install ipywidgets==7.4.2

Collecting ipywidgets==7.4.2
  Downloading ipywidgets-7.4.2-py2.py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 22.1 MB/s eta 0:00:01
Collecting widgetsnbextension~=3.4.0
  Downloading widgetsnbextension-3.4.2-py2.py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 62.6 MB/s eta 0:00:01
Installing collected packages: widgetsnbextension, ipywidgets
  Attempting uninstall: widgetsnbextension
    Found existing installation: widgetsnbextension 3.5.1
    Uninstalling widgetsnbextension-3.5.1:
      Successfully uninstalled widgetsnbextension-3.5.1
  Attempting uninstall: ipywidgets
    Found existing installation: ipywidgets 7.6.3
    Uninstalling ipywidgets-7.6.3:
      Successfully uninstalled ipywidgets-7.6.3
Successfully installed ipywidgets-7.4.2 widgetsnbextension-3.4.2
You should consider upgrading via the '/home/ubuntu/env/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
import keras
import tokenization

In [2]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import Model, Input

In [3]:
from transformers import BertTokenizer, BertForMaskedLM

In [4]:
from transformers import TFBertForMaskedLM

In [5]:
PRETRAINED_MODEL = 'bert-base-uncased'

In [6]:
model = TFBertForMaskedLM.from_pretrained(PRETRAINED_MODEL)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [7]:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [9]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def get_all_predictions(text_sentence, top_clean=5):
    # ========================= BERT =================================
    input_ids, mask_idx = encode(bert_tokenizer, text_sentence)
    with torch.no_grad():
        predict = bert_model(input_ids)[0]
    bert = decode(bert_tokenizer, predict[0, mask_idx, :].topk(top_k).indices.tolist(), top_clean)
    return {'bert': bert}

In [None]:
def get_prediction_eos(input_text):
    try:
        input_text += ' <mask>'
        res = get_all_predictions(input_text, top_clean=int(top_k))
        return res
    except Exception as error:
        pass

In [11]:
 inputs = bert_tokenizer("The capital of France is [MASK].", return_tensors="tf")

In [12]:
inputs

{'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [17]:
tokenized_texts = [bert_tokenizer.tokenize(sent) for sent in "The capital of France is [MASK].".split()]

In [18]:
tokenized_texts

[['the'], ['capital'], ['of'], ['france'], ['is'], ['[MASK]', '.']]

In [19]:
tokenized_texts = bert_tokenizer("The capital of France is [MASK].")

In [20]:
tokenized_texts

{'input_ids': [101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108891648 
_________________________________________________________________
mlm___cls (TFBertMLMHead)    multiple                  24459834  
Total params: 109,514,298
Trainable params: 109,514,298
Non-trainable params: 0
_________________________________________________________________


In [24]:
outputs = model.predict([tokenized_texts['input_ids'], tokenized_texts['attention_mask'], tokenized_texts['token_type_ids']], verbose=1)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method








Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




In [25]:
outputs

TFMaskedLMOutput(loss=None, logits=array([[[ -6.4346046,  -6.4063444,  -6.4097404, ...,  -5.7691364,
          -5.6326175,  -3.788285 ],
        [-14.011925 , -14.724042 , -14.211972 , ..., -11.697638 ,
         -10.730408 , -12.761747 ],
        [ -9.656142 , -10.312491 ,  -9.745864 , ...,  -8.77816  ,
          -6.603594 , -12.659599 ],
        ...,
        [ -3.7861156,  -3.857192 ,  -3.5644355, ...,  -2.5592554,
          -3.109321 ,  -4.3819613],
        [-11.659789 , -11.427393 , -11.926661 , ...,  -9.877244 ,
         -10.210293 ,  -4.7594104],
        [-11.72665  , -11.750851 , -11.803964 , ..., -10.594329 ,
         -10.940653 ,  -7.5151176]],

       [[ -4.9389725,  -5.101129 ,  -5.156227 , ...,  -4.9596376,
          -6.8469996,   0.8288686],
        [ -4.7209363,  -4.92056  ,  -4.9808407, ...,  -4.663267 ,
          -6.56871  ,   0.7187822],
        [ -4.357745 ,  -4.537199 ,  -4.620675 , ...,  -4.39608  ,
          -6.212288 ,   0.9897884],
        ...,
        [ -3.999114

In [27]:
logits = outputs.logits

In [28]:
logits

array([[[ -6.4346046,  -6.4063444,  -6.4097404, ...,  -5.7691364,
          -5.6326175,  -3.788285 ],
        [-14.011925 , -14.724042 , -14.211972 , ..., -11.697638 ,
         -10.730408 , -12.761747 ],
        [ -9.656142 , -10.312491 ,  -9.745864 , ...,  -8.77816  ,
          -6.603594 , -12.659599 ],
        ...,
        [ -3.7861156,  -3.857192 ,  -3.5644355, ...,  -2.5592554,
          -3.109321 ,  -4.3819613],
        [-11.659789 , -11.427393 , -11.926661 , ...,  -9.877244 ,
         -10.210293 ,  -4.7594104],
        [-11.72665  , -11.750851 , -11.803964 , ..., -10.594329 ,
         -10.940653 ,  -7.5151176]],

       [[ -4.9389725,  -5.101129 ,  -5.156227 , ...,  -4.9596376,
          -6.8469996,   0.8288686],
        [ -4.7209363,  -4.92056  ,  -4.9808407, ...,  -4.663267 ,
          -6.56871  ,   0.7187822],
        [ -4.357745 ,  -4.537199 ,  -4.620675 , ...,  -4.39608  ,
          -6.212288 ,   0.9897884],
        ...,
        [ -3.999114 ,  -4.148432 ,  -4.211567 , ...,  

In [29]:
logits.shape

(3, 9, 30522)

In [30]:
outputs[0]

array([[[ -6.4346046,  -6.4063444,  -6.4097404, ...,  -5.7691364,
          -5.6326175,  -3.788285 ],
        [-14.011925 , -14.724042 , -14.211972 , ..., -11.697638 ,
         -10.730408 , -12.761747 ],
        [ -9.656142 , -10.312491 ,  -9.745864 , ...,  -8.77816  ,
          -6.603594 , -12.659599 ],
        ...,
        [ -3.7861156,  -3.857192 ,  -3.5644355, ...,  -2.5592554,
          -3.109321 ,  -4.3819613],
        [-11.659789 , -11.427393 , -11.926661 , ...,  -9.877244 ,
         -10.210293 ,  -4.7594104],
        [-11.72665  , -11.750851 , -11.803964 , ..., -10.594329 ,
         -10.940653 ,  -7.5151176]],

       [[ -4.9389725,  -5.101129 ,  -5.156227 , ...,  -4.9596376,
          -6.8469996,   0.8288686],
        [ -4.7209363,  -4.92056  ,  -4.9808407, ...,  -4.663267 ,
          -6.56871  ,   0.7187822],
        [ -4.357745 ,  -4.537199 ,  -4.620675 , ...,  -4.39608  ,
          -6.212288 ,   0.9897884],
        ...,
        [ -3.999114 ,  -4.148432 ,  -4.211567 , ...,  

In [31]:
outputs[0].shape


(3, 9, 30522)

In [32]:
bert_tokenizer.mask_token

'[MASK]'

In [33]:
bert_tokenizer('[MASK]')

{'input_ids': [101, 103, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [34]:
mask_idx = 6

In [37]:
prediction = logits[0, mask_idx, :]

In [38]:
prediction

array([-3.7861156, -3.857192 , -3.5644355, ..., -2.5592554, -3.109321 ,
       -4.3819613], dtype=float32)

In [39]:
prediction.shape

(30522,)

In [40]:
vocab_index = np.argmax(prediction)

In [41]:
vocab_index

3000

In [44]:
bert_tokenizer.decode([vocab_index])

'paris'

In [45]:
'''
>>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
>>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]

>>> outputs = model(inputs)
>>> loss = outputs.loss
>>> logits = outputs.logits
'''

'\ninputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")\ninputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]\n\noutputs = model(inputs)\nloss = outputs.loss\nlogits = outputs.logits\n'

In [46]:
import pandas as pd

In [48]:
df = pd.read_csv('ALL.csv')

In [57]:
data_list = df[:5]

In [58]:
data_list

Unnamed: 0,request_id,question_id,index,created_at,feedback,metadata,searchimageurl,mathpix_response,grade,feedbackimageurl,...,infographics_value,infographics_confidence,topic_value,topic_confidence,regr_data_type,section_v2_value,section_v2_confidence,infographics_v2_value,infographics_v2_confidence,rn
0,00004d3f-c0e7-406e-adbd-4629e318c1a1,M3190541E001,0.0,2021-01-20 14:01:28.17445,exact,"{""tags"": [], ""searchId"": """", ""searchImageUrl"":...",https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...,"{""request_id"": ""cb71c470a45db5eb9d419e079274f8...",8.0,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,...,no_infographics,1.0,ruang sampel,0.6,positive,probability,0.94,no_infographics,1.0,
1,00004d3f-c0e7-406e-adbd-4629e318c1a1,M0251861P002,3.0,2020-10-16 14:01:36.579959,exact,"{""tags"": [], ""searchId"": """", ""searchImageUrl"":...",https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...,"{""request_id"": ""cb71c470a45db5eb9d419e079274f8...",8.0,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,...,no_infographics,1.0,ruang sampel,0.6,positive,probability,0.94,no_infographics,1.0,
2,00004d3f-c0e7-406e-adbd-4629e318c1a1,M1350532E019,4.0,2020-08-22 02:01:45.250414,exact,"{""tags"": [], ""searchId"": """", ""searchImageUrl"":...",https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...,"{""request_id"": ""cb71c470a45db5eb9d419e079274f8...",8.0,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,...,no_infographics,1.0,ruang sampel,0.6,positive,probability,0.94,no_infographics,1.0,
3,00004d3f-c0e7-406e-adbd-4629e318c1a1,M1591111P001,7.0,2021-05-20 14:01:51.214624,exact,"{""tags"": [], ""searchId"": """", ""searchImageUrl"":...",https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...,"{""request_id"": ""cb71c470a45db5eb9d419e079274f8...",8.0,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,...,no_infographics,1.0,ruang sampel,0.6,positive,probability,0.94,no_infographics,1.0,
4,000050b6-21d3-41d8-b51c-ba712add7962,M2502031E002,0.0,2020-11-28 02:01:20.181711,exact,"{""tags"": [], ""searchId"": """", ""searchImageUrl"":...",https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...,"{""request_id"": ""c70363000ab00da3b7031de502eb17...",8.0,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,...,cube/cuboid,0.82,,,positive,geometry,0.6,no_infographics,0.68,


In [61]:
data_list.loc[0,'metadata']

'{"tags": [], "searchId": "", "searchImageUrl": "https://coln-prd-sg-s3-ads-pub.s3.ap-southeast-1.amazonaws.com/images/search-questions/b0de1941-1c97-481b-80ec-6c0aa56f3463", "feedbackImageUrl": "", "comment": "", "image_type": ["More than one question in one image"], "picture_taken": "Screenshot", "subject": ["maths"], "section": ["STATISTIKA"], "chapter": ["PELUANG"], "topic": ["Ruang Sampel"], "flagged": false, "sample_type": "D0_"}___1,{"tags": [], "searchId": "", "searchImageUrl": "https://coln-prd-sg-s3-ads-pub.s3.ap-southeast-1.amazonaws.com/images/search-questions/b0de1941-1c97-481b-80ec-6c0aa56f3463", "feedbackImageUrl": "", "comment": "", "image_type": ["More than one question in one image"], "picture_taken": "Screenshot", "subject": ["maths"], "section": ["STATISTIKA"], "chapter": ["PELUANG"], "topic": ["Ruang Sampel"], "flagged": false, "sample_type": "D0_"}___2'

In [62]:
data_list.loc[0,:]

request_id                                 00004d3f-c0e7-406e-adbd-4629e318c1a1
question_id                                                        M3190541E001
index                                                                       0.0
created_at                                            2021-01-20 14:01:28.17445
feedback                                                                  exact
metadata                      {"tags": [], "searchId": "", "searchImageUrl":...
searchimageurl                https://coln-prd-sg-s3-ads-pub.s3.ap-southeast...
mathpix_response              {"request_id": "cb71c470a45db5eb9d419e079274f8...
grade                                                                       8.0
feedbackimageurl              https://msd-iq.s3-ap-southeast-1.amazonaws.com...
feedback_image_text           Sebuah dadu dan sebuah uang logam dilemparkan ...
created_date                                                2021-05-27 22:49:32
subject_value                           

In [63]:
data_list.loc[0,'mathpix_response']

'{"request_id": "cb71c470a45db5eb9d419e079274f893", "detected_alphabets": {"hi": false, "zh": false, "ja": false, "ko": false, "en": true, "ru": false, "th": false}, "is_printed": true, "is_handwritten": false, "auto_rotate_confidence": 0.00020815567736676144, "auto_rotate_degrees": 0, "confidence": 0.2724175613547004, "confidence_rate": 0.7061789593565754, "text": "2\\n)\\nSebuah mata uang di lempar, maka himpunan ruang sampelnya adalah....\\nA. \\\\{angka\\\\}\\nC. \\\\{gambar, angka\\\\}\\nB. \\\\{gambar\\\\}\\nD. 11\\nDua buah dadu logam di lempar bersama-sama. Banyaknya titik sampel adalah....\\nA. 36\\nB. 12\\nC. 8\\nD. 6", "html": "<div>2<br>\\n)<br>\\nSebuah mata uang di lempar, maka himpunan ruang sampelnya adalah....<br>\\nA. {angka}<br>\\nC. {gambar, angka}<br>\\nB. {gambar}<br>\\nD. 11<br>\\nDua buah dadu logam di lempar bersama-sama. Banyaknya titik sampel adalah....<br>\\nA. 36<br>\\nB. 12<br>\\nC. 8<br>\\nD. 6</div>\\n", "data": []}'

In [71]:
df_index_data = pd.read_csv('indexData.csv', sep=',', engine='python', error_bad_lines=False)

Skipping line 179052: unexpected end of data


In [72]:
df_index_data.columns

Index(['question_id', 's3_path', 'mathpix_response', 'created_at', 'extras',
       'subject', 'rn', 'image_url', 'question_text'],
      dtype='object')

In [73]:
df_index_data.head()

Unnamed: 0,question_id,s3_path,mathpix_response,created_at,extras,subject,rn,image_url,question_text
0,A0010103P003,s3://msd-iq/client_images/2020-11-07 02:01:15....,"{""request_id"": ""c16f527231a2bfa1e016bff8ec6d7f...",2020-11-07 02:01:15.947613,"{""source"": {""websiteDetails"": {""grade"": null, ...","[""maths""]",1,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,Diketahui \\( \\mathrm{X}=\\{\\mathrm{x} \\mid...
1,A0010103P004,s3://msd-iq/client_images/2021-05-20 14:01:51....,"{""request_id"": ""cbdd9656a6aa0b29cef0fb3e303ab9...",2021-05-20 14:01:51.214624,"{""description"": """", ""tags"": {""primaryTopic"": {...","[""maths""]",1,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,"Jika \\( \\mathrm{n}(\\mathrm{A})=10, \\mathrm..."
2,A0010109P001,s3://msd-iq/client_images/2021-05-20 14:01:51....,"{""request_id"": ""ae6403766208ebf0234b5b7c693e42...",2021-05-20 14:01:51.214624,"{""description"": """", ""tags"": {""primaryTopic"": {...","[""maths""]",1,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,Perhatikan persamaan-persamaan berikut !\n(i) ...
3,A0010109P002,s3://msd-iq/client_images/2021-05-20 14:01:51....,"{""request_id"": ""e05e1b73bfc8c070d099d9becb65c5...",2021-05-20 14:01:51.214624,"{""description"": """", ""tags"": {""primaryTopic"": {...","[""maths""]",1,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,Perhatikan persamaan-persamaan berikut !\n(i) ...
4,A0010109P003,s3://msd-iq/client_images/2020-11-06 02:01:18....,"{""request_id"": ""03ed91e0ba125482fec03cfd21d029...",2020-11-06 02:01:18.015423,"{""source"": {""websiteDetails"": {""grade"": null, ...","[""maths""]",1,https://msd-iq.s3-ap-southeast-1.amazonaws.com...,Rina membeli 3 kg apel dan 2 kg jeruk. Uang ya...


In [74]:
len(df_index_data)

179050

In [75]:
df_index_data.loc[0,'extras']

'{"source": {"websiteDetails": {"grade": null, "curriculum": null, "publication": "www.juraganles.com", "title": "Matematika Juragan Les"}, "type": "Website"}, "description": "", "timestamp": 1616997407079, "tags": {"chapter": ["HIMPUNAN"], "questionType": ["Uncategorized"], "section": ["ALJABAR"], "difficultyLevel": "D1", "imageInQuestion": false, "primaryTopic": {"grade": "7", "topicId": "07AN20207", "semester": "1", "streamType": "SMP"}, "topic": ["Operasi Himpunan"], "bloomsTaxonomy": ["C3 Aplikasi"], "subject": ["Maths"]}}'

In [76]:
from transformers import BertTokenizer, TFBertModel

model_name='cahya/bert-base-indonesian-522M'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)
text = "Silakan diganti dengan text apa saja."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545M [00:00<?, ?B/s]

Some layers from the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [77]:
output

TFBaseModelOutputWithPooling(last_hidden_state=<tf.Tensor: shape=(1, 10, 768), dtype=float32, numpy=
array([[[-0.07926931, -0.41667727,  0.01540737, ...,  0.322052  ,
         -0.47354588, -0.7253795 ],
        [ 0.0719403 , -0.4571825 ,  0.8287917 , ...,  0.29290366,
         -1.1612344 ,  0.3168729 ],
        [-0.15491223, -0.94698083,  0.3294129 , ...,  1.2914243 ,
         -1.4274261 ,  0.27289003],
        ...,
        [-2.4184837 ,  0.13898858,  0.6879961 , ...,  2.126189  ,
          0.05418604,  0.00736356],
        [ 0.04243735,  0.31144488,  0.42113376, ...,  0.51701397,
          0.47521394, -0.764553  ],
        [-0.26480502,  0.497233  , -0.89467394, ...,  0.9380146 ,
         -0.48444355, -1.0249774 ]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.19222063, -0.5219983 , -0.8855573 , -0.13142143, -0.26546386,
         0.56655884,  0.13956298, -0.3470618 ,  0.4399549 ,  0.03775845,
        -0.05163716,  0.28938904,  0.666814  

In [78]:
model.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  110617344 
Total params: 110,617,344
Trainable params: 110,617,344
Non-trainable params: 0
_________________________________________________________________


In [85]:
output['pooler_output'].shape

TensorShape([1, 768])

In [87]:
pooled_output = output['pooler_output']

In [89]:
type(pooled_output)

tensorflow.python.framework.ops.EagerTensor

In [91]:
pooled_output_numpy_array = pooled_output.numpy()

In [92]:
type(pooled_output_numpy_array)

numpy.ndarray

In [93]:
pooled_output_numpy_array.shape

(1, 768)

In [121]:
text_1 = "ibu ku sedang bekerja di supermarket"
encoded_input_1 = tokenizer(text_1, return_tensors='tf')
output_1 = model(encoded_input_1)

text_2 = "ibu ku sedang bekerja sebagai supermarket"
encoded_input_2 = tokenizer(text_2, return_tensors='tf')
output_2 = model(encoded_input_2)
                 
text_3 = "ibu ku sedang bekerja dengan supermarket"
encoded_input_3 = tokenizer(text_3, return_tensors='tf')
output_3 = model(encoded_input_3)

text_4 = "Silakan diganti dengan text apa saja."
encoded_input_4 = tokenizer(text_4, return_tensors='tf')
output_4 = model(encoded_input_4)

In [122]:
pooled_output_numpy_array_1 = output_1['pooler_output'].numpy()
pooled_output_numpy_array_2 = output_2['pooler_output'].numpy()
pooled_output_numpy_array_3 = output_3['pooler_output'].numpy()
pooled_output_numpy_array_4 = output_4['pooler_output'].numpy()

In [123]:
pooled_output_numpy_array.shape

(1, 768)

In [124]:
target_numpy_array = np.array((pooled_output_numpy_array_2, pooled_output_numpy_array_3))

In [125]:
target_numpy_array.shape

(2, 1, 768)

In [126]:
reshaped_array = target_numpy_array.reshape(2, -1)

In [127]:
reshaped_array.shape

(2, 768)

In [128]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(
    pooled_output_numpy_array_1,
    pooled_output_numpy_array_2
)

array([[0.99274313]], dtype=float32)

In [129]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(
    pooled_output_numpy_array_1,
    pooled_output_numpy_array_3
)

array([[0.99489963]], dtype=float32)

In [130]:
cosine_similarity(
    pooled_output_numpy_array_1,
    pooled_output_numpy_array_4
)

array([[0.8794708]], dtype=float32)

In [132]:
# pooled_output_numpy_array_1

In [97]:
source_array = np.array([1,2,3])
target_arrays = np.array([[1,2,3], [4,5,6], [7,8,9]])
dotted_product = np.dot(source_array, target_arrays)

In [98]:
dotted_product

array([30, 36, 42])

In [99]:
# (1+8+21, 2+10+24, 3+12+27) - (30, 36, 42)

In [100]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)