In [13]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForMaskedLM

from tqdm import tqdm

import tensorflow as tf
import os
print(f"Tensorflow version: {tf.__version__}")

Tensorflow version: 2.6.0


In [14]:
# Restrict TensorFlow to only allocate 4GBs of memory on the first GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
    #tf.config.experimental.set_memory_growth(gpus[0], True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(f"The system contains '{len(gpus)}' Physical GPUs and '{len(logical_gpus)}' Logical GPUs")
  except RuntimeError as e:
    print(e)
else:
    print(f"Your system does not contain a GPU that could be used by Tensorflow!")

The system contains '1' Physical GPUs and '1' Logical GPUs


In [54]:
data = pd.read_json('./data/data.json')

data = data.filter(['title', 'total_comments'])
data.head()

Unnamed: 0,title,total_comments
0,"WHO na Kitajsko pošilja strokovnjake, ki bodo ...",15
1,Podgoršek: Prehranska varnost v Sloveniji tren...,44
2,"""Moramo se rešiti"": na desettisoče ljudi prote...",11
3,Andrijanič za mobilno aplikacijo z vsemi stori...,131
4,Ursula von der Leyen zaradi madžarskega zakona...,172


In [57]:
X = (np.array(data['title']))
y = (np.array(data['total_comments']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
print("Train dataset shape: {0}, \nTest dataset shape: {1} \nValidation dataset shape: {2}".format(X_train.shape, X_test.shape, X_val.shape))

[ 15  44  11 131 172 131   3  43 180   2]
Train dataset shape: (7318,), 
Test dataset shape: (915,) 
Validation dataset shape: (915,)
[76, 27, 17, 20, 4, 4, 0, 2, 13, 0]


In [21]:
tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/sloberta")
model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/sloberta")

In [25]:
def get_token_ids(texts):
    return tokenizer.batch_encode_plus(texts, add_special_tokens=True, padding = True)["input_ids"]

train_token_ids = get_token_ids(list(X_train))
test_token_ids = get_token_ids(list(X_test))

In [26]:
train_data = tf.data.Dataset.from_tensor_slices((tf.constant(train_token_ids), tf.constant(y_train))).batch(12)
test_data = tf.data.Dataset.from_tensor_slices((tf.constant(test_token_ids), tf.constant(y_test))).batch(12)

In [45]:
from transformers import TFCamembertForMaskedLM, TFBertMainLayer
from tensorflow.keras import layers

import tensorflow as tf
class SloBertEmbeddingModel(TFCamembertForMaskedLM):
    def __init__(self, config,
                 cnn_filters=50,
                 dnn_units=512,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model",
                 *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert", trainable = False)
        
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=1, activation="linear")

    def call(self, inputs, training = False, **kwargs):        
        bert_outputs = self.bert(inputs, training = training, **kwargs)
        
        l_1 = self.cnn_layer1(bert_outputs[0]) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(bert_outputs[0]) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(bert_outputs[0])
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [47]:
CNN_FILTERS = 100
DNN_UNITS = 256
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

text_model = SloBertEmbeddingModel.from_pretrained('EMBEDDIA/sloberta',
                        from_pt=True,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        dropout_rate=DROPOUT_RATE)

text_model.compile(optimizer='adam',
                loss='mse',
                metrics=['mae'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model SloBertEmbeddingModel: ['roberta.encoder.layer.4.output.dense.bias', 'lm_head.layer_norm.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encoder.layer.5.output.LayerNorm.bias', 'roberta.encoder.layer.11.intermediate.dense.weight', 'roberta.encoder.layer.2.attention.self.key.weight', 'roberta.encoder.layer.10.attention.self.query.bias', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.1.attention.self.key.bias', 'roberta.encoder.layer.3.attention.self.value.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.8.attention.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.self.key.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.8.attention.self.key.bias', 'roberta.encoder.layer.9.attention.self.key.weight', 'roberta.encoder.l

In [48]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c34193a0d0>

In [61]:
results_predicted = [round(x[0]) for x in text_model.predict(test_data)]
results_true = list(y_test)

print(f"Predicted: {results_predicted[:20]}")
print(f"Truth    : {results_true[:20]}")

Predicted: [116, 53, 49, 49, 47, 67, 54, 93, 52, 49, 79, 129, 56, 87, 65, 182, 56, 102, 53, 60]
Truth    : [76, 27, 17, 20, 4, 4, 0, 2, 13, 0, 70, 1575, 16, 25, 7, 32, 40, 52, 17, 1]


In [62]:
from sklearn.metrics import mean_absolute_error

print(f"MAE score: {mean_absolute_error(results_true, results_predicted)}")

MAE score: 76.39562841530055
