In [None]:
import keras.backend as K
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import os
from IPython.display import FileLink
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

**Загрузка данных**

In [None]:
# Load both halves of the corpus: ';'-separated files without a header row.
negative = pd.read_csv('../input/sentiments/negative.csv', sep=';', header=None)
# Column 4 is the label: set it to 0 for negative tweets (replacing the original -1)
negative[4] = 0
positive = pd.read_csv('../input/sentiments/positive.csv', sep=';', header=None)
# Shuffle the combined frame with a fixed seed so that Restart & Run All
# reproduces the same row order (17 matches the seed used for the splits below).
sentiments = (
    pd.concat([negative, positive])
    .sample(frac=1, random_state=17)
    .reset_index(drop=True)
)

In [None]:
# Check that every value in the tweet column (3) is a string.
# all() is required: any() would already be True if a single row were a string.
all(sentiments[3].map(type) == str)

In [None]:
# Length (in characters) of the longest tweet
tweet_lengths = sentiments[3].str.len()
max_str_len = tweet_lengths.max()
print(max_str_len)

In [None]:
# Is any row an exact duplicate of an earlier row?
has_duplicates = sentiments.duplicated().any()
has_duplicates

In [None]:
# For the tweet column (3) and the label column (4), report whether any value is missing
for column in (3, 4):
    print(sentiments[column].isnull().any())

In [None]:
# Split into training, validation and test sets: 30% is held out first,
# then 33% of that hold-out becomes the test set and the rest the validation set.
y = sentiments[4]
X = sentiments.drop(columns=4)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=17
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, test_size=0.33, random_state=17
)

**Активация TPU**

In the settings panel (bottom-right), select TPU v3-8 and accept the conditions. Then execute the next cell; you should see an output message like `Running on TPU  grpc://10.0.0.2:8470`.

The code:

1. Initialize the TPU
2. Instantiate a distribution strategy, which allows the model to run in parallel on multiple TPU replicas
3. Keep the resulting strategy object, which holds the distribution settings (the default strategy is used when no TPU is found)

In [None]:
# Pick a distribution strategy that matches the available hardware
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the lines above means we carry on without a TPU
    tpu = None

if not tpu:
    # No TPU: use whatever strategy TensorFlow currently has in scope
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
AUTO = tf.data.experimental.AUTOTUNE

# hyperparameters
EPOCHS = 2
# 16 samples per replica
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Sequence length is taken from the longest tweet (measured in characters) and
# capped at 512, the maximum number of positions DistilBERT supports; without
# the cap an unusually long tweet would make the model input invalid.
MAX_LEN = min(int(max_str_len), 512)
LEARNING_RATE = 3e-5

In [None]:
# Load the pretrained DistilBERT tokenizer and save its files into the working
# directory, then build a BertWordPieceTokenizer from the saved vocab.txt
tokenizer = transformers.DistilBertTokenizer.from_pretrained(
    'distilbert-base-multilingual-cased'
)
tokenizer.save_pretrained('.')
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into sequences of integer token ids for BERT input.

    Args:
        texts: sliceable sequence of strings, e.g. a pandas Series, a numpy
            array, or a plain list/tuple.
        tokenizer: object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch`` (whose results expose ``.ids``).
        chunk_size: number of texts handed to each ``encode_batch`` call.
        maxlen: length passed to both truncation and padding.

    Returns:
        A numpy array built from the per-text id lists, in input order.

    Note:
        Truncation and padding are enabled on the tokenizer object that is
        passed in.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[i : i + chunk_size]
        # Series/ndarray slices expose tolist(); plain lists and tuples do not
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Encode the tweet column (3) of each split, in train / valid / test order
x_train, x_valid, x_test = (
    fast_encode(split[3], fast_tokenizer, maxlen=MAX_LEN)
    for split in (X_train, X_valid, X_test)
)

In [None]:
# Training pipeline. Shuffle BEFORE repeat so every pass over the data is
# reshuffled without mixing samples across epoch boundaries, and use a buffer
# large enough to actually shuffle (the previous buffer of 17 elements barely
# reordered anything). 17 is kept as the seed, matching the splits above.
train_dataset = (tf.data.Dataset
                   .from_tensor_slices((x_train, y_train))
                   .shuffle(2048, seed=17)
                   .repeat()
                   .batch(BATCH_SIZE)
                   .prefetch(AUTO))

# Validation pipeline: batched, cached after the first pass, prefetched
valid_dataset = (tf.data.Dataset
                   .from_tensor_slices((x_valid, y_valid))
                   .batch(BATCH_SIZE)
                   .cache()
                   .prefetch(AUTO))

# Test pipeline: inputs only (no labels), batched in the original order
test_dataset = (tf.data.Dataset
                  .from_tensor_slices(x_test)
                  .batch(BATCH_SIZE))

In [None]:
# F1-score
def f1_score(y_true, y_pred):
    """
    F1 score computed with Keras backend ops from the given labels and predictions.

    Values are clipped to [0, 1] and rounded before counting, and K.epsilon()
    is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    def _rounded_count(values):
        # Clip to [0, 1], round to 0/1, then sum
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    predicted = _rounded_count(y_pred)
    actual = _rounded_count(y_true)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    numerator = 2 * precision * recall
    return numerator / (precision + recall + eps)

In [None]:
def build_model(transformer, max_len=512):
    """
    Build and compile a binary classifier on top of a transformer layer.

    Args:
        transformer: callable layer; element 0 of its output is used as the
            sequence output.
        max_len: length of the integer token-id input sequence.

    Returns:
        A compiled Keras Model with one sigmoid output, trained with binary
        cross-entropy and tracking accuracy, precision, recall and F1.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation at the first sequence position as the input to the classifier
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=LEARNING_RATE),
        loss='binary_crossentropy',
        metrics=['accuracy', Precision(), Recall(), f1_score],
    )

    return model

**Обучение модели**

In [None]:
%%time
with strategy.scope():
    transformer_layer = (transformers.TFDistilBertModel
                                     .from_pretrained('distilbert-base-multilingual-cased'))
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# The training dataset repeats, so the epoch length is set explicitly:
# the number of full batches that fit into the training set.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
)

In [None]:
# Continue training on the (repeated) validation set for twice as many epochs
n_steps = len(X_valid) // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS * 2,
    steps_per_epoch=n_steps,
)

**Confusion matrix на тестовых данных**

In [None]:
# Predict on the test set, round the outputs and tabulate them against the true labels
y_pred = model.predict(test_dataset)
tf.math.confusion_matrix(
    labels=y_test.tolist(),
    predictions=y_pred.round().tolist(),
    num_classes=2,
)

In [None]:
# Write the trained model's weights to model_weights.h5 in the working directory
model.save_weights('./model_weights.h5')

In [None]:
# NOTE(review): './' is the current directory, so this chdir appears to be a no-op
os.chdir(r'./')
# Display a link to the saved weights file
FileLink(r'model_weights.h5')