In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm



**Загрузка данных**

In [2]:
negative = pd.read_csv('../input/sentiments/negative.csv', sep=';', header=None)
# заменяем -1 на 0 в отрицательно окрашенных сообщениях
negative[4] = 0
positive = pd.read_csv('../input/sentiments/positive.csv', sep=';', header=None)
sentiments = pd.concat([negative, positive]).sample(frac=1).reset_index(drop=True)

In [3]:
# Проверяем все ли данные строкового типа в столбце твитов
any(sentiments[3].map(type) == str)

True

In [4]:
# Находим максимальную длину твита
max_str_len = sentiments[3].str.len().max()
print(max_str_len)

189


In [5]:
# Проверка на дупликаты
sentiments.duplicated().any()

False

In [6]:
# Проверка на missing values
print(sentiments[3].isnull().any())
print(sentiments[4].isnull().any())

False
False


In [7]:
# Делим данные на training, valid, test
X = sentiments.drop(4, axis=1)
y = sentiments[4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

**Активация TPU**

On the settings box, bottom-right, select TPU v3-8 and accept the conditions. Execute the next cell, you should see an output message like Running on TPU: grpc://10.0.0.2:8470.

The code:

1. Initialize the TPU
2. Instantiate a distribution strategy, this will permit to run the model in parallel on multiple TPU replicas
3. Return the TPU object containing the distribution strategy settings

In [8]:
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [9]:
AUTO = tf.data.experimental.AUTOTUNE

# hyperparameters
EPOCHS = 3
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 256
LEARNING_RATE = 1e-5

In [10]:
# Загрузка BertWordPieceTokenizer-а
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
tokenizer.save_pretrained('.')
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [11]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i : i + chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [12]:
x_train = fast_encode(X_train[3], fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(X_test[3], fast_tokenizer, maxlen=MAX_LEN)

100%|██████████| 621/621 [00:16<00:00, 37.46it/s]
100%|██████████| 266/266 [00:06<00:00, 43.51it/s]


In [13]:
train_dataset = (tf.data.Dataset
                   .from_tensor_slices((x_train, y_train))
                   .repeat()
                   .shuffle(17)
                   .batch(BATCH_SIZE)
                   .prefetch(AUTO))

test_dataset = (tf.data.Dataset
                  .from_tensor_slices((x_test, y_test))
                  .batch(BATCH_SIZE)
                  .cache()
                  .prefetch(AUTO))

In [14]:
def build_model(transformer, max_len=512):
    """
    Function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

**Обучение модели**

In [15]:
%%time
with strategy.scope():
    transformer_layer = (transformers.TFDistilBertModel
                                     .from_pretrained('distilbert-base-multilingual-cased'))
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 256, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
CPU times: user 35.8 s, sys: 13.6 s, total: 49.4 s
Wall time: 1min 3s


In [16]:
n_steps = x_train.shape[0] // BATCH_SIZE

train_history = model.fit(train_dataset,
                          steps_per_epoch=n_steps,
                          validation_data=test_dataset,
                          epochs=EPOCHS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
n_steps = X_test.shape[0] // BATCH_SIZE

train_history_2 = model.fit(test_dataset.repeat(),
                            steps_per_epoch=n_steps,
                            epochs=EPOCHS*2)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
