In [1]:
import keras.backend as K
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import classification_report



In [2]:
TEXT_CHUNK_SIZE = 64

In [3]:
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
tokenizer.save_pretrained('.')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




('./vocab.txt', './special_tokens_map.json', './added_tokens.json')

In [4]:
file = open('../input/terrorism-data/positive.txt', mode='r')
positive = tokenizer.tokenize(file.read())
len_positive = len(positive)
positive = pd.DataFrame([tokenizer.decode(tokenizer.convert_tokens_to_ids(positive[i : i + TEXT_CHUNK_SIZE])) 
                         for i in range(0, len_positive, TEXT_CHUNK_SIZE)])
positive[1] = 1
file.close()

In [5]:
file = open('../input/terrorism-data/negative.txt', mode='r')
negative = tokenizer.tokenize(file.read())
len_negative = len(negative)
negative = pd.DataFrame([tokenizer.decode(tokenizer.convert_tokens_to_ids(negative[i : i + TEXT_CHUNK_SIZE])) 
                         for i in range(0, len_negative, TEXT_CHUNK_SIZE)])
negative[1] = 0
negative = negative.sample(n=positive.shape[0], random_state=17)
file.close()

In [6]:
# random undersampling
texts = pd.concat([negative, positive]).sample(frac=1, random_state=17).reset_index(drop=True)
texts[1].value_counts()

1    4116
0    4116
Name: 1, dtype: int64

In [7]:
# Делим данные на training, valid, test
X = texts[0]
y = texts[1]
x_train_raw, x_test_raw, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=17)
x_valid_raw, x_test_raw, y_valid, y_test = train_test_split(x_test_raw,
                                                            y_test,
                                                            test_size=0.33,
                                                            random_state=17)

In [8]:
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [9]:
AUTO = tf.data.experimental.AUTOTUNE

# hyperparameters
EPOCHS = 4
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 128
LEARNING_RATE = 3e-5

In [10]:
# Загрузка BertWordPieceTokenizer-а
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [11]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i : i + chunk_size].tolist()
        encodings = tokenizer.encode_batch(text_chunk)
        all_ids.extend([encoding.ids for encoding in encodings])
    
    return np.array(all_ids)

In [12]:
x_train_encoded = fast_encode(x_train_raw, fast_tokenizer, maxlen=MAX_LEN)
x_valid_encoded = fast_encode(x_valid_raw, fast_tokenizer, maxlen=MAX_LEN)
# x_test_encoded = fast_encode(x_test_raw, fast_tokenizer, maxlen=MAX_LEN)

100%|██████████| 23/23 [00:00<00:00, 35.53it/s]
100%|██████████| 7/7 [00:00<00:00, 37.86it/s]


In [13]:
train_dataset = (tf.data.Dataset
                   .from_tensor_slices((x_train_encoded, y_train))
                   .repeat()
                   .shuffle(17)
                   .batch(BATCH_SIZE)
                   .prefetch(AUTO))

valid_dataset = (tf.data.Dataset
                   .from_tensor_slices((x_valid_encoded, y_valid))
                   .batch(BATCH_SIZE)
                   .cache()
                   .prefetch(AUTO))

# test_dataset = (tf.data.Dataset
#                   .from_tensor_slices(x_test_encoded)
#                   .batch(BATCH_SIZE))

In [14]:
# F1-score
def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())

In [15]:
def build_terrorism_model(transformer, max_len=512):
    """
    Function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=LEARNING_RATE), loss='binary_crossentropy')
    
    return model

In [16]:
%%time
with strategy.scope():
    transformer_layer = (transformers.TFDistilBertModel
                                     .from_pretrained('distilbert-base-multilingual-cased'))
    terrorism_model = build_terrorism_model(transformer_layer, max_len=MAX_LEN)
terrorism_model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 128, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
CPU times: user 36.8 s, sys: 11.5 s, total: 48.3 s
Wall time: 1min 13s


In [17]:
n_steps = x_train_encoded.shape[0] // BATCH_SIZE

train_history = terrorism_model.fit(train_dataset,
                                    steps_per_epoch=n_steps,
                                    validation_data=valid_dataset,
                                    epochs=EPOCHS)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [18]:
# y_pred = model.predict(test_dataset)
# tf.math.confusion_matrix(y_test.tolist(),
#                          y_pred.round().tolist(),
#                          num_classes=2)

In [19]:
def build_sentiment_model(transformer, max_len=512):
    """
    Function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=3e-5), loss='binary_crossentropy')
    return model


with strategy.scope():
    transformer_layer = (transformers.TFDistilBertModel
                                     .from_pretrained('distilbert-base-multilingual-cased'))
    sentiment_model = build_sentiment_model(transformer_layer, max_len=189)
sentiment_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 189)]             0         
_________________________________________________________________
tf_distil_bert_model_1 (TFDi ((None, 189, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


In [20]:
sentiment_model.load_weights('../input/bert-weights/model_weights.h5')

In [21]:
def classify(x_test_raw):
    x_test_encoded = fast_encode(x_test_raw, fast_tokenizer, maxlen=MAX_LEN)
    test_dataset = (tf.data.Dataset
                      .from_tensor_slices(x_test_encoded)
                      .batch(BATCH_SIZE))
    terrorism_predictions = terrorism_model.predict(test_dataset).round()
    
    print(classification_report(y_test, terrorism_predictions, target_names=['positive', 'negative']))
    print(tf.math.confusion_matrix(y_test.tolist(),
                                   terrorism_predictions.round().tolist(),
                                   num_classes=2))
   
    sentiment_classes = pd.Series(sentiment_model.predict(test_dataset).round().ravel())
    classifications = pd.concat([pd.Series(terrorism_predictions.ravel()), sentiment_classes], axis=1)
    classifications[0] = classifications[0].map({0: 'not_terrorism',
                                                 1: 'terrorism'})
    classifications[1] = classifications[1].map({0: 'negative',
                                                 1: 'positive'})
    classifications.to_csv('./classifications.csv', sep=';', header=None)

In [22]:
classify(x_test_raw)

100%|██████████| 4/4 [00:00<00:00, 43.96it/s]


              precision    recall  f1-score   support

    positive       0.91      0.97      0.94       415
    negative       0.97      0.90      0.93       401

    accuracy                           0.94       816
   macro avg       0.94      0.94      0.94       816
weighted avg       0.94      0.94      0.94       816

tf.Tensor(
[[403  12]
 [ 40 361]], shape=(2, 2), dtype=int32)


In [23]:
from IPython.display import FileLink
FileLink(r'classifications.csv')