In [51]:
import keras.backend as K
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import re
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import classification_report
from os import listdir
from os.path import isfile, join, isdir

In [52]:
input_path = ''
output_path = ''
classifiers_path = ''
results_path = ''
to_predict_path = ''
text_chunk_size = 64
epochs = 4
batch_size = 16
max_len = 128
learning_rate = 3e-5

In [53]:
def train_model(input_path, output_path, text_chunk_size, epochs, batch_size, max_len, learning_rate):
    
    # encoder for encoding the text into sequence of integers for BERT Input
    def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):

        tokenizer.enable_truncation(max_length=maxlen)
        tokenizer.enable_padding(max_length=maxlen)
        all_ids = []

        for i in tqdm(range(0, len(texts), chunk_size)):
            text_chunk = texts[i : i + chunk_size].tolist()
            encodings = tokenizer.encode_batch(text_chunk)
            all_ids.extend([encoding.ids for encoding in encodings])

        return np.array(all_ids)
    
    
    # function for training the BERT model
    def build_model(transformer, max_len=512):
        input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
        sequence_output = transformer(input_word_ids)[0]
        cls_token = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(cls_token)

        model = Model(inputs=input_word_ids, outputs=out)
        model.compile(Adam(lr=learning_rate), loss='binary_crossentropy')

        return model
    
    
    # TPU
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None
        
    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:

        strategy = tf.distribute.get_strategy()
    print("REPLICAS: ", strategy.num_replicas_in_sync)
    
    AUTO = tf.data.experimental.AUTOTUNE
    
    # batch size according to TPU
    batch_size *= strategy.num_replicas_in_sync
    
    # importing tokenizer
    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')
    
    # importing files and creating dataframe
    files = [file for file in listdir(input_path) if isfile(join(input_path, file))]
    files.sort()
    
    df = pd.DataFrame([])
    label = 0
    classes = [file.split('.')[0] for file in files]

    for file in files:
        file = open(input_path + file, mode='r')
        label_data = tokenizer.tokenize(file.read())
        len_label_data = len(label_data)
        label_df = pd.DataFrame([tokenizer.decode(tokenizer.convert_tokens_to_ids(label_data[i : i + text_chunk_size])) 
                                 for i in range(0, len_label_data, text_chunk_size)])
        label_df[1] = label
        df = pd.concat([df, label_df], ignore_index=True)
        label += 1
        file.close()
    
    # random undersampling
    min_class_n = df[1].value_counts().min()
    df = df.groupby(1).apply(lambda x: x.sample(min_class_n)).reset_index(drop=True)
    
    # split data
    X = df[0]
    y = df[1]
    x_train_raw, x_test_raw, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.3,
                                                                random_state=17)
    x_valid_raw, x_test_raw, y_valid, y_test = train_test_split(x_test_raw,
                                                                y_test,
                                                                test_size=0.33,
                                                                random_state=17)
   
    # import BertWordPieceTokenizer
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
    
    # dataset encoding
    x_train_encoded = fast_encode(x_train_raw, fast_tokenizer, maxlen=max_len)
    x_valid_encoded = fast_encode(x_valid_raw, fast_tokenizer, maxlen=max_len)
    x_test_encoded = fast_encode(x_test_raw, fast_tokenizer, maxlen=max_len)
    
    
    train_dataset = (tf.data.Dataset
                       .from_tensor_slices((x_train_encoded, y_train))
                       .repeat()
                       .shuffle(17)
                       .batch(batch_size)
                       .prefetch(AUTO))

    valid_dataset = (tf.data.Dataset
                       .from_tensor_slices((x_valid_encoded, y_valid))
                       .batch(batch_size)
                       .cache()
                       .prefetch(AUTO))

    test_dataset = (tf.data.Dataset
                      .from_tensor_slices(x_test_encoded)
                      .batch(batch_size))
    
    with strategy.scope():
        transformer_layer = (transformers.TFDistilBertModel
                                         .from_pretrained('distilbert-base-multilingual-cased'))
        model = build_model(transformer_layer, max_len)
    print(model.summary())
    
    n_steps = x_train_encoded.shape[0] // batch_size

    train_history = model.fit(train_dataset,
                              steps_per_epoch=n_steps,
                              validation_data=valid_dataset,
                              epochs=epochs)
    print(train_history)
    predictions = model.predict(test_dataset).round()
    
    # assessments
    assessments = open(output_path + 'assessments.txt', 'w')
    assessments.write(classification_report(y_test,
                                            predictions,
                                            target_names=classes) + \
                                            '\nConfusion Matrix\n' + \
                      str(tf.math.confusion_matrix(y_test.tolist(),
                          predictions.round().tolist(),
                          num_classes=len(classes)).numpy()))
    assessments.close()

    # save model's hyperparameters
    parameters = open(output_path + 'model.properties', 'w')
    parameters.write(f'text_chunk_size={text_chunk_size}\n' \
                     f'epochs={epochs}\n' \
                     f'batch_size={batch_size // strategy.num_replicas_in_sync}\n' \
                     f'max_len={max_len}\n' \
                     f'learning_rate={learning_rate}\n' \
                     f'labels={",".join(classes)}')
    parameters.close()
    
    # save model's weights
    model.save_weights(output_path + 'model_weights.h5')

In [54]:
train_model(input_path, output_path, text_chunk_size, epochs, batch_size, max_len, learning_rate)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


100%|██████████| 23/23 [00:00<00:00, 35.48it/s]
100%|██████████| 7/7 [00:00<00:00, 38.51it/s]
100%|██████████| 4/4 [00:00<00:00, 44.00it/s]


Model: "model_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tf_distil_bert_model_18 (TFD ((None, 128, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice_16 [(None, 768)]             0         
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
<tensorflow.python.keras.callbacks.History object at 0x7f44f899c410>


In [55]:
def classification_model(classifiers_path, to_predict_path, results_path):
    
    # restore model structure
    def build_model(transformer, max_len=512):
        
        input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
        sequence_output = transformer(input_word_ids)[0]
        cls_token = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(cls_token)

        model = Model(inputs=input_word_ids, outputs=out)
        model.compile(Adam(lr=learning_rate), loss='binary_crossentropy')
        return model

    
    # encoder for encoding the text into sequence of integers for BERT Input
    def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):

        tokenizer.enable_truncation(max_length=maxlen)
        tokenizer.enable_padding(max_length=maxlen)
        all_ids = []

        for i in tqdm(range(0, len(texts), chunk_size)):
            text_chunk = texts[i : i + chunk_size].tolist()
            encodings = tokenizer.encode_batch(text_chunk)
            all_ids.extend([encoding.ids for encoding in encodings])

        return np.array(all_ids)
    

    # TPU
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None
        
    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:

        strategy = tf.distribute.get_strategy()
    print("REPLICAS: ", strategy.num_replicas_in_sync)
    
    AUTO = tf.data.experimental.AUTOTUNE

    # importing tokenizer
    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')

    # get all directories
    directories = [directory for directory in listdir(classifiers_path) 
                   if isdir(join(classifiers_path, directory))]
    
    # import BertWordPieceTokenizer
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
    
    # import models
    with strategy.scope():
        transformer_layer = (transformers.TFDistilBertModel
                                         .from_pretrained('distilbert-base-multilingual-cased'))

        classifications = pd.DataFrame([])
        
        for directory in directories:
            parameters = {}
            with open(classifiers_path + directory + '/model.properties', 'r') as file:
                lines = file.readlines()
                for line in lines:
                    parameter = line.strip('\n').split('=')
                    parameters[parameter[0]] = parameter[1]
            
            model = build_model(transformer_layer,
                                int(parameters['max_len']))
            
            model.load_weights(classifiers_path + directory + '/model_weights.h5')
            print(model.summary())
            
            # import dataset
            x_test_raw = pd.read_csv(to_predict_path, sep=';', index_col=0, header=None)
            x_test_encoded = fast_encode(x_test_raw.iloc[:, 0], fast_tokenizer, maxlen=int(parameters['max_len']))
            test_dataset = (tf.data.Dataset
                                   .from_tensor_slices(x_test_encoded)
                                   .batch(int(parameters['batch_size']) * strategy.num_replicas_in_sync))
            # predictions
            predictions = model.predict(test_dataset).round()

            # get labels
            labels = parameters['labels'].split(',')
            labels = {index: labels[index] for index in range(len(labels))}

            # add and map predictions
            classifications = pd.concat([classifications, pd.Series(predictions.ravel()).map(labels)], axis=1)
    classifications.to_csv(results_path + 'classifications.csv', sep=';', header=None)

In [56]:
classification_model(classifiers_path, to_predict_path, results_path)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


100%|██████████| 1/1 [00:00<00:00, 324.71it/s]

Model: "model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tf_distil_bert_model_19 (TFD ((None, 128, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice_17 [(None, 768)]             0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
None



100%|██████████| 1/1 [00:00<00:00, 386.82it/s]

Model: "model_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tf_distil_bert_model_19 (TFD ((None, 128, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice_18 [(None, 768)]             0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
None



