In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from os import listdir
from os.path import join, isdir

In [None]:
# --- Locations (left empty in this notebook; fill them in before running) ---
# CSV file with the texts to classify, the folder holding one sub-directory
# per saved classifier, and the folder that receives the results.
to_predict_path = ''
classifiers_path = ''
results_path = ''

# --- Optimizer setting: read from module scope when the model is compiled ---
learning_rate = 3e-5

# --- Remaining hyper-parameters ---
# NOTE(review): the prediction function reads max_len and batch_size from each
# classifier's model.properties file rather than from these values; presumably
# they mirror the training setup -- TODO confirm they are still needed here.
max_len = 128
batch_size = 16
epochs = 4
text_chunk_size = 64

In [None]:
def classification_model(classifiers_path, to_predict_path, results_path):
    """Label the texts of one CSV file with every saved classifier.

    Every sub-directory of ``classifiers_path`` is treated as one classifier
    and must contain:

    * ``model.properties`` -- ``key=value`` lines; the keys ``max_len``,
      ``batch_size`` and ``labels`` (comma separated) are read here;
    * ``model_weights.h5`` -- weights loaded into the model created by the
      nested ``build_model``.

    The file at ``to_predict_path`` is read as a ``;``-separated CSV without a
    header row; its first column becomes the index and the following column
    holds the texts. Each classifier contributes one column of predicted
    labels; the columns are written (``;``-separated, no header) to
    ``classifications.csv`` inside ``results_path``.

    Side effects: the DistilBERT tokenizer files are saved into the current
    working directory, because the fast tokenizer is built from the
    ``vocab.txt`` found there. The module-level ``learning_rate`` is read when
    the model is compiled.

    Args:
        classifiers_path: Directory containing one sub-directory per classifier.
        to_predict_path: Path of the CSV file with the texts to classify.
        results_path: Directory that receives ``classifications.csv``.

    Returns:
        None. The result is written to disk.
    """

    # restore model structure
    def build_model(transformer, max_len=512):
        """Put a single sigmoid unit on top of the transformer's first token."""
        input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
        sequence_output = transformer(input_word_ids)[0]
        cls_token = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(cls_token)

        model = Model(inputs=input_word_ids, outputs=out)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy')
        return model

    # encoder for encoding the text into sequence of integers for BERT Input
    def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
        """Return an array of token ids, one row per text, each of length maxlen."""
        tokenizer.enable_truncation(max_length=maxlen)
        try:
            tokenizer.enable_padding(length=maxlen)
        except TypeError:
            # Older `tokenizers` releases call this keyword `max_length`.
            tokenizer.enable_padding(max_length=maxlen)

        all_ids = []
        for i in tqdm(range(0, len(texts), chunk_size)):
            text_chunk = texts[i : i + chunk_size].tolist()
            encodings = tokenizer.encode_batch(text_chunk)
            all_ids.extend([encoding.ids for encoding in encodings])

        return np.array(all_ids)

    def read_properties(path):
        """Parse a file of ``key=value`` lines into a dict of strings."""
        properties = {}
        with open(path, 'r') as file:
            for line in file:
                line = line.strip('\n')
                # Blank lines (e.g. a trailing newline) carry no setting.
                if '=' not in line:
                    continue
                # Split on the first '=' only so values may contain '='.
                key, value = line.split('=', 1)
                properties[key] = value
        return properties

    # TPU
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.get_strategy()
    print("REPLICAS: ", strategy.num_replicas_in_sync)

    # importing tokenizer (saved locally so that 'vocab.txt' exists below)
    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')

    # get all directories
    directories = [directory for directory in listdir(classifiers_path)
                   if isdir(join(classifiers_path, directory))]

    # import BertWordPieceTokenizer
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

    # Every classifier scores the same texts, so the CSV is read only once
    # (and not at all when there is no classifier to run).
    texts = None
    if directories:
        texts = pd.read_csv(to_predict_path, sep=';', index_col=0, header=None).iloc[:, 0]

    # Token ids depend only on the sequence length, so classifiers that share
    # a max_len reuse the same encoded array.
    encoded_by_len = {}
    label_columns = []

    # import models
    with strategy.scope():
        transformer_layer = (transformers.TFDistilBertModel
                                         .from_pretrained('distilbert-base-multilingual-cased'))

        for directory in directories:
            # join() matches the isdir() filter above, so a classifiers_path
            # without a trailing separator resolves to the same directory.
            classifier_dir = join(classifiers_path, directory)
            parameters = read_properties(join(classifier_dir, 'model.properties'))
            model_max_len = int(parameters['max_len'])

            model = build_model(transformer_layer, model_max_len)
            model.load_weights(join(classifier_dir, 'model_weights.h5'))
            # summary() prints by itself and returns None.
            model.summary()

            if model_max_len not in encoded_by_len:
                encoded_by_len[model_max_len] = fast_encode(
                    texts, fast_tokenizer, maxlen=model_max_len)
            test_dataset = (tf.data.Dataset
                                   .from_tensor_slices(encoded_by_len[model_max_len])
                                   .batch(int(parameters['batch_size']) * strategy.num_replicas_in_sync))

            # predictions, rounded to 0/1 and cast to int so they match the
            # integer keys of the label lookup below
            predictions = model.predict(test_dataset).round().astype(int)

            # get labels: position in the comma separated list -> label name
            labels = dict(enumerate(parameters['labels'].split(',')))

            # map predictions to label names; one column per classifier
            label_columns.append(pd.Series(predictions.ravel()).map(labels))

    # Build the result frame once instead of growing it inside the loop.
    if label_columns:
        classifications = pd.concat(label_columns, axis=1)
    else:
        classifications = pd.DataFrame([])
    classifications.to_csv(join(results_path, 'classifications.csv'), sep=';', header=None)

In [None]:
# Run every stored classifier over the input file, using the paths configured above.
classification_model(
    classifiers_path=classifiers_path,
    to_predict_path=to_predict_path,
    results_path=results_path,
)