In [None]:
import sys
import os
import tensorflow as tf
import shutil
sys.path.append('../')
from ner.embeddings_resolver import BertEmbeddingsResolver
from ner.ner_model_saver import NerModelSaver

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Manually add the Spark NLP developer library to the import path
sparknlp_path = '../../'
if sparknlp_path:
    sys.path.append(sparknlp_path)

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.embeddings import *
import sparknlp 

import time
import zipfile
# Set the location of the test resources directory
resource_path= "../../../src/test/resources/"

In [None]:
# Start the Spark session through the Spark NLP helper.
spark = sparknlp.start()

print("Spark NLP version")
# A bare expression in the middle of a cell is evaluated and discarded, so
# the version has to be printed explicitly to show up under its heading.
print(sparknlp.version())
print("Apache Spark version")
# Last expression of the cell: rendered as the cell's output.
spark.version

In [None]:
def download_model(url, folder):
    """Download a zipped model archive and unpack it, skipping finished steps.

    The archive is stored as ``folder + ".zip"`` and is fetched only when that
    file does not exist yet. It is unpacked only when ``folder`` does not
    exist yet.

    Args:
        url: Location of the zip archive to fetch.
        folder: Path of the directory the archive is expected to produce.
            The archive is extracted into the parent directory of this path
            (the current directory for a bare name), so the archive has to
            contain a top-level directory named like the last component of
            ``folder`` for the directory to show up at that path.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``zipfile.ZipFile`` raise
        on a failed download or a broken archive.
    """
    import os
    from pathlib import Path
    import urllib.request
    import zipfile

    zip_file = folder + ".zip"
    if not Path(zip_file).is_file():
        print("Downloading " + url)
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that the
        # is_file() check above would accept as complete on the next run.
        partial_file = zip_file + ".part"
        try:
            urllib.request.urlretrieve(url, partial_file)
            os.replace(partial_file, zip_file)
        finally:
            # Only present here when the download or the rename failed.
            if os.path.exists(partial_file):
                os.remove(partial_file)
    if not os.path.exists(folder):
        print("Unziping ")
        # Extract next to the zip file; for a bare folder name this is the
        # current directory.
        target_dir = os.path.dirname(folder) or "./"
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(target_dir)


def get_service_token_ids(source_bert_folder):
    """Return the vocabulary ids of the ``[CLS]`` and ``[SEP]`` tokens.

    Reads ``vocab.txt`` inside ``source_bert_folder``; the id of a token is
    its zero-based line number in that file.

    Args:
        source_bert_folder: Directory that contains ``vocab.txt``.

    Returns:
        A ``(start_id, end_id)`` tuple with the line numbers of ``[CLS]`` and
        ``[SEP]``. An id stays 0 when its token is not found; when a token
        occurs more than once the last occurrence wins.
    """
    start_id = 0
    end_id = 0
    vocab_path = os.path.join(source_bert_folder, "vocab.txt")
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which can fail or mis-decode non-ASCII vocabulary entries.
    with open(vocab_path, encoding="utf-8") as f:
        for line, row in enumerate(f):
            row = row.strip()
            if row == '[CLS]':
                start_id = line
            if row == '[SEP]':
                end_id = line
    return (start_id, end_id)


def create_model(source_bert_folder, export_dir, max_length = 256, batch_size = 5):
    """Export a BERT checkpoint folder and load it as a ``BertEmbeddings`` model.

    Steps: reset the default TensorFlow graph, build a
    ``BertEmbeddingsResolver`` for the checkpoint, save it to ``export_dir``
    through ``NerModelSaver``, copy ``vocab.txt`` next to the export, then
    load the export with ``BertEmbeddings.loadFromPython`` and configure it.

    Args:
        source_bert_folder: Directory with the original checkpoint and its
            ``vocab.txt``. The model counts as cased unless the path contains
            ``"uncased"`` (case-insensitive).
        export_dir: Directory the converted model is written to.
        max_length: Passed to the resolver and to ``setMaxSentenceLength``.
        batch_size: Passed to ``setBatchSize``.

    Returns:
        The configured ``BertEmbeddings`` model with input columns
        ``["sentence", "wordpiece"]`` and output column ``"bert"``.
    """
    tf.reset_default_graph()
    is_cased = 'uncased' not in source_bert_folder.lower()
    print("source_bert_folder: {}".format(source_bert_folder))
    print("is_cased: {}".format(is_cased))
    # Lower-casing is the opposite of being cased: the original passed
    # `is_cased` here, which disabled lower-casing for uncased models and
    # enabled it for cased ones.
    # NOTE(review): assumes `lowercase` tells the resolver to lower-case its
    # input -- confirm against ner/embeddings_resolver.py.
    resolver = BertEmbeddingsResolver(source_bert_folder, max_length, lowercase=not is_cased)
    try:
        saver = NerModelSaver(resolver, None)
        saver.save_models(export_dir)
    finally:
        # Close the session even when the export fails.
        resolver.session.close()
    shutil.copyfile(os.path.join(source_bert_folder, 'vocab.txt'),
                    os.path.join(export_dir, 'vocab.txt'))
    dim = resolver.config.hidden_size
    model = BertEmbeddings.loadFromPython(export_dir, spark) \
        .setMaxSentenceLength(max_length) \
        .setBatchSize(batch_size) \
        .setDimension(dim) \
        .setCaseSensitive(is_cased) \
        .setInputCols(["sentence", "wordpiece"]) \
        .setOutputCol("bert")
    return model


def download_and_convert(url, name, max_length = 256, batch_size = 5, dst_folder = 'models'):
    """Download a BERT archive, convert it and save the resulting model.

    Args:
        url: Location of the zipped checkpoint.
        name: Name of the checkpoint folder; also used for the zip file
            (``name + ".zip"``), the temporary export directory
            (``name + "export_dir"``) and the saved model
            (``dst_folder/name``).
        max_length: Forwarded to ``create_model``.
        batch_size: Forwarded to ``create_model``.
        dst_folder: Directory the converted model is saved under; created
            when missing.

    Returns:
        The model returned by ``create_model``.
    """
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)
    download_model(url, name)
    export_dir = name + 'export_dir'
    # Forward the caller's settings; the original hard-coded 256 and 5 here
    # and silently ignored both parameters.
    model = create_model(name, export_dir, max_length, batch_size)
    # Remove but it's possible to use this model
    shutil.rmtree(export_dir)
    shutil.rmtree(name)
    model.write().overwrite().save(os.path.join(dst_folder, name))
    return model

## Find models and source code here https://github.com/google-research/bert 

In [None]:
# 1. Base uncased
# Fetch the archive, convert it and save the result under "models/<name>".
name = 'uncased_L-12_H-768_A-12'
url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'
download_and_convert(url=url, name=name)

In [None]:
# 2. Large uncased
# Fetch the archive, convert it and save the result under "models/<name>".
name = 'uncased_L-24_H-1024_A-16'
url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'
download_and_convert(url=url, name=name)

In [None]:
# 3. Base cased
# Fetch the archive, convert it and save the result under "models/<name>".
name = 'cased_L-12_H-768_A-12'
url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip'
download_and_convert(url=url, name=name)

In [None]:
# 4. Large cased
# Fetch the archive, convert it and save the result under "models/<name>".
name = 'cased_L-24_H-1024_A-16'
url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip'
download_and_convert(url=url, name=name)

In [None]:
# Manual follow-up step: this cell only prints a reminder, it uploads nothing.
# "models" is the default dst_folder of download_and_convert.
print('upload all generated models from folder "models"')