In [9]:
import sys, os, zipfile, json, pandas as pd, shutil, boto3, datetime, tabulate, pytz
from collections import OrderedDict

sys.path.append("/home/fernandrez/JSL/repos/spark-nlp/python")
sys.path.append("/home/fernandrez/JSL/repos/spark-nlp-internal/python")

In [10]:
# Shared S3 client and bucket settings used by the helper functions below.
s3 = boto3.client('s3')
bucket = 'auxdata.johnsnowlabs.com'
# Base of the AWS web-console link for an object in the bucket.
bucket_url = f"https://s3.console.aws.amazon.com/s3/object/{bucket}"
# Base of the s3:// URI for the bucket; note that it already ends with "/".
download_url = f"s3://{bucket}/"

In [11]:
def get_s3_metadata(prefix="clinical/models"):
    """List every object in the module-level ``bucket`` whose key starts with ``prefix``.

    A single ``list_objects_v2`` call returns one page of results only, so the
    listing is followed through its continuation tokens and all pages are
    merged into the first response.

    Args:
        prefix: key prefix to list.

    Returns:
        dict: the first ``list_objects_v2`` response, with ``"Contents"``
        replaced by the entries of all pages (an empty list when nothing
        matches). Keys are requested with ``EncodingType='url'``.
    """
    request = {"Bucket": bucket, "EncodingType": "url", "Prefix": prefix}
    response = s3.list_objects_v2(**request)
    contents = list(response.get("Contents", []))

    page = response
    while page.get("IsTruncated"):
        page = s3.list_objects_v2(
            ContinuationToken=page["NextContinuationToken"], **request)
        contents.extend(page.get("Contents", []))

    # Callers index ["Contents"] directly, so always provide the key.
    response["Contents"] = contents
    response["KeyCount"] = len(contents)
    response["IsTruncated"] = False
    return response

In [12]:
def filter_desired_names(content):
    """Pick the object keys that should appear in the published model table.

    Args:
        content: iterable of dicts that each carry a ``"Key"`` string.

    Returns:
        list[str]: keys, in input order, that end in ``zip``, contain
        ``_en_2`` and contain none of the excluded fragments listed below.
    """
    excluded_fragments = (
        "2ng",
        "icdoem",
        "snomed_l",
        "rxnorm_l",
        "/resolve",
        "noncontrib",
        "embeddings_icd10_base",
        "icdem",
        "demo",
        "_n2c",
        "people_disambiguator",
    )
    selected = []
    for entry in content:
        key = entry["Key"]
        if key[-3:] != "zip" or "_en_2" not in key:
            continue
        if any(fragment in key for fragment in excluded_fragments):
            continue
        selected.append(key)
    return selected

In [13]:
def get_latest(model_list, lang="en"):
    """Describe the newest build of every model found in ``model_list``.

    Keys are expected to look like
    ``<collection>/<name>_<lang>_<build>_<spark version>_<timestamp>.zip``.

    Args:
        model_list: iterable of S3 object keys (strings).
        lang: language code separating the model name from its version suffix.

    Returns:
        pandas.DataFrame: one row per model name, taken from the key with the
        largest ``Timestamp``. Columns include ``Name``, ``Timestamp``,
        ``Collection``, ``Lang``, ``Build``, ``Spark Version``, ``Download``,
        ``S3`` and ``Date`` (plus the ``index`` column produced by the final
        ``reset_index``, kept so the returned shape is unchanged).

    Raises:
        ValueError: if a key's version suffix is not made of exactly three
            ``_``-separated parts.
    """
    separator = f"_{lang}_"
    # download_url already ends with "/"; strip it so links don't contain "//".
    s3_base = download_url.rstrip("/")

    rows = []
    for key in model_list:
        # Require the full "_<lang>_" separator: a bare `lang in key` test
        # matches almost any key (e.g. "en" inside "enriched").
        if separator not in key:
            continue
        # The version suffix is at the end, so split on the last separator.
        stem, version = key.rsplit(separator, 1)
        # rpartition tolerates keys without a "/" (empty collection).
        collection, _, name = stem.rpartition("/")
        parts = version.split("_")
        if len(parts) != 3:
            raise ValueError(f"Unexpected model key layout: {key!r}")
        build, spark_version, stamped = parts
        rows.append((
            collection, name, lang, f"`{build}`", f"`{spark_version}`",
            stamped[:-4],  # drop the trailing ".zip"
            f"[:floppy_disk:]({bucket_url}/{key} 'Download')",
            f"[:computer:]({s3_base}/{key} 'S3')",
        ))

    all_models = pd.DataFrame(
        rows,
        columns=["Collection", "Name", "Lang", "Build", "Spark Version",
                 "Timestamp", "Download", "S3"])

    # Timestamps are compared as strings, which orders correctly only while
    # they all have the same number of digits.
    latest = (all_models.groupby("Name")["Timestamp"].max()
              .reset_index()
              .set_index(["Name", "Timestamp"]))
    names = (all_models.set_index(["Name", "Timestamp"])
             .join(latest, how="inner")
             .reset_index())
    # Timestamp is in milliseconds; the date is rendered in local time.
    names["Date"] = names["Timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(int(x) / 1000).strftime("%Y-%m-%d"))
    # Second reset_index kept on purpose: it adds the "index" column that the
    # original return value carried.
    names = names.reset_index()
    return names

In [38]:
def write_github_table(outpath="/home/fernandrez/JSL/repos/spark-nlp-models/enterprise.md",
                       metadata_path="/home/fernandrez/JSL/repos/spark-nlp-models/entrerprise/model_metadata.csv"):
    """Write a GitHub-flavoured markdown table of the latest models to ``outpath``.

    The latest model keys from S3 are inner-joined on ``Name`` with the
    metadata CSV, so only models present in both appear in the table.

    Args:
        outpath: destination markdown file (overwritten).
        metadata_path: CSV with at least the columns ``Name``, ``Model``,
            ``TrainedOn``, ``DatasetLink`` and ``Extracts``. Defaults to the
            location that was previously hard-coded.
    """
    metadata = pd.read_csv(metadata_path)
    keys = filter_desired_names(get_s3_metadata()["Contents"])
    model_list = (get_latest(keys).set_index("Name")
                  .join(metadata.set_index("Name"), how="inner")
                  .reset_index()
                  .fillna(""))

    model_list["Name"] = model_list["Name"].apply(lambda x: f"`{x}`")
    model_list["Model"] = model_list["Model"].apply(lambda x: f"`{x}`")
    model_list["Build"] = model_list["Build"].apply(lambda x: f"{x}")
    # Clipboard icon linking to the dataset ("#" when no link), with the
    # dataset name as the hover title.
    model_list["TrainedOn"] = model_list[["TrainedOn", "DatasetLink"]].apply(
        lambda x: f"[:clipboard:]({(x.DatasetLink if x.DatasetLink else '#')} '{x.TrainedOn}')", axis=1)
    model_list["Extracts"] = model_list["Extracts"].apply(
        lambda x: f"[:mag:](# 'Extracts: {x}')" if x else "")
    model_list = model_list.drop("DatasetLink", axis=1).sort_values("Model")

    selhdrs = ["Model", "Name", "Build", "Extracts", "TrainedOn", "Download"]
    table = tabulate.tabulate(model_list[selhdrs],
                              # The three icon columns get empty headers.
                              headers=selhdrs[:-3] + ["", "", ""],
                              tablefmt="github",
                              # Use the documented boolean, not the string "false".
                              showindex=False)
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(table)

In [39]:
def print_aux_dict(prefix, model_prefix):
    """Print a dict-literal skeleton mapping staged file names to model names.

    Every object listed under ``prefix`` is printed as
    ``"<key relative to prefix>":"<model_prefix>_",`` so the output can be
    pasted into a cell and the names completed by hand.

    Args:
        prefix: S3 key prefix to list; a trailing "/" is added for stripping.
        model_prefix: text placed before the trailing "_" of each value.
    """
    response = get_s3_metadata(prefix)
    # endswith also copes with an empty prefix, where prefix[-1] would raise.
    if not prefix.endswith("/"):
        prefix = prefix + "/"
    print("{")
    # "Contents" is absent from the S3 response when nothing matches.
    for d in response.get("Contents", []):
        key = d["Key"]
        # Strip the prefix only where it leads the key, not wherever it occurs.
        relative = key[len(prefix):] if key.startswith(prefix) else key
        print('"' + relative + f'":"{model_prefix}_",')
    print("}")

## WRITE TABLE

In [40]:
# Regenerate the markdown table using the function's default output path.
write_github_table()

## PUBLISH MODELS

In [43]:
# Bucket folder that holds the freshly trained archives to publish.
source_prefix = "clinical/resources/temp_models"
# `publish_models` reads the module-level name `prefix`, and the call below
# used it without it ever being assigned; bind it explicitly.
prefix = source_prefix
model_prefix = "ner"
print_aux_dict(prefix, model_prefix)


{
"JNLPBA_NER_model_20200420_30e_b32.zip":"ner_",
"anatomy_NER_model_20200314_30e_b8.zip":"ner_",
"deid_NER_model_20200326_7_labels_20e_b8.zip":"ner_",
"deid_NER_model_20200326_enriched_labels_50e_b8.zip":"ner_",
"deid_NER_model_large_20200406_7_labels_30e_b32.zip":"ner_",
"disease%2Bi2b2_NER_model_20200306.zip":"ner_",
"drug_NER_model_20200306.zip":"ner_",
"jsl_internal_NER_model_20200325_50e_b16.zip":"ner_",
"jsl_internal_NER_model_20200325_52_labels_30e_b8.zip":"ner_",
"posology_NER_2018_large_model_v2_20200403_7_labels_30e_b32.zip":"ner_",
"posology_NER_2018_v2_model_20200402_7_labels_30e_b32.zip":"ner_",
"posology_NER_model_large_20200309_10e.zip":"ner_",
"risk_factor_NER_model_20200401_19_labels_50e_b8.zip":"ner_",
}


In [48]:
# Maps each staged archive file name to the model name it is published under.
# This dict is passed to publish_models and iterated by the cells below.
names = {
"JNLPBA_NER_model_20200420_30e_b32.zip":"ner_cellular",
"anatomy_NER_model_20200314_30e_b8.zip":"ner_anatomy",
"deid_NER_model_20200326_enriched_labels_50e_b8.zip":"ner_deid_enriched",
"deid_NER_model_large_20200406_7_labels_30e_b32.zip":"ner_deid_large",
"jsl_internal_NER_model_20200325_50e_b16.zip":"ner_jsl",
"jsl_internal_NER_model_20200325_52_labels_30e_b8.zip":"ner_jsl_enriched",
"posology_NER_2018_large_model_v2_20200403_7_labels_30e_b32.zip":"ner_posology_large",
"posology_NER_2018_v2_model_20200402_7_labels_30e_b32.zip":"ner_posology_small",
"risk_factor_NER_model_20200401_19_labels_50e_b8.zip":"ner_risk_factors",
}

In [102]:
def publish_models(names, lang, libv, sparkv, correctly_zipped=True, source_prefix=None):
    """Publish staged model archives under ``clinical/models`` in the bucket.

    Each archive is stored as ``<name>_<lang>_<libv>_<sparkv>_<ts>.zip``. The
    millisecond timestamp ``ts`` starts from the current time and is
    decreased by 1000 for every model, so each one gets a distinct value.

    Args:
        names: dict mapping staged file name -> published model name.
        lang: language code used in the file name and metadata.
        libv: dotted library version, e.g. ``"2.4.2"``.
        sparkv: dotted Spark version, e.g. ``"2.4"``.
        correctly_zipped: when True the staged object is copied as is inside
            S3. When False it is downloaded, unzipped, re-zipped from the
            directory that contains ``metadata``, and uploaded.
        source_prefix: S3 prefix of the staged files. When omitted, the
            module-level ``prefix`` is used, as before.

    Returns:
        list[str]: one metadata JSON line per model.

    Raises:
        FileNotFoundError: if an unzipped archive has no ``metadata`` directory.
    """
    import posixpath  # S3 keys always use "/", whatever the local OS is

    src_prefix = prefix if source_prefix is None else source_prefix
    metas = []
    ts = int(datetime.datetime.timestamp(datetime.datetime.now())*1000)
    libparts = '{{"parts":[{}]}}'.format(",".join(libv.split(".")))
    sparkparts = '{{"parts":[{}]}}'.format(",".join(sparkv.split(".")))

    if not correctly_zipped:
        # Local scratch directory for the download / re-zip round trip.
        os.makedirs("temp_models", exist_ok=True)

    for p, n in names.items():
        ts -= 1000
        tme = datetime.datetime.fromtimestamp(ts/1000,pytz.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]+"Z"
        nm = f'{{"name":"{n}","language":"{lang}","libVersion":{libparts},"sparkVersion":{sparkparts},"readyToUse":true,"time":"{tme}","isZipped":true,"category":"nd","checksum":""}}'
        fn = f"{n}_{lang}_{libv}_{sparkv}_{ts}"
        metas.append(nm)

        source_key = posixpath.join(src_prefix, p)
        target_key = posixpath.join("clinical/models", fn + ".zip")

        if correctly_zipped:
            s3.copy({"Bucket": bucket, "Key": source_key}, bucket, target_key)
            continue

        local_zip = os.path.join("temp_models", n)
        unzipped = os.path.join("temp_models", n + "_unzipped")
        repacked = os.path.join("temp_models", fn + ".zip")
        try:
            s3.download_file(bucket, source_key, local_zip)
            with zipfile.ZipFile(local_zip, 'r') as zip_ref:
                zip_ref.extractall(unzipped)
            # The model root is the first directory that holds "metadata".
            model_root = next(
                (root for root, dirs, _ in os.walk(unzipped, topdown=True)
                 if "metadata" in dirs),
                None)
            if model_root is None:
                # Previously the last visited directory was archived instead.
                raise FileNotFoundError(
                    f"No 'metadata' directory found inside {source_key}")
            shutil.make_archive(os.path.join("temp_models", fn), "zip", model_root)
            s3.upload_file(repacked, bucket, target_key)
        finally:
            # Remove scratch files even when a step above failed.
            for leftover in (local_zip, repacked):
                if os.path.exists(leftover):
                    os.remove(leftover)
            shutil.rmtree(unzipped, ignore_errors=True)
    return metas

In [107]:
# Version / language settings for the publish step.
libv="2.4.2"
sparkv="2.4"
lang="en"
# NOTE(review): the commented-out publish_models call in the next cell passes a literal False rather than this variable.
correctly_zipped = False

In [105]:
# metas = publish_models(names, lang, libv, sparkv, False)
# for m in metas:
#     print(m)

{"name":"ner_cellular","language":"en","libVersion":{"parts":[2,4,2]},"sparkVersion":{"parts":[2,4]},"readyToUse":true,"time":"2020-04-21T23:55:08.751Z","isZipped":true,"category":"nd","checksum":""}
{"name":"ner_anatomy","language":"en","libVersion":{"parts":[2,4,2]},"sparkVersion":{"parts":[2,4]},"readyToUse":true,"time":"2020-04-21T23:55:07.751Z","isZipped":true,"category":"nd","checksum":""}
{"name":"ner_deid_enriched","language":"en","libVersion":{"parts":[2,4,2]},"sparkVersion":{"parts":[2,4]},"readyToUse":true,"time":"2020-04-21T23:55:06.751Z","isZipped":true,"category":"nd","checksum":""}
{"name":"ner_deid_large","language":"en","libVersion":{"parts":[2,4,2]},"sparkVersion":{"parts":[2,4]},"readyToUse":true,"time":"2020-04-21T23:55:05.751Z","isZipped":true,"category":"nd","checksum":""}
{"name":"ner_jsl","language":"en","libVersion":{"parts":[2,4,2]},"sparkVersion":{"parts":[2,4]},"readyToUse":true,"time":"2020-04-21T23:55:04.751Z","isZipped":true,"category":"nd","checksum":""}

In [114]:
# Print one quoted CSV row per published model name, with the class fixed to
# NerDLModel and three empty trailing fields.
for v in names.values():
    print(f'"{v}","NerDLModel","","",""')

"ner_cellular","NerDLModel",""
"ner_anatomy","NerDLModel",""
"ner_deid_enriched","NerDLModel",""
"ner_deid_large","NerDLModel",""
"ner_jsl","NerDLModel",""
"ner_jsl_enriched","NerDLModel",""
"ner_posology_large","NerDLModel",""
"ner_posology_small","NerDLModel",""
"ner_risk_factors","NerDLModel",""


In [108]:
# Star import; presumably this is what brings NerDLModel into scope — confirm.
from sparknlp.annotator import *
# Load every published name once via NerDLModel.pretrained from "clinical/models";
# the name is printed first so a failure can be traced to its model.
for p,n in names.items():
    print(n)
    NerDLModel.pretrained(n, "en", "clinical/models")

ner_cellular
ner_cellular download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_anatomy
ner_anatomy download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_deid_enriched
ner_deid_enriched download started this may take some time.
Approximate size to download 14.2 MB
[OK!]
ner_deid_large
ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_jsl
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_jsl_enriched
ner_jsl_enriched download started this may take some time.
Approximate size to download 14.1 MB
[OK!]
ner_posology_large
ner_posology_large download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_posology_small
ner_posology_small download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_risk_factors
ner_risk_factors download started this may take some time.
Approxim

In [11]:
def moveParam(json_data, origin, destination):
    """Rename parameter ``origin`` to ``destination`` in both parameter maps.

    The value is carried over unchanged, except that
    ``"embeddings_icd10cmo"`` is rewritten to ``"embeddings_icdoem"``.
    ``json_data`` is modified in place.

    Args:
        json_data: dict with ``"paramMap"`` and ``"defaultParamMap"`` dicts.
        origin: existing parameter name.
        destination: new parameter name.
    """
    def _remap(value):
        return "embeddings_icdoem" if value == "embeddings_icd10cmo" else value

    param_map = json_data["paramMap"]
    default_map = json_data["defaultParamMap"]
    in_params = origin in param_map
    in_defaults = origin in default_map

    if not (in_params or in_defaults):
        print(f"{origin} not found")
    if in_params:
        # pop-then-set stays correct even when origin == destination.
        param_map[destination] = _remap(param_map.pop(origin))
        print(f"Moved {origin} to {destination}")
    if in_defaults:
        # Remove `origin` itself; this branch used to delete the hard-coded
        # "embeddingsRef" key whatever parameter was being moved.
        default_map[destination] = _remap(default_map.pop(origin))
        
def deleteParam(json_data, param):
    """Remove ``param`` from ``paramMap`` and ``defaultParamMap`` where present.

    ``json_data`` is modified in place. A message is printed only when the
    parameter is removed from ``paramMap``.
    """
    explicit = json_data["paramMap"]
    defaults = json_data["defaultParamMap"]
    # Decide both memberships before mutating anything.
    was_explicit = param in explicit
    was_default = param in defaults

    if was_explicit:
        del explicit[param]
        print(f"Deleted {param}")
    if was_default:
        del defaults[param]
        
def addParam(json_data, param, value):
    """Set ``param`` to ``value`` in both ``paramMap`` and ``defaultParamMap``.

    ``json_data`` is modified in place; any existing value is overwritten.
    """
    # Look both maps up first so a missing one fails before anything is written.
    explicit, defaults = json_data["paramMap"], json_data["defaultParamMap"]
    for target in (explicit, defaults):
        target[param] = value
    print(f"Added {param}:{value}")

def update_json(json_data):
    """Rewrite a model's metadata dict in place and return it.

    Stamps the module-level ``timestamp``, renames the embeddings parameters
    to their storage equivalents and, for the NER and assertion model
    classes listed below, sets ``storageRef`` to ``"clinical"``.
    """
    # `timestamp` is read from module scope; it must be defined before calling.
    json_data["timestamp"] = timestamp
    model_class = json_data["class"]

    renames = (
        ("embeddingsRef", "storageRef"),
        ("includeEmbeddings", "includeStorage"),
    )
    for old_name, new_name in renames:
        moveParam(json_data, old_name, new_name)

    clinical_storage_classes = (
        "com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel",
        "com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLModel",
        "com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegModel",
    )
    if model_class in clinical_storage_classes:
        # Overrides whatever storageRef the rename above produced.
        addParam(json_data, "storageRef", "clinical")

    return json_data

In [9]:


# Template for one line of the models metadata file; the migration loop below
# overwrites "name" for each model before serialising it.
# NOTE(review): `date` is not defined in any cell visible here — it must be set before this cell runs.
metadata_dict = OrderedDict({
    "name":"textmatch_cpt_token",
    "language":"en",
    "libVersion":{"parts":[2,4,0]},
    "sparkVersion":{"parts":[2,4]},
    "readyToUse":True,
    "time":date,
    "isZipped":True,
    "category":"nd",
    "checksum":""
})

In [13]:
# Migrate every archive in `model_folder`: unzip it under its new versioned
# name, rewrite metadata/part-00000 through update_json, drop the checksum
# files, zip the result again and append one line to new_metadata.json.
# NOTE(review): `model_folder` and `model_list` are assigned in a later cell
# (In [25]); that cell has to run first.
with open(os.path.join(model_folder,"new_metadata.json"), "w") as new_metadata:
    for m in model_list:
        full_path = os.path.join(model_folder, m)
        name = m.split("_en")[0]
        # Assumes the file name ends with a 27-character
        # "<lib>_<spark>_<timestamp>.zip" tail, e.g. "2.0.2_2.4_1559672122511.zip".
        nm = m[:-27]+"2.4.0_2.4_1580237286004"
        print(m,"-->",nm)
        extract_path = os.path.join(model_folder, nm)
        with zipfile.ZipFile(full_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        # Context managers make sure the metadata file is flushed and closed
        # before the folder is archived.
        with open(extract_path+"/metadata/part-00000","r") as part_file:
            json_original = json.load(part_file)
        new_json = update_json(json_original)
        with open(extract_path+"/metadata/part-00000","w") as part_file:
            json.dump(new_json, part_file)
        # The checksum files no longer match the edited metadata; they may
        # also be absent, which is fine.
        for crc_name in (".part-00000.crc", "._SUCCESS.crc"):
            try:
                os.remove(extract_path+"/metadata/"+crc_name)
            except FileNotFoundError:
                pass
        shutil.make_archive(extract_path, 'zip', extract_path)
        metadata_dict["name"] = name
        new_metadata.write(json.dumps(metadata_dict).replace(" ",""))
        new_metadata.write("\n")
        print()
    

deidentify_rb_en_2.0.2_2.4_1559672122511.zip --> deidentify_rb_en_2.4.0_2.4_1580237286004
embeddingsRef not found

nerdl_tumour_demo_en_2.0.2_2.4_1558466102322.zip --> nerdl_tumour_demo_en_2.4.0_2.4_1580237286004
embeddingsRef not found
Added storageRef:clinical

assertion_dl_en_2.3.4_2.4_1574888344402.zip --> assertion_dl_en_2.4.0_2.4_1580237286004
embeddingsRef not found
Added storageRef:clinical

chunkresolve_icdo_icdoem_en_2.3.4_2.4_1574890700988.zip --> chunkresolve_icdo_icdoem_en_2.4.0_2.4_1580237286004
Moved embeddingsRef to storageRef
Deleted includeEmbeddings

people_disambiguator_en_2.3.4_2.4_1574806205059.zip --> people_disambiguator_en_2.4.0_2.4_1580237286004
embeddingsRef not found

deidentify_dl_en_2.0.2_2.4_1559669094458.zip --> deidentify_dl_en_2.4.0_2.4_1580237286004
embeddingsRef not found
Added storageRef:clinical

ner_bionlp_en_2.3.4_2.4_1574889731300.zip --> ner_bionlp_en_2.4.0_2.4_1580237286004
embeddingsRef not found
Added storageRef:clinical

context_spell_med_e

In [26]:
# Re-zip every resolver folder under `path_new` after dropping its Spark
# checksum files; entries whose name ends in "index" are skipped.
path_new = "/home/fernandrez/JSL/notebooks/MetistreamRxNorm/models/resolvers_24"
for f in os.listdir(path_new):
    if f.endswith("index"):
        continue
    for crc_file in ("metadata/.part-00000.crc", "metadata/._SUCCESS.crc"):
        # Best effort: a missing checksum file is not an error.
        try:
            os.remove(os.path.join(path_new, f, crc_file))
        except OSError:
            pass
    shutil.make_archive(os.path.join(path_new, f), 'zip', os.path.join(path_new, f))

In [25]:
import os
# Folder whose entries are migrated by the loops in this notebook;
# `model_list` holds every directory entry, files and folders alike.
model_folder = "/home/fernandrez/JSL/model_migration_240/a"
#model_folder = "/home/fernandrez/JSL/notebooks/MetistreamSNOMED/models/resolvers_24"
model_list = os.listdir(model_folder)

In [26]:
# Same migration as for the archives, applied to already-unzipped model
# folders: rewrite the metadata in place, drop the checksum files, then zip.
for m in model_list:
    if not m.endswith("index") and not m.endswith("zip"):
        full_path = os.path.join(model_folder, m)
        # Close the metadata file explicitly so it is flushed before archiving.
        with open(full_path+"/metadata/part-00000","r") as part_file:
            json_original = json.load(part_file)
        new_json = update_json(json_original)
        with open(full_path+"/metadata/part-00000","w") as part_file:
            json.dump(new_json, part_file)
        # Only ignore filesystem errors (e.g. file already gone); a bare
        # `except:` would also swallow KeyboardInterrupt and real bugs.
        try:
            os.remove(full_path+"/metadata/.part-00000.crc")
        except OSError:
            pass
        try:
            os.remove(full_path+"/metadata/._SUCCESS.crc")
        except OSError:
            pass
        shutil.make_archive(full_path, 'zip', full_path)
        print()

embeddingsRef not found
includeEmbeddings not found
Added storageRef:clinical

