In [1]:
from jinja2 import Environment, PackageLoader, select_autoescape, meta
import pandas as pd, boto3, re, pytz
from nltk.tokenize.regexp import RegexpTokenizer
from datetime import datetime
from tabulate import tabulate
# Notebook display setting: let pandas render up to 1000 rows of a frame
# in cell output before it starts truncating.
pd.set_option("display.max_rows",1000)

In [2]:
# Tokenizer built on the regular expression \w+ (runs of word characters).
# NOTE(review): `tknzr` is not referenced in the cells visible here — confirm
# it is still needed before keeping the nltk dependency for it.
tknzr = RegexpTokenizer(r"\w+")

In [3]:
# S3 client and the bucket whose objects are listed by get_s3_metadata().
# NOTE(review): no credentials are passed here, so boto3 presumably picks them
# up from the environment / AWS config — confirm for the machine running this.
s3 = boto3.client('s3')
bucket = 'auxdata.johnsnowlabs.com'
# Console and s3:// URL prefixes derived from the bucket name.
# NOTE(review): neither is referenced in the cells visible here — confirm they
# are consumed elsewhere (e.g. by the template).
bucket_url = f"https://s3.console.aws.amazon.com/s3/object/{bucket}"
download_url = f"s3://{bucket}/"

In [4]:
# Language codes that can appear, wrapped in underscores, inside an archive
# file name.
langs = ["en","es","de","xx"]
# Regex alternation inside one capturing group, i.e. "(_en_|_es_|_de_|_xx_)".
# The capturing group makes re.split() keep the matched language token in its
# result instead of discarding it.
splitter = "({})".format("|".join("_" + code + "_" for code in langs))

def get_s3_metadata(prefix="clinical/models"):
    """List every object stored under ``prefix`` in the module-level ``bucket``.

    A single ``list_objects_v2`` request returns at most 1000 keys, so the
    listing is walked with the client's paginator and all pages are merged;
    otherwise archives beyond the first page would be silently ignored.

    Args:
        prefix: Key prefix to list.

    Returns:
        dict: A response-shaped dictionary based on the first page, in which
        ``"Contents"`` holds the object entries of *all* pages (an empty list
        when nothing matches the prefix) and ``"KeyCount"`` /
        ``"IsTruncated"`` describe the merged listing. Keys are returned
        URL-encoded because ``EncodingType='url'`` is requested.
    """
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, EncodingType='url', Prefix=prefix)

    merged = None
    for page in pages:
        if merged is None:
            # Keep the first page's metadata (Name, Prefix, ...) as the base.
            merged = dict(page)
            merged["Contents"] = list(page.get("Contents", []))
        else:
            merged["Contents"].extend(page.get("Contents", []))

    if merged is None:
        merged = {"Contents": []}

    # The merged result is complete, so the per-page paging fields no longer
    # apply.
    merged["KeyCount"] = len(merged["Contents"])
    merged["IsTruncated"] = False
    merged.pop("NextContinuationToken", None)
    return merged

def filter_desired_names(content):
    """Select the keys of the zip archives that should be processed.

    Args:
        content: Iterable of mappings, each carrying a ``"Key"`` string.

    Returns:
        list: The ``"Key"`` values, in input order, that end in ``zip`` and
        contain none of the excluded substrings listed below.
    """
    excluded_fragments = (
        "2ng",
        "icdoem",
        "snomed_l",
        "rxnorm_l",
        "/resolve",
        "ensembleresolve",
        "noncontrib",
        "embeddings_icd10_base",
        "icdem",
        "demo",
        "_n2c",
        "people_disambiguator",
    )

    selected = []
    for entry in content:
        key = entry["Key"]
        if not key.endswith("zip"):
            continue
        if any(fragment in key for fragment in excluded_fragments):
            continue
        selected.append(key)
    return selected

def split_stuff(x):
    """Break S3 keys into their naming components.

    Args:
        x: Iterable of key strings.

    Returns:
        list: One tuple per key, made of the first two "/"-separated path
        segments joined back with "/", the part of the file name before the
        language token, the language token with its underscores removed, and
        then the "_"-separated pieces of the rest of the file name with
        ".zip" removed.

    Raises:
        ValueError: If splitting a file name on the module-level ``splitter``
            pattern does not yield exactly three pieces.
    """
    rows = []
    for key in x:
        segments = key.split("/")
        repo = "/".join(segments[:2])
        # splitter has a capturing group, so the language token is kept.
        name, lang_token, remainder = re.split(splitter, segments[-1])
        lang = lang_token.replace("_", "")
        trailing_fields = remainder.replace(".zip", "").split("_")
        rows.append((repo, name, lang, *trailing_fields))
    return rows

def aggregate_stuff(x):
    """Collapse per-archive rows into one row per (repo, name, language).

    Args:
        x: Row data accepted by ``pd.DataFrame`` with six fields per row:
            model_repo, model_name, model_lang, compatibility, spark_version
            and ts, where ts is an epoch timestamp in milliseconds (string
            or integer).

    Returns:
        pandas.DataFrame: Columns model_repo, model_name, model_lang,
        compatibility (smallest value in the group, compared as a string)
        and latest_date (largest UTC "YYYY-MM-DD" date in the group). Only
        groups whose compatibility starts with "2.4", "2.5" or "2.6" are
        kept; the index is reset.
    """
    frame = pd.DataFrame(
        x,
        columns=["model_repo","model_name","model_lang","compatibility","spark_version","ts"],
    )
    # Vectorised epoch-milliseconds -> UTC calendar date.
    frame["latest_date"] = (
        pd.to_datetime(frame["ts"].astype("int64"), unit="ms", utc=True)
        .dt.strftime("%Y-%m-%d")
    )
    # String aggregation names are used instead of the builtins min/max:
    # passing the builtins is deprecated and warns on recent pandas.
    # NOTE(review): "min" on compatibility is a lexicographic string compare,
    # so e.g. "2.10" would sort before "2.4" — confirm this is acceptable.
    grouped = (
        frame.groupby(["model_repo","model_name","model_lang"])
        .agg({"compatibility": "min", "latest_date": "max"})
        .reset_index()
    )
    compat = grouped["compatibility"].str
    supported = compat.startswith("2.4") | compat.startswith("2.5") | compat.startswith("2.6")
    return grouped[supported].reset_index(drop=True)

def get_clean_metadata(prefix="clinical/models"):
    """Run the full S3 pipeline: list, filter, split, aggregate.

    Args:
        prefix: Key prefix handed to ``get_s3_metadata``.

    Returns:
        The frame produced by ``aggregate_stuff`` for the keys under
        ``prefix`` that pass ``filter_desired_names``.
    """
    listing = get_s3_metadata(prefix)
    wanted_keys = filter_desired_names(listing["Contents"])
    key_parts = split_stuff(wanted_keys)
    return aggregate_stuff(key_parts)

In [5]:
# Aggregated S3 listing for the default "clinical/models" prefix; this cell
# calls S3, so it needs network access and valid credentials.
s3_meta = get_clean_metadata()

In [6]:
# Metadata CSVs shipped with docs_module; missing cells are turned into empty
# strings.
cls_meta = pd.read_csv("docs_module/metadata/class_metadata_licensed.csv").fillna("")
md_meta  = pd.read_csv("docs_module/metadata/model_metadata_licensed.csv").fillna("")
# Inner joins: model metadata with class metadata on model_class, then with
# the S3 listing on (model_repo, model_name, model_lang). Rows without a match
# on either side are dropped.
full_meta = (
    md_meta
    .merge(cls_meta, on="model_class")
    .merge(s3_meta, on=["model_repo","model_name","model_lang"])
)

In [16]:
def tabulate_row(x):
    """Format ``x`` as a table in tabulate's "github" style.

    Args:
        x: Data accepted by ``pd.DataFrame``; in this notebook it is one row
            of ``full_meta`` passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: The table text produced by ``tabulate``.
    """
    as_frame = pd.DataFrame(x)
    return tabulate(as_frame, tablefmt="github")
# One rendered table per model, built from the columns listed below.
full_meta["table"] = full_meta[
    ["model_name","model_class","compatibility","license","edition","inputs","output","model_lang"]
].apply(tabulate_row, axis=1)

In [17]:
# Models found on S3 whose name has no entry in the model metadata CSV.
s3_meta.loc[~s3_meta["model_name"].isin(md_meta["model_name"])]

Unnamed: 0,model_repo,model_name,model_lang,compatibility,latest_date
0,clinical/models,assertion_dl_healthcare,en,2.6.0,2020-09-23
2,clinical/models,assertion_i2b2,en,2.4.2,2020-05-07
9,clinical/models,chunkresolve_ICD10GM,de,2.5.5,2020-09-06
10,clinical/models,chunkresolve_athena_conditions_healthcare,en,2.6.0,2020-09-16
17,clinical/models,chunkresolve_icd10cm_poison_ext_clinical,en,2.4.5,2020-04-28
25,clinical/models,chunkresolve_rxnorm_xsmall_clinical,en,2.5.2,2020-06-24
27,clinical/models,classifierdl_biobert_ade,en,2.6.0,2020-09-15
28,clinical/models,clinical_analysis,en,2.4.0,2020-02-01
29,clinical/models,clinical_deidentification,en,2.4.0,2020-01-31
30,clinical/models,clinical_ner_assertion,en,2.4.0,2020-01-31


In [18]:
# Jinja2 environment that loads templates from the `templates` directory of
# the `docs_module` package. select_autoescape turns HTML escaping on only for
# templates whose names end in the listed extensions (html, xml).
env = Environment(
    loader=PackageLoader('docs_module', 'templates'),
    autoescape=select_autoescape(['html', 'xml'])
)

In [19]:
# Raw text of the model.md template (first item of the loader's result),
# parsed into a Jinja2 AST.
mdsrc = env.loader.get_source(env, 'model.md')[0]
parsed_content = env.parse(mdsrc)
# Debugging aid, left disabled: uncomment to list the variable names the
# template expects to receive from the render context.
#meta.find_undeclared_variables(parsed_content)

In [20]:
# Compiled model.md template; rendered once per model row further down.
mdmd = env.get_template("model.md")

In [21]:
def generate_code(x, scala=False):
    """Build a ``pretrained(...)`` usage snippet for one model.

    Args:
        x: Row-like object exposing ``model_class``, ``model_name``,
            ``model_lang``, ``model_repo``, ``inputs`` (comma-separated
            column names) and ``output`` as attributes.
        scala: When true, produce the Scala variant: a ``val`` prefix and
            chained calls on following lines. When false, produce Python,
            where every continued line ends with a backslash.

    Returns:
        str: The snippet, with chained calls on tab-indented lines. Blank
        entries in ``inputs`` are skipped, so an empty ``inputs`` value gives
        ``.setInputCols()``.
    """
    ins = ",".join(f'"{name.strip()}"' for name in x.inputs.split(",") if name.strip())
    calls = [
        f'model = {x.model_class}.pretrained("{x.model_name}","{x.model_lang}","{x.model_repo}")',
        f'.setInputCols({ins})',
        f'.setOutputCol("{x.output}")',
    ]
    # Scala lets an expression continue on a line that starts with ".";
    # Python does not, so the Python sample needs explicit backslash
    # continuations to be valid code.
    joiner = "\n\t" if scala else " \\\n\t"
    c = joiner.join(calls)
    return "val "+c if scala else c


# Derived columns: a "<class description>: <model name>" title and the
# Python / Scala usage snippets produced by generate_code for each row.
full_meta["model_title_seo"] = full_meta["class_description"] + ": " + full_meta["model_name"]
full_meta["python_sample"] = full_meta.apply(generate_code, axis=1)
full_meta["scala_sample"] = full_meta.apply(generate_code, axis=1, scala=True)

In [22]:
# Render one Markdown file per model row into docs_module/output/. Every
# column of the row is passed to the template as a variable.
# NOTE(review): the output directory must already exist; open() does not
# create it.
for i, r in full_meta.iterrows():
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding when the metadata contains non-ASCII characters.
    with open(f"docs_module/output/{r.latest_date.replace('/','')}_{r.model_class}_{r.model_name}_{r.model_lang}.md","w",encoding="utf-8") as f:
        f.write(mdmd.render(**r))