In [1]:
from docs_module import langs
import pandas as pd, boto3, re, pytz
from datetime import datetime
from tabulate import tabulate
# Let pandas display up to 1000 rows so long metadata tables are shown in full.
pd.options.display.max_rows = 1000

### First, download the real status from S3

In [2]:
# Bucket that is listed by get_s3_metadata() below.
bucket = 'auxdata.johnsnowlabs.com'

# Low-level S3 client used for the listing calls.
s3 = boto3.client('s3')

# Base URLs derived from the bucket name: the AWS console object view and the
# s3:// URI root.
bucket_url = "https://s3.console.aws.amazon.com/s3/object/{}".format(bucket)
download_url = "s3://{}/".format(bucket)

In [3]:
# Keys of the docs module's language table; used below to build the regex that
# splits model file names at their language code.
# NOTE(review): assumes langs.lang_codes is a mapping (or an iterable of
# (code, label) pairs) keyed by language code — confirm in docs_module.
langs_arr = dict(langs.lang_codes).keys()

In [9]:
# Regex alternation with a single capture group that matches any known language
# code wrapped in underscores ("_<code>_"). Because the group captures,
# re.split() keeps the matched separator in its result, which split_stuff()
# relies on to recover the language. Each code is passed through re.escape so
# that a regex metacharacter inside a code is matched literally instead of
# changing the pattern.
splitter = "(" + "|".join(f"_{re.escape(str(code))}_" for code in langs_arr) + ")"

def get_s3_metadata(prefix="clinical/models"):
    """List every object under ``prefix`` in the module-level ``bucket``.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the listing
    is fetched page by page and merged into one response-shaped dict. Callers
    that read ``response["Contents"]`` keep working and now see all objects.

    Parameters
    ----------
    prefix : str
        Key prefix to list.

    Returns
    -------
    dict
        A copy of the first page's response in which ``"Contents"`` holds the
        objects of all pages (an empty list when nothing matches),
        ``"KeyCount"`` is their total and ``"IsTruncated"`` is ``False``.
    """
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, EncodingType='url', Prefix=prefix)

    merged = None
    for page in pages:
        if merged is None:
            # Keep the first page's top-level fields (Name, Prefix, ...) so the
            # result still looks like a list_objects_v2 response.
            merged = dict(page)
            merged["Contents"] = []
        # S3 omits "Contents" entirely on a page without objects.
        merged["Contents"].extend(page.get("Contents", []))

    if merged is None:
        # Defensive: the paginator produced no page at all.
        merged = {"Contents": []}

    # The merged listing is complete, so drop the per-page continuation state.
    merged["KeyCount"] = len(merged["Contents"])
    merged["IsTruncated"] = False
    merged.pop("NextContinuationToken", None)
    return merged

def filter_desired_names(content):
    """Select the object keys that should be kept from an S3 listing.

    A key is kept when all of the following hold:

    * it ends with ``zip``;
    * it contains none of the excluded substrings listed below;
    * it contains at least one of ``"2.4."``, ``"2.5."`` or ``"2.6."``.

    Parameters
    ----------
    content : iterable of dict
        Listing entries; each must have a ``"Key"`` string.

    Returns
    -------
    list of str
        The kept keys, in their original order.
    """
    excluded_fragments = (
        "2ng",
        "icdoem",
        "snomed_l",
        "rxnorm_l",
        "/resolve",
        "ensembleresolve",
        "noncontrib",
        "embeddings_icd10_base",
        "icdem",
        "demo",
        "stopwords",
        "_use_xling",
    )
    accepted_versions = ("2.4.", "2.5.", "2.6.")

    kept = []
    for entry in content:
        key = entry["Key"]
        if not key.endswith("zip"):
            continue
        if any(fragment in key for fragment in excluded_fragments):
            continue
        if not any(version in key for version in accepted_versions):
            continue
        kept.append(key)
    return kept

def split_stuff(x):
    """Break S3 object keys into their metadata fields.

    For each key the result holds one tuple made of:

    * the first two ``/``-separated path components joined back with ``/``;
    * the part of the file name before the language separator matched by the
      module-level ``splitter`` regex;
    * the matched separator with its underscores removed;
    * the rest of the file name with ``.zip`` removed, split on ``_`` (one
      tuple element per piece).

    Parameters
    ----------
    x : iterable of str
        Object keys.

    Returns
    -------
    list of tuple

    Raises
    ------
    ValueError
        If a file name does not split into exactly three pieces (text before
        the separator, separator, text after it).
    """
    rows = []
    for key in x:
        path_parts = key.split("/")
        repo = "/".join(path_parts[:2])
        # splitter has a capture group, so the separator itself is returned.
        name, separator, remainder = re.split(splitter, path_parts[-1])
        trailing_fields = remainder.replace(".zip", "").split("_")
        rows.append((repo, name, separator.replace("_", ""), *trailing_fields))
    return rows

def aggregate_stuff(x):
    """Collapse parsed rows into one row per (repo, name, language).

    Parameters
    ----------
    x : iterable of 6-tuples
        ``(repo, name, language, compatibility, spark_version, ts)`` where
        ``ts`` is an epoch timestamp in milliseconds (string or int).

    Returns
    -------
    pandas.DataFrame
        Columns ``repo, name, language, compatibility, spark_version,
        latest_date, ts``. Per group it keeps the minimum ``compatibility`` and
        the maximum ``spark_version``, ``latest_date`` (UTC, ``YYYY-MM-DD``)
        and ``ts``. Only groups whose ``compatibility`` starts with ``2.4``,
        ``2.5`` or ``2.6`` are returned.

    Raises
    ------
    ValueError
        If any row does not have exactly six fields. The message shows the
        first offending rows.
    """
    columns = ["repo", "name", "language", "compatibility", "spark_version", "ts"]
    rows = list(x)

    # Fail with the offending rows in the message instead of printing them and
    # then letting the DataFrame constructor raise a less helpful error.
    malformed = [row for row in rows if len(row) != len(columns)]
    if malformed:
        raise ValueError(
            f"expected {len(columns)} fields per row, "
            f"got {len(malformed)} malformed row(s), e.g. {malformed[:5]}"
        )

    frame = pd.DataFrame(rows, columns=columns)

    # ts is in milliseconds since the epoch; derive the UTC calendar date.
    frame["latest_date"] = (
        pd.to_datetime(frame["ts"].astype("int64"), unit="ms", utc=True)
        .dt.strftime("%Y-%m-%d")
    )

    # String aliases select the groupby reductions explicitly (recent pandas
    # warns when the builtins min/max are passed instead).
    # NOTE(review): when the fields are strings these reductions compare
    # lexicographically, so e.g. "2.4.10" sorts before "2.4.2" — confirm that
    # this is acceptable for the versions in use.
    frame = (
        frame.groupby(["repo", "name", "language"])
        .agg({
            "compatibility": "min",
            "spark_version": "max",
            "latest_date": "max",
            "ts": "max",
        })
        .reset_index()
    )

    supported_prefixes = ["2.4", "2.5", "2.6"]
    is_supported = frame["compatibility"].str[:3].isin(supported_prefixes)
    return frame[is_supported].reset_index(drop=True)

def get_clean_metadata(prefix="clinical/models"):
    """Run the whole pipeline for one S3 prefix: list, filter, split, aggregate.

    Parameters
    ----------
    prefix : str
        Key prefix passed to ``get_s3_metadata``.

    Returns
    -------
    The value returned by ``aggregate_stuff`` for the kept keys.
    """
    listing = get_s3_metadata(prefix)
    wanted_keys = filter_desired_names(listing["Contents"])
    parsed_rows = split_stuff(wanted_keys)
    return aggregate_stuff(parsed_rows)

In [10]:
# Build the cleaned metadata for the public models prefix.
# NOTE(review): the next cell repeats this same call and stores it as s3_meta.
x = get_clean_metadata(prefix="public/models")

In [11]:
s3_meta = get_clean_metadata(prefix="public/models")

# Rebuild an archive file name from the aggregated fields:
#   <name>_<language>_<compatibility>_<spark_version>_<ts>.zip
# NOTE(review): compatibility is a per-group minimum while spark_version and ts
# are per-group maxima, so this name may combine fields of different archives.
s3_meta["file"] = (
    s3_meta["name"]
    + "_" + s3_meta["language"]
    + "_" + s3_meta["compatibility"]
    + "_" + s3_meta["spark_version"]
    + "_" + s3_meta["ts"]
    + ".zip"
)

In [12]:
# Persist the S3-derived metadata; the row index is written as the first column.
s3_meta.to_csv("models_metadata.csv", index=True)

### Then load the manually curated CSVs
- Class metadata (curated manually by Andres and Christian)
- Models metadata (curated manually by Andres and Christian)
- Metadata parsed from the existing MD (Markdown) files in notebook #1

In [13]:
# Models present in S3 whose name does not appear in md_meta.
# NOTE(review): md_meta is not defined in any visible cell of this notebook —
# it must be loaded before this cell for a fresh Run All to work.
s3_meta.loc[~s3_meta["name"].isin(md_meta.name)]

In [11]:
# Load both curated CSVs the same way, replacing missing values with "".
cls_meta, ex_meta = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in (
        "docs_module/metadata/class_metadata_all.csv",
        "docs_module/metadata/model_metadata_existing.csv",
    )
)

In [None]:
# Attach class-level metadata to each model row via model_class (inner join, so
# rows without a matching class are dropped); the shapes make that visible.
mdcls_meta = pd.merge(left=md_meta, right=cls_meta, how="inner", on="model_class")
print(*(frame.shape for frame in (md_meta, cls_meta, mdcls_meta)))

In [None]:
# Keep every S3 row and add the curated metadata where (repo, name, language)
# matches; unmatched S3 rows get missing values in the curated columns.
mds3_meta = pd.merge(
    left=s3_meta,
    right=mdcls_meta,
    how="left",
    on=["repo", "name", "language"],
)
print(*(frame.shape for frame in (s3_meta, mdcls_meta, mds3_meta)))

In [None]:
# Stack the S3-derived rows on top of the already existing model metadata,
# keeping the column order as encountered (sort=False).
full_meta = pd.concat((mds3_meta, ex_meta), axis="index", sort=False)
print(*(frame.shape for frame in (mds3_meta, ex_meta, full_meta)))

In [9]:
# Columns that exist in ex_meta but not in md_meta.
set(ex_meta.columns) - set(md_meta.columns)

{'case_sensitive',
 'colab_url',
 'demo_url',
 'dimension',
 'download_url',
 'file',
 'first_version',
 'included_models',
 'inputs',
 'latest_date',
 'output',
 'type'}

In [8]:
# Columns that exist in md_meta but not in ex_meta.
set(md_meta.columns) - set(ex_meta.columns)

{'model_class', 'reference_url', 'repo', 'upstream_deps'}

In [20]:
def tabulate_row(x):
    """Render one record as a GitHub-style table, skipping missing fields.

    Parameters
    ----------
    x
        Data accepted by ``pd.DataFrame``; here, one row of the metadata frame.

    Returns
    -------
    str
        The table produced by ``tabulate`` with ``tablefmt="github"``.
    """
    populated_fields = pd.DataFrame(x).dropna()
    return tabulate(populated_fields, tablefmt="github")
# Render the descriptive fields of every model as a per-row table string.
full_meta["table"] = (
    full_meta[[
        "name",
        "model_class",
        "compatibility",
        "license",
        "edition",
        "inputs",
        "output",
        "language",
        "dimension",
        "case_sensitive",
    ]]
    .apply(tabulate_row, axis=1)
)