In [1]:
import pandas as pd, boto3, re, pytz
from datetime import datetime
from tabulate import tabulate
# Let pandas display up to 1000 rows before truncating a DataFrame's output.
pd.options.display.max_rows = 1000

### First, download the real status from S3

In [2]:
# S3 client used by the listing helper, and the bucket it lists objects from.
s3 = boto3.client('s3')
bucket = 'auxdata.johnsnowlabs.com'
# AWS console URL and s3:// URI prefixes derived from the bucket name.
bucket_url = "https://s3.console.aws.amazon.com/s3/object/" + bucket
download_url = "s3://" + bucket + "/"

In [3]:
# Language codes recognised inside a model file name (as "_<code>_").
langs = ["en","es","de","xx"]
# Regex alternation "(_en_|_es_|...)"; the capturing group makes re.split
# return the matched language tag alongside the text before and after it.
splitter = "({})".format("|".join("_" + code + "_" for code in langs))

def get_s3_metadata(prefix="clinical/models"):
    """List every object under ``prefix`` in the module-level ``bucket``.

    ``list_objects_v2`` returns a single page per call (S3 caps a page at
    1000 keys), so this follows ``NextContinuationToken`` until the listing
    is no longer truncated and merges all pages into one response.

    Parameters
    ----------
    prefix : str
        Key prefix to list.

    Returns
    -------
    dict
        The first ``list_objects_v2`` response, with ``"Contents"`` replaced
        by the concatenation of the ``"Contents"`` of all pages (an empty
        list when nothing matches), ``"KeyCount"`` set to the total number
        of entries and ``"IsTruncated"`` set to ``False``.
    """
    # EncodingType='url' asks S3 to URL-encode the keys in the response.
    request = dict(Bucket=bucket, EncodingType='url', Prefix=prefix)

    response = s3.list_objects_v2(**request)
    contents = list(response.get("Contents", []))

    page = response
    while page.get("IsTruncated"):
        page = s3.list_objects_v2(
            ContinuationToken=page["NextContinuationToken"], **request)
        contents.extend(page.get("Contents", []))

    # Present the merged listing as if it had come back in a single page.
    response["Contents"] = contents
    response["KeyCount"] = len(contents)
    response["IsTruncated"] = False
    response.pop("NextContinuationToken", None)
    return response

def filter_desired_names(content):
    """Return the ``"Key"`` of every listing entry that should be kept.

    A key is kept when either

    * it contains ``"pos_clinical"`` or ``"deidentify_rb"``, or
    * it ends in ``"zip"``, contains none of the excluded fragments and
      contains one of the version fragments ``"2.4."``, ``"2.5."``, ``"2.6."``.

    Parameters
    ----------
    content : iterable of dict
        Entries with a ``"Key"`` string.

    Returns
    -------
    list of str
        The kept keys, in input order.
    """
    excluded_fragments = (
        "2ng",
        "icdoem",
        "snomed_l",
        "rxnorm_l",
        "/resolve",
        "ensembleresolve",
        "noncontrib",
        "embeddings_icd10_base",
        "icdem",
        "demo",
        "_n2c",
        "people_disambiguator",
    )
    version_fragments = ("2.4.", "2.5.", "2.6.")
    always_kept = ("pos_clinical", "deidentify_rb")

    def wanted(key):
        # NOTE(review): these two names bypass every other check, including
        # the "zip" suffix test -- confirm that is intended.
        if any(fragment in key for fragment in always_kept):
            return True
        if key[-3:] != "zip":
            return False
        if any(fragment in key for fragment in excluded_fragments):
            return False
        return any(fragment in key for fragment in version_fragments)

    return [entry["Key"] for entry in content if wanted(entry["Key"])]

def split_stuff(x):
    """Break S3 keys into their path and file-name components.

    For each key the first two ``/``-separated path parts form the repo, and
    the last part (the file name) is split on the module-level ``splitter``
    regex into the text before the language tag, the tag itself and the
    remainder. The tag loses its underscores; the remainder loses ``".zip"``
    and is split on ``"_"``.

    Parameters
    ----------
    x : iterable of str
        S3 keys.

    Returns
    -------
    list of tuple
        ``(repo, name, language, *remainder_parts)`` per key.

    Raises
    ------
    ValueError
        If a file name does not split into exactly three pieces.
    """
    rows = []
    for key in x:
        path_parts = key.split("/")
        repo = "/".join(path_parts[:2])
        name, lang_tag, remainder = re.split(splitter, path_parts[-1])
        trailing_fields = remainder.replace(".zip", "").split("_")
        rows.append((repo, name, lang_tag.replace("_", ""), *trailing_fields))
    return rows

def aggregate_stuff(x):
    """Collapse per-file rows into one row per (repo, name, language).

    Parameters
    ----------
    x : iterable of 6-tuples
        ``(repo, name, language, compatibility, spark_version, ts)`` where
        ``ts`` is a string of epoch milliseconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``repo, name, language, compatibility, latest_version,
        spark_version, latest_date, ts``. Per group: ``compatibility`` is the
        lowest version, ``latest_version`` and ``spark_version`` the highest,
        ``latest_date`` the latest UTC date (``YYYY-MM-DD``) and ``ts`` the
        largest timestamp. All values stay strings.

    Notes
    -----
    Versions are compared component-wise as numbers, so ``"2.4.10"`` ranks
    above ``"2.4.2"`` (plain string comparison gets this backwards), and
    ``ts`` is compared as an integer rather than as text.
    """
    def _version_key(version):
        # Numeric components sort numerically and before non-numeric ones.
        return tuple(
            (0, int(part), "") if part.isdigit() else (1, 0, part)
            for part in str(version).split(".")
        )

    def _lowest_version(values):
        return min(values, key=_version_key)

    def _highest_version(values):
        return max(values, key=_version_key)

    def _largest_ts(values):
        return max(values, key=int)

    out_columns = ["repo", "name", "language", "compatibility",
                   "latest_version", "spark_version", "latest_date", "ts"]

    frame = pd.DataFrame(
        x, columns=["repo", "name", "language", "compatibility", "spark_version", "ts"])
    if frame.empty:
        # Nothing to aggregate; return the expected schema with no rows.
        return frame.reindex(columns=out_columns)

    frame["latest_version"] = frame["compatibility"]
    # ts is epoch milliseconds; keep only the UTC calendar date.
    frame["latest_date"] = (
        pd.to_datetime(frame["ts"].astype("int64"), unit="ms", utc=True)
        .dt.strftime("%Y-%m-%d")
    )

    return (
        frame.groupby(["repo", "name", "language"])
        .agg({
            "compatibility": _lowest_version,
            "latest_version": _highest_version,
            "spark_version": _highest_version,
            "latest_date": "max",  # ISO dates order correctly as strings
            "ts": _largest_ts,
        })
        .reset_index()
    )

def get_clean_metadata(prefix="clinical/models"):
    """Run the full S3 metadata pipeline for ``prefix``.

    Lists the objects under ``prefix``, keeps the wanted keys from the
    listing's ``"Contents"``, splits those keys into fields and aggregates
    the result.

    Parameters
    ----------
    prefix : str
        Key prefix passed to ``get_s3_metadata``.

    Returns
    -------
    Whatever ``aggregate_stuff`` returns for the split keys.
    """
    listing = get_s3_metadata(prefix)
    wanted_keys = filter_desired_names(listing["Contents"])
    return aggregate_stuff(split_stuff(wanted_keys))

In [25]:
# Aggregated S3 listing, one row per (repo, name, language).
s3_meta = get_clean_metadata()
# File name rebuilt from the aggregated fields:
# <name>_<language>_<latest_version>_<spark_version>_<ts>.zip
s3_meta["file"] = (
    s3_meta["name"]
    + "_" + s3_meta["language"]
    + "_" + s3_meta["latest_version"]
    + "_" + s3_meta["spark_version"]
    + "_" + s3_meta["ts"]
    + ".zip"
)
# Tag these rows as coming from S3.
s3_meta["dataset"] = "s3"

### Then load the manually curated CSV files
- Class metadata (manually curated by Andres and Christian)
- Model metadata (manually curated by Andres and Christian)
- Existing-model metadata (parsed from the existing MDs in notebook #1)

In [5]:
# Load the three metadata CSVs, replacing missing values with empty strings.
cls_meta = pd.read_csv("docs_module/metadata/class_metadata_all.csv").fillna("")
md_meta = pd.read_csv("docs_module/metadata/model_metadata_licensed.csv").fillna("")
ex_meta = pd.read_csv("docs_module/metadata/model_metadata_existing.csv").fillna("")
# Keep only the existing-model rows whose license is exactly "Licensed".
ex_meta = ex_meta.loc[ex_meta["license"] == "Licensed"]

In [6]:
# Attach each model's class inputs/output via an inner join on model_class.
mdcls_meta = md_meta.merge(
    cls_meta[["model_class", "inputs", "output"]],
    on="model_class",
)
# Shapes before and after, to spot rows lost or duplicated by the join.
print(md_meta.shape, cls_meta.shape, mdcls_meta.shape)

(65, 12) (36, 25) (65, 14)


In [7]:
# Outer join keeps models present in only one of the S3 listing / curated metadata.
mds3_meta = s3_meta.merge(
    mdcls_meta,
    on=["repo", "name", "language"],
    how="outer",
)
# Shapes before and after, to check the effect of the join.
print(s3_meta.shape, mdcls_meta.shape, mds3_meta.shape)

(100, 9) (65, 14) (100, 20)


In [8]:
# Join keys plus the ex_meta columns that mds3_meta does not already have.
# The extra columns are taken in ex_meta's own order: a set difference has no
# defined order, so the column order of full_meta (and of any file written
# from it) could otherwise change from one run to the next.
ex_fields = ["name","language"] + [
    column for column in ex_meta.columns if column not in mds3_meta.columns
]
# Outer join keeps rows that exist on only one side.
full_meta = pd.merge(mds3_meta, ex_meta[ex_fields], on=["name","language"], how="outer")
# Shapes before and after, to check the effect of the join.
print(mds3_meta.shape, ex_meta.shape, full_meta.shape)

(100, 20) (25, 25) (100, 29)


In [9]:
# Default the license and repo of rows that have none.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the attribute: that form mutates an intermediate Series, which pandas
# deprecates and which leaves full_meta unchanged under Copy-on-Write.
full_meta["license"] = full_meta["license"].fillna("Licensed")
full_meta["repo"] = full_meta["repo"].fillna("clinical/models")

In [10]:
# Cast both columns to str (missing values become the text "nan").
full_meta["ts"] = full_meta["ts"].astype(str)
full_meta["spark_version"] = full_meta["spark_version"].astype(str)

In [11]:
def tabulate_row(x):
    """Render one row as a GitHub-style table.

    ``x`` is turned into a DataFrame, rows with missing values are dropped,
    and the result is passed to ``tabulate`` with ``tablefmt="github"``.

    Parameters
    ----------
    x : pandas.Series or other DataFrame-constructible data
        The row to render.

    Returns
    -------
    The value returned by ``tabulate``.
    """
    present_fields = pd.DataFrame(x).dropna()
    return tabulate(present_fields, tablefmt="github")
# Build one rendered table per model from the selected fields, row by row.
full_meta["table"] = (
    full_meta[[
        "name",
        "model_class",
        "compatibility",
        "license",
        "edition",
        "inputs",
        "output",
        "language",
        "dimension",
        "case_sensitive",
    ]]
    .apply(tabulate_row, axis=1)
)

In [12]:
# Write the combined metadata without the rendered "table" column or the index.
full_meta.drop(columns="table").to_csv(
    "docs_module/metadata/models_metadata_all.csv",
    index=False,
)