In [1]:
import pandas as pd, boto3, re, os
# Raise pandas' display limits so long tables and long cell values are shown in full.
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, 1000)

In [2]:
# Directory containing the posts to parse, relative to this notebook.
root = "../../spark-nlp/docs/_posts/"
# os.listdir returns entries in arbitrary order; sort them so the parsed rows
# (and everything derived from them) come out in a reproducible order.
files = sorted(os.listdir(root))

### Parse posts

In [3]:
# Parse every post into one flat record (dict) per file.
#
# Each file is scanned line by line with a small state machine:
#   * a line starting with "---" toggles the front-matter state; inside it,
#     "key: value" lines become record fields;
#   * a line starting with "{:.btn-box}" switches to the "links" section;
#   * a line starting with "##" switches to a section named after the whole
#     (stripped) heading line;
#   * any other line is accumulated, newline-joined, under the current state.
data = []
for fp in files:
    path = os.path.join(root, fp)
    if not os.path.isfile(path):
        # os.listdir also yields sub-directories, which open() cannot read.
        continue
    state = ""
    record = {"file": fp}
    # Decode explicitly as UTF-8 (assumed encoding of the posts) so the result
    # does not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("---"):
                # Front-matter delimiter: first occurrence opens, second closes.
                state = "" if state == "---" else "---"
            elif line.startswith("{:.btn-box}"):
                state = "links"
            elif line.startswith("##"):
                state = line.replace("\n", "").strip()
            elif state == "---":
                # Split on the FIRST colon only, so values that themselves
                # contain ":" are kept instead of being dropped.
                key, sep, value = line.partition(":")
                if sep:
                    record[key.strip()] = value.strip()
            else:
                # Remove the "{:.h2_title}" marker (and whitespace before it).
                clean_line = re.sub("[\n\r\f ]*{:.h2_title}", "", line).strip()
                if record.get(state) is None:
                    # First line of this section (may be empty).
                    record[state] = clean_line
                elif clean_line:
                    record[state] = record[state] + "\n" + clean_line
    data.append(record)
        
            

In [4]:
# One row per parsed post; columns are the union of the keys found in the records.
df = pd.DataFrame(data=data)

### Coalesce different types of labels into `labels` and `dataset_info` fields and rename

In [5]:
# Posts list their labels under different headings. Build `labels` as the first
# non-null value in this priority order: Included Entities, Predicted Entities,
# Included Relations, Included Assertions (constructed from the lowest priority up).
labels_fallback = df["## Included Assertions"]
for heading in ("## Included Relations", "## Predicted Entities", "## Included Entities"):
    labels_fallback = df[heading].mask(pd.isnull, labels_fallback)
df["labels"] = labels_fallback

# `dataset_info` prefers the training-dataset section and falls back to the source section.
training_dataset = df["## Dataset used for training"]
df["dataset_info"] = training_dataset.mask(training_dataset.isnull(), df["## Source"])

In [6]:
# Print a `mapping = {...}` skeleton with one placeholder entry per parsed column
# (presumably used as a template for the hand-written mapping in the next cell).
print("mapping = {")
for column in df.columns:
    print(f'"{column}":"xxx",')
print("}")

mapping = {
"file":"xxx",
"layout":"xxx",
"title":"xxx",
"author":"xxx",
"name":"xxx",
"date":"xxx",
"tags":"xxx",
"article_header":"xxx",
"type":"xxx",
"use_language_switcher":"xxx",
"":"xxx",
"## Description":"xxx",
"links":"xxx",
"## How to use":"xxx",
"## Model Parameters":"xxx",
"## Source":"xxx",
"## Included Entities":"xxx",
"## Dataset used for training":"xxx",
"## Results":"xxx",
"## Predicted Entities":"xxx",
"## Included Models":"xxx",
"## Included Assertions":"xxx",
"## Included Relations":"xxx",
"labels":"xxx",
"dataset_info":"xxx",
}


In [7]:
# Raw parsed column -> output column name.
# Only the columns listed here are kept, and the insertion order below
# determines the column order of the selected frame.
mapping = {
    # front-matter fields and derived columns
    "title": "title",
    "labels": "labels",
    "author": "author",
    "name": "name",
    "date": "latest_date",
    "tags": "tags",
    "links": "links",
    # markdown sections
    "## Description": "description",
    "## How to use": "code_samples",
    "dataset_info": "dataset_info",
    "## Included Models": "included_models",
    "## Model Parameters": "model_info",
}

### Parse "Model Parameters" more specifically, add columns to the records, and rename

In [8]:
# Rename the raw columns and keep only the mapped ones, in `mapping` order.
# Written as one non-mutating expression so that a failure (e.g. a mapped column
# missing from the parsed posts) leaves `df` untouched rather than half-renamed,
# and with an explicit list because a dict view is not a documented column indexer.
df = df.rename(columns=mapping)[list(mapping.values())].copy()

In [9]:
def structure_info(x):
    """Parse a "|Key:|Value|"-style table into a ``{key: value}`` dict.

    Only rows containing ``":|"`` are treated as key/value rows; every other
    row (for example a ``|---|---|`` separator) is ignored. For each kept row
    the pipes are removed, the text before the first ``:`` becomes the key and
    everything after it becomes the value, so values that contain ``:``
    themselves are preserved. Square brackets and the phrases "Spark NLP" and
    "for Healthcare" are removed from the value, which is then stripped.

    Args:
        x: The raw table text. Any non-string input (e.g. NaN for a post
            without this section) yields an empty dict.

    Returns:
        dict mapping stripped keys to cleaned values; a later row with the
        same key overrides an earlier one.
    """
    if not isinstance(x, str):
        return {}
    info = {}
    for row in re.split(r"\n+", x):
        if ":|" not in row:
            continue
        key, _, value = row.replace("|", "").partition(":")
        value = re.sub(r"[\[\]]+", "", value)
        value = value.replace("Spark NLP", "").replace("for Healthcare", "")
        info[key.strip()] = value.strip()
    return info
    
# Parse each post's `model_info` text into a dict, then collect the set of
# every key that appears in any of those dicts.
str_info = df["model_info"].apply(structure_info)
all_keys = {key for info in str_info for key in info}

In [10]:
# Add one column per parsed key (None where a post does not have that key).
# Iterate in sorted order: `all_keys` is a set of strings, whose iteration order
# can differ between kernel sessions and would otherwise shuffle the column order.
for k in sorted(all_keys):
    df[k] = str_info.apply(lambda info, k=k: info.get(k))

In [11]:
# Parsed parameter-table key -> output column name.
remapping = {
    "Dimension": "dimension",
    "Edition": "edition",
    "Language": "language",
    "License": "license",
    "Compatibility": "compatibility",
    "Input Labels": "inputs",
    "Case sensitive": "case_sensitive",
    "Output Labels": "output",
    "Type": "type",
}

In [12]:
# Rename the parameter columns in place; keys absent from `df` are ignored by rename.
df.rename(columns=remapping, inplace=True)

### Fix the manual `labels` and `description` fields that contain the live demo button

In [13]:
def structure_labels(x):
    """Turn a markdown bullet/line list into a comma-separated string.

    The text is cut at the first "\\n[//]" marker and only the first non-empty
    chunk is kept. Runs of newlines and hyphens are replaced by a single
    comma, ", , " is collapsed to ",", and one leading and one trailing comma
    are removed.

    Args:
        x: The raw text, or a missing value (NaN/None).

    Returns:
        The cleaned string; "" for missing input, for an empty string, or when
        no non-empty chunk remains after splitting (the last two previously
        raised IndexError).
    """
    if pd.isna(x):
        return ""
    chunks = [chunk for chunk in re.split(r"\n\[//\]", x) if chunk != ""]
    if not chunks:
        return ""
    ret = re.sub("[\n-]+", ",", chunks[0]).strip()
    ret = ret.replace(", , ", ",")
    if ret.endswith(","):
        ret = ret[:-1]
    if ret.startswith(","):
        ret = ret[1:]
    return ret.strip()

In [14]:
# Clean both text columns with the same helper.
# NOTE(review): structure_labels also replaces newlines and hyphens with commas,
# which alters hyphenated words in `description` — confirm this is intended.
for text_column in ("labels", "description"):
    df[text_column] = df[text_column].apply(structure_labels)

### Fix some records having the language in the `name` field

In [15]:
# Language codes that may follow the model name as "_<code>".
langs = ["en", "es", "de", "xx"]
# Capturing alternation of the form "(_en\b|_es\b|...)"; the word boundary means a
# code only matches when it is not followed by another word character.
splitter = "(" + "|".join("_" + code + "\\b" for code in langs) + ")"
# Keep only the text before the first language suffix.
df["name"] = df["name"].apply(lambda name: re.split(splitter, name, maxsplit=1)[0])

### Parse all links from `links` field and add columns

In [16]:
# Link label as written in the post -> output column name.
link_map = {"Download": "download_url", "Live Demo": "demo_url", "Open in Colab": "colab_url"}


def structure_links(x):
    """Extract the known markdown links from a block of text.

    Finds every ``[label](url)`` pair. Label and URL are matched with negated
    character classes, so several links on the same line are parsed
    separately instead of being merged into one greedy match.

    Args:
        x: The raw links text. Any non-string input (e.g. NaN for a post
            without links) yields an empty dict.

    Returns:
        dict mapping the ``link_map`` column name to the URL for every label
        present in ``link_map``; links with other labels are skipped rather
        than raising KeyError.
    """
    if not isinstance(x, str):
        return {}
    found = re.findall(r"\[([^\]]+)\]\(([^)\s]+)\)", x)
    return {
        link_map[label.strip()]: url
        for label, url in found
        if label.strip() in link_map
    }
# Per-post dict of extracted link URLs, keyed by output column name.
df["links_arr"] = df["links"].apply(structure_links)

In [17]:
# Spread the per-post link dicts into one column per link type
# (None where a post has no link of that type).
for url_column in link_map.values():
    df[url_column] = df["links_arr"].apply(
        lambda found, url_column=url_column: found.get(url_column)
    )

In [18]:
# The file name is the last "/"-separated segment of the download URL.
# Staying on the .str accessor propagates a missing URL as NaN instead of
# raising when the element is not a list.
df["file"] = df["download_url"].str.rsplit("/", n=1).str[-1]

### Remove `+` from compatibility

In [19]:
# Remove every literal "+" from the compatibility string. The vectorised,
# non-regex replace leaves missing values missing instead of raising on them.
df["compatibility"] = df["compatibility"].str.replace("+", "", regex=False)
df["spark_version"] = "2.4"
# `ts` is the integer after the last "_" of the file name, with ".zip" removed.
# A malformed name still raises ValueError; a missing file name is skipped.
ts = df["file"].map(
    lambda name: int(name.split("_")[-1].replace(".zip", "")),
    na_action="ignore",
)
# Keep the plain integer column when every row has a value; switch to the
# nullable integer dtype only when some are missing, so values are not shown as floats.
df["ts"] = ts.astype("Int64") if ts.isna().any() else ts
df["dataset"] = "mds"

### Persist metadata parsed from existing MDs

In [20]:
# Leave out the raw/intermediate columns that are not persisted.
out = df.drop(columns=["Model Name", "code_samples", "model_info", "links", "links_arr"])

In [21]:
# Write the metadata table as CSV, without the row index.
out.to_csv(path_or_buf="docs_module/metadata/model_metadata_existing.csv", index=False)