In [122]:
from jinja2 import Environment, PackageLoader, select_autoescape, meta
import pandas as pd, boto3, re, pytz, os
from nltk.tokenize.regexp import RegexpTokenizer
from datetime import datetime
from tabulate import tabulate
pd.set_option("display.max_rows",1000)

In [123]:
root = "/home/fernandrez/JSL/repos/spark-nlp/docs/_posts"
files = os.listdir(root)

In [124]:
data = []
for fp in files:
    state = ""
    with open(f"/home/fernandrez/JSL/repos/spark-nlp/docs/_posts/{fp}", "r") as f:
        text = f.readlines()
        record = {"file": fp}
        for line in text:
            any_cond = line=="---\n" or any([t in line for t in ["##", "## Description","## Source","## How to use"]])
            if any_cond:
                if line == "---":
                    state = "" if state=="---" else "---"
                else:
                    state = line.replace("\n","").strip()
            else:
                if state == "---":
                    key_val = line.split(":")
                    if len(key_val)==2:
                        record[key_val[0].strip()] =  key_val[1].strip()
                elif record.get(state, None) is None:
                    record[state] = line
                else:
                    record[state] = record[state] + "\n" + line
        data.append(record)
        
            

In [125]:
data

[{'file': '2020-02-03-wikiner_6B_300_fr.md',
  'layout': 'model',
  'title': 'WikiNER 6B 300',
  'author': 'John Snow Labs',
  'name': 'wikiner_6B_300',
  'date': '2020-02-03',
  'tags': '[ner, fr, open_source]',
  'article_header': '',
  'type': 'cover',
  'use_language_switcher': '"Python-Scala-Java"',
  '## Description': 'WikiNER is a Named Entity Recognition (or NER) model, meaning it annotates text to find features like the names of people, places, and organizations. This NER model does not read words directly but instead reads word embeddings, which represent words as points such that more semantically similar words are closer together. WikiNER 6B 300 is trained with GloVe 6B 300 word embeddings, so be sure to use the same embeddings in the pipeline.\n\n\n\n{:.btn-box}\n\n[Live Demo](https://demo.johnsnowlabs.com/public/NER_FR){:.button.button-orange}{:target="_blank"}\n\n[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorial

In [126]:
df = pd.DataFrame(data)

In [127]:
df["labels"] = df["## Included Entities"].mask(pd.isnull,df["## Predicted Entities"].mask(pd.isnull,df["## Included Relations"].mask(pd.isnull,df["## Included Assertions"])))
df["model_dataset"] = df["## Dataset used for training"].mask(pd.isnull,df["## Source"])

In [128]:
print("mapping = {")
for c in df.columns:
    print('"'+c+'":"xxx",')
print("}")

mapping = {
"file":"xxx",
"layout":"xxx",
"title":"xxx",
"author":"xxx",
"name":"xxx",
"date":"xxx",
"tags":"xxx",
"article_header":"xxx",
"type":"xxx",
"use_language_switcher":"xxx",
"## Description":"xxx",
"## How to use":"xxx",
"## Model Parameters":"xxx",
"## Source":"xxx",
"## Included Entities":"xxx",
"## Dataset used for training":"xxx",
"## Results":"xxx",
"## Predicted Entities":"xxx",
"## Included Models":"xxx",
"## Included Assertions":"xxx",
"## Included Relations":"xxx",
"labels":"xxx",
"model_dataset":"xxx",
}


In [129]:
mapping = {
"title":"title",
"labels":"labels",
"author":"model_author",
"name":"model_name",
"date":"latest_date",
"tags":"tags",
"## Description":"description",
"## How to use":"code_samples",
"model_dataset":"model_dataset",
"## Included Models":"included_models",
"## Model Parameters":"model_info",
}

In [130]:
df.rename(mapping, axis=1, inplace=True)

In [131]:
df = df[mapping.values()]

In [132]:
def structure(x):
    ar = [ari.replace("|","").split(":") for ari in re.split("\n+",x) if ari!="" and len(ari.split(":|"))>1]
    dd = {ari[0]:ari[1] for ari in ar}
    return dd
    
str_info = df.model_info.apply(structure)

In [133]:
all_keys=[]
for s in str_info:
    for k in s.keys():
        all_keys.append(k)
all_keys = set(all_keys)

In [134]:
for k in all_keys:
    df[k] = str_info.apply(lambda x: x.get(k, None))

In [135]:
df

Unnamed: 0,title,labels,model_author,model_name,latest_date,tags,description,code_samples,model_dataset,included_models,...,Dimension,Edition,Language,License,Compatibility,Input Labels,Case sensitive,Model Name,Output Labels,Type
0,WikiNER 6B 300,,John Snow Labs,wikiner_6B_300,2020-02-03,"[ner, fr, open_source]",WikiNER is a Named Entity Recognition (or NER)...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is trained based on data from [https...,,...,,Official,fr,Open Source,Spark NLP 2.4.0,"sentence, token, embeddings",False,wikiner_6B_300,ner,ner
1,Deidentification NER (Enriched),,John Snow Labs,ner_deid_enriched,2020-03-04,"[ner, en, deidentify, licensed]",Deidentification NER (Enriched) is a Named Ent...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://portal.dbm...,,...,,Official,en,Licensed,Spark NLP for Healthcare 2.4.2+,"sentence, token, embeddings",False,ner_deid_enriched,ner,ner
2,WikiNER 840B 300,,John Snow Labs,wikiner_840B_300,2019-07-13,"[open_source, ner, fr]",WikiNER is a Named Entity Recognition (or NER)...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is trained based on data from [https...,,...,,Official,fr,Open Source,Spark NLP 2.1.0,"sentence, token, embeddings",False,wikiner_840B_300,ner,ner
3,Ner DL Model,"Age, Diagnosis, Dosage, Drug_name, Frequency, ...",John Snow Labs,ner_jsl_en,2020-04-22,"[ner, en, licensed]",\n\nPretrained named entity recognition deep l...,\n\nUse as part of an nlp pipeline with the fo...,Trained on data gathered and manually annotate...,,...,,Healthcare,[en],Licensed,Spark NLP 2.4.2,"[sentence,token, embeddings]",False,ner_jsl_en_2.4.2_2.4,[ner],ner
4,WikiNER 840B 300,,John Snow Labs,wikiner_840B_300,2019-07-13,"[open_source, ner, de]",WikiNER is a Named Entity Recognition (or NER)...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://de.wikiped...,,...,,Official,de,Open Source,Spark NLP 2.1.0,"sentence, token, embeddings",False,wikiner_840B_300,ner,ner
5,Clinical NER (Large),,John Snow Labs,ner_clinical_large,2020-05-10,"[ner, en, licensed]",Clinical NER (Large) is a Named Entity Recogni...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://portal.dbm...,,...,,Official,en,Licensed,Spark NLP for Healthcare 2.5.0+,"sentence, token, embeddings",False,ner_clinical_large,ner,ner
6,WikiNER 840B 300,,John Snow Labs,wikiner_840B_300,2020-03-16,"[ner, ru, open_source]",WikiNER is a Named Entity Recognition (or NER)...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://ru.wikiped...,,...,,Official,ru,Open Source,Spark NLP 2.4.4,"sentence, token, embeddings",False,wikiner_840B_300,ner,ner
7,NerDLModel Bionlp,"Amino_acid, Anatomical_system, Cancer, Cell, ...",John Snow Labs,ner_bionlp_en,2020-01-30,"[licensed, ner, en]",\n\nPretrained named entity recognition deep l...,\n\nUse as part of an nlp pipeline with the fo...,Trained on Cancer Genetics (CG) task of the Bi...,,...,,Healthcare,[en],Licensed,Spark NLP 2.4.0,"[sentence,token, embeddings]",False,ner_bionlp_en_2.4.0_2.4,[ner],ner
8,ALBERT Base Uncase,,John Snow Labs,albert_base_uncased,2020-04-28,"[embeddings, en, open_source]","ALBERT is ""A Lite"" version of BERT, a popular ...","\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://tfhub.dev/...,,...,768.0,Official,[en],Open Source,Spark NLP 2.5.0,"[sentence, token]",False,albert_base_uncased,[word_embeddings],embeddings
9,WikiNER 6B 100,,John Snow Labs,wikiner_6B_100,2020-05-10,"[ner, pt, open_source]",WikiNER is a Named Entity Recognition (or NER)...,"\n\n<div class=""tabs-box"" markdown=""1"">\n\n\n\...",The model is imported from [https://pt.wikiped...,,...,,Official,pt,Open Source,Spark NLP 2.5.0,"sentence, token, embeddings",False,wikiner_6B_100,ner,ner


In [136]:
df.to_csv("docs_module/metadata/model_metadata_existing.csv", index=False)