src/pages/gallery/spacyio.py

"""
Application: Spacy IO Application
Author: [Ines Montani](https://gist.github.com/ines)
Source: [Github](https://gist.github.com/ines/b320cb8441b590eedf19137599ce6685)
"""
# pylint: disable=too-many-locals,invalid-name, too-many-statements
import pandas as pd
import spacy
import streamlit as st
from spacy import displacy

SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
HTML_WRAPPER = (
    """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; """
    """padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
)


@st.cache(ignore_hash=True)
def load_model(name):
    """Load Model"""
    return spacy.load(name)


@st.cache(ignore_hash=True)
def process_text(model_name, text):
    """Process Text"""
    nlp = load_model(model_name)
    return nlp(text)


def write():
    """Writes the page in gallery.py"""
    st.sidebar.title("Interactive spaCy visualizer")
    st.sidebar.markdown(
        """
    Process text with [spaCy](https://spacy.io) models and visualize named entities,
    dependencies and more. Uses spaCy's built-in
    [displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
    """
    )
    st.write("Author: [Ines Montani](https://gist.github.com/ines)")
    st.write(
        "Source: [Github](https://gist.github.com/ines/b320cb8441b590eedf19137599ce6685)"
    )

    spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
    model_load_state = st.info(f"Loading model '{spacy_model}'...")
    nlp = load_model(spacy_model)
    model_load_state.empty()

    text = st.text_area("Text to analyze", DEFAULT_TEXT)
    doc = process_text(spacy_model, text)

    if "parser" in nlp.pipe_names:
        st.header("Dependency Parse & Part-of-speech tags")
        st.sidebar.header("Dependency Parse")
        split_sents = st.sidebar.checkbox("Split sentences", value=True)
        collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
        collapse_phrases = st.sidebar.checkbox("Collapse phrases")
        compact = st.sidebar.checkbox("Compact mode")
        options = {
            "collapse_punct": collapse_punct,
            "collapse_phrases": collapse_phrases,
            "compact": compact,
        }
        docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
        for sent in docs:
            html = displacy.render(sent, options=options)
            # Double newlines seem to mess with the rendering
            html = html.replace("\n\n", "\n")
            if split_sents and len(docs) > 1:
                st.markdown(f"> {sent.text}")
            st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    if "ner" in nlp.pipe_names:
        st.header("Named Entities")
        st.sidebar.header("Named Entities")
        default_labels = ["PERSON", "ORG", "GPE", "LOC"]
        labels = st.sidebar.multiselect(
            "Entity labels", nlp.get_pipe("ner").labels, default_labels
        )
        html = displacy.render(doc, style="ent", options={"ents": labels})
        # Newlines seem to mess with the rendering
        html = html.replace("\n", " ")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
        attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
        if "entity_linker" in nlp.pipe_names:
            attrs.append("kb_id_")
        data = [
            [str(getattr(ent, attr)) for attr in attrs]
            for ent in doc.ents
            if ent.label_ in labels
        ]
        df = pd.DataFrame(data, columns=attrs)
        st.dataframe(df)

    if "textcat" in nlp.pipe_names:
        st.header("Text Classification")
        st.markdown(f"> {text}")
        df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
        st.dataframe(df)

    vector_size = nlp.meta.get("vectors", {}).get("width", 0)
    if vector_size:
        st.header("Vectors & Similarity")
        st.code(nlp.meta["vectors"])
        text1 = st.text_input("Text or word 1", "apple")
        text2 = st.text_input("Text or word 2", "orange")
        doc1 = process_text(spacy_model, text1)
        doc2 = process_text(spacy_model, text2)
        similarity = doc1.similarity(doc2)
        if similarity > 0.5:
            st.success(similarity)
        else:
            st.error(similarity)

    st.header("Token attributes")

    if st.button("Show token attributes"):
        attrs = [
            "idx",
            "text",
            "lemma_",
            "pos_",
            "tag_",
            "dep_",
            "head",
            "ent_type_",
            "ent_iob_",
            "shape_",
            "is_alpha",
            "is_ascii",
            "is_digit",
            "is_punct",
            "like_num",
        ]
        data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
        df = pd.DataFrame(data, columns=attrs)
        st.dataframe(df)

    st.header("JSON Doc")
    if st.button("Show JSON Doc"):
        st.json(doc.to_json())

    st.header("JSON model meta")
    if st.button("Show JSON model meta"):
        st.json(nlp.meta)


if __name__ == "__main__":
    write()