In [10]:
import pandas as pd
from pathlib import Path

In [11]:
# Discover every processed JSON file below the data directory.
# Path.glob yields entries in a filesystem-dependent order; sorting makes
# `file_paths[0]` (used further down to pick a sample) stable across runs
# and machines.
file_paths = sorted(Path("./data/processed/").glob("./**/*.json"))
file_paths

[WindowsPath('data/processed/te/train.json')]

In [12]:
# Inspect the individual path components of the first discovered file.
file_paths[0].parts

('data', 'processed', 'te', 'train.json')

In [13]:
# Count the documents stored in every (language, split) file.
counts = {}
for file_path in file_paths:
    # Take the language from the containing directory and the split from the
    # file name rather than unpacking `file_path.parts` into exactly four
    # fields: the recursive glob can match files at other depths (and the
    # notebook may be run from another working directory), in which case the
    # fixed-arity unpacking raised ValueError.
    lang = file_path.parent.name
    split = file_path.name.split(".")[0]
    # The files are read as JSON lines, one record per row, so the row count
    # is the document count.
    frame = pd.read_json(file_path, orient="records", lines=True)
    counts[(lang, split)] = len(frame)

In [14]:
# Flatten the {(lang, split): n_docs} mapping into one table row per file.
df_t = pd.DataFrame(
    data=[(*lang_split, n_docs) for lang_split, n_docs in counts.items()],
    columns=["lang", "split", "#docs"],
)
df_t

Unnamed: 0,lang,split,#docs
0,te,train,2200


In [15]:
# Reshape to one row per split and one column per language, holding #docs.
df_t.pivot(index="split", columns="lang", values="#docs")

lang,te
split,Unnamed: 1_level_1
train,2200


In [16]:
# Print the split-by-language document counts as LaTeX table source.
print(df_t.pivot(index="split", columns="lang", values="#docs").to_latex())

\begin{tabular}{lr}
\toprule
lang &    te \\
split &       \\
\midrule
train &  2200 \\
\bottomrule
\end{tabular}



  print(df_t.pivot(index="split", columns="lang", values="#docs").to_latex())


In [18]:
# Pick one example document (index label 3 among the first rows of the first
# file) and keep only its token and tag sequences.
row = (
    pd.read_json(file_paths[0], orient="records", lines=True)
    .head()
    .loc[3, ["tokens", "tags"]]
)
row

tokens    [10, చనిపోతారు, ఇన్, ఈజిప్ట్, చర్చి, దాడి., పద...
tags      [B-CASUALTIES-ARG, I-CASUALTIES-ARG, O, B-PLAC...
Name: 3, dtype: object

In [19]:
# Print the sample as space-separated "token/tag" pairs.
print(" ".join(f"{token}/{tag}" for token, tag in zip(row.tokens, row.tags)))

10/B-CASUALTIES-ARG చనిపోతారు/I-CASUALTIES-ARG ఇన్/O ఈజిప్ట్/B-PLACE-ARG చర్చి/I-PLACE-ARG దాడి./B-MAN_MADE_EVENT.TERRORIST_ATTACK పది/B-CASUALTIES-ARG వ్యక్తులు/I-CASUALTIES-ARG ఉన్నారు/I-CASUALTIES-ARG చంపబడింది/I-CASUALTIES-ARG మరియు/I-CASUALTIES-ARG ఎనిమిది/I-CASUALTIES-ARG ఇతరులు/I-CASUALTIES-ARG గాయపడ్డారు/I-CASUALTIES-ARG ఇన్/O ఎ/O ఉగ్రవాది/B-MAN_MADE_EVENT.TERRORIST_ATTACK దాడి/I-MAN_MADE_EVENT.TERRORIST_ATTACK ఆన్/O ఎ/O చర్చి/B-PLACE-ARG ఇన్/I-PLACE-ARG ఈజిప్ట్/I-PLACE-ARG మూలధనం/I-PLACE-ARG ఈ రోజు,/B-TIME-ARG ది/O ప్రభుత్వం/O అన్నారు./O ఎ/O ఆరోగ్యం/O మంత్రిత్వ శాఖ/O ప్రతినిధి/O అన్నారు/O ది/O దాడి/B-MAN_MADE_EVENT.TERRORIST_ATTACK టేక్/O స్థలం/O ఎప్పుడు/O ముష్కరులు/B-PARTICIPANT-ARG ఆన్/O ఎ/O మోటారుబైక్/O తెరిచింది/B-MAN_MADE_EVENT.TERRORIST_ATTACK అగ్ని/I-MAN_MADE_EVENT.TERRORIST_ATTACK వెలుపల/O మార్/B-PLACE-ARG మినా/I-PLACE-ARG చర్చి/I-PLACE-ARG ఇన్/I-PLACE-ARG హెల్వాన్/I-PLACE-ARG ప్రాంతం./I-PLACE-ARG వద్ద/B-CASUALTIES-ARG కనీసం/I-CASUALTIES-ARG 10/I-CASUALTIES-ARG వ్యక్తు

In [20]:
from spacy import displacy

In [21]:
def split_tag(tag):
    """Split a BIO tag into its boundary marker and entity label.

    The outside tag "O" carries no label and maps to ("O", None). Any other
    tag is cut at its first hyphen only, so hyphens inside the label survive:
    "B-PLACE-ARG" -> ("B", "PLACE-ARG"). A tag without a hyphen comes back as
    a one-element tuple.
    """
    if tag == "O":
        return (tag, None)
    return tuple(tag.split("-", 1))

def extract_entities(tags):
    """Collect entity spans from a sequence of BIO tags.

    Args:
        tags: iterable of tag strings such as "B-PLACE-ARG", "I-PLACE-ARG", "O".

    Returns:
        A list of (label, start, end) tuples in order of appearance, where
        start/end are token indices and end is exclusive.

    An entity opens only on a "B" tag. It is closed by the next "B" or "O"
    tag, or by any tag whose label differs from the open entity's label. An
    "I" tag that does not continue an open entity is ignored.
    """
    entities = []
    open_label = None
    open_len = 0  # number of tokens in the entity being built; 0 = none open
    # A trailing sentinel "O" flushes an entity that runs to the last token.
    for idx, tag in enumerate([*tags, "O"]):
        boundary, label = split_tag(tag)
        if open_len:
            closes = boundary == "B" or boundary == "O" or label != open_label
            if closes:
                entities.append((open_label, idx - open_len, idx))
                open_len = 0
            elif boundary == "I":
                open_len += 1
        if boundary == "B":
            # Any previous entity was closed just above, so start afresh.
            open_label = label
            open_len = 1
    return entities


def get_entity_info(bio_labels, tokens, text=None, spans=None):
    """Describe every entity found in a BIO-tagged token sequence.

    Args:
        bio_labels: BIO tags, passed to `extract_entities`.
        tokens: sliceable token sequence aligned with `bio_labels`.
        text: optional full text that the character offsets index into.
        spans: optional per-token (start_char, end_char) pairs into `text`.

    Returns:
        A list with one dict per entity holding `tokens`, `label`, the token
        range `start`/`end` (end exclusive), and `start_char_idx`,
        `end_char_idx` and `entity_phrase`. The last three are None unless
        both `text` and `spans` are given and non-empty.
    """
    records = []
    for label, start, end in extract_entities(bio_labels):
        if text and spans:
            # Character range runs from the first token's start to the last
            # token's end.
            char_from = spans[start][0]
            char_to = spans[end - 1][1]
            phrase = text[char_from:char_to]
        else:
            char_from = char_to = phrase = None
        records.append({
            "tokens": tokens[start:end],
            "label": label,
            "start": start,
            "start_char_idx": char_from,
            "end_char_idx": char_to,
            "end": end,
            "entity_phrase": phrase,
        })
    return records

In [22]:
from IPython.display import display, display_html, HTML

In [23]:
def render(tokens, tags):
    """Display a BIO-tagged sentence as highlighted entities plus raw BIO text.

    Args:
        tokens: sequence of token strings.
        tags: sequence of BIO tags aligned with `tokens`.

    The tokens are joined with single spaces, per-token character spans are
    derived from that joined text, and the entities are handed to displaCy in
    its manual ("text" + "ents") format. Nothing is returned; output is shown
    through IPython's display machinery.
    """
    from html import escape  # local import keeps this cell self-contained

    # Materialise once so the inputs can be iterated several times.
    tokens = list(tokens)
    tags = list(tags)

    # Use the arguments rather than the notebook-global `row`, so the function
    # renders whatever sentence it is actually given.
    text = " ".join(tokens)
    spans = []
    start_char_idx = 0
    for token in tokens:
        spans.append((start_char_idx, start_char_idx + len(token)))
        start_char_idx += len(token) + 1  # For space
    entity_info = get_entity_info(tags, tokens, text, spans)

    # Sort the labels so each label gets the same colour on every run, and
    # cycle through the palette so more labels than colours are still covered.
    unique_labels = sorted({e["label"] for e in entity_info})
    pallet = "#48cd4a,#cb4f78,#e97e0b,#48f061,#8ff1df,#9ead18,#27e57b,#e051c0,#7add51,#dab304,#1dfafb".split(",")
    colors = {
        label: pallet[i % len(pallet)] for i, label in enumerate(unique_labels)
    }

    doc = {
        "text": text,
        "ents": [
            {"label": e["label"], "start": e["start_char_idx"], "end": e["end_char_idx"]}
            for e in entity_info
        ]
    }
    bio_text = " ".join(f"{token}/{tag}" for token, tag in zip(tokens, tags))
    display(HTML("<h3>Highlighted Entities</h3>"))
    displacy.render(doc, style="ent", manual=True, options=dict(colors=colors))
    # Escape the token text so characters such as "<" or "&" show up literally
    # instead of being interpreted as markup.
    display(HTML(f"<h3>BIO Format</h3><pre>{escape(bio_text)}</pre>"))

In [24]:
# Visualise the sample row's tokens and tags.
render(row.tokens, row.tags)