In [1]:
import json
import os
import numpy as np
import warnings
import pandas as pd
from IPython.display import display, HTML
from bs4 import BeautifulSoup
# Silence pandas' SettingWithCopyWarning for the rest of the notebook.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
# Never truncate the column list, and allow up to 1000 characters of width
# when pandas renders a frame as text.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)


# Datasets this notebook can explore. Only TRAIN_JSON_PATH is loaded; point it
# at NOISE_JSON_PATH to inspect the generated noise data instead.
NOISE_JSON_PATH = "./data_generation/noise-openai-faker/lzc_noise_data_1500_0206.json"
TRAIN_JSON_PATH = "./kaggle_dataset/competition/train.json"

# Read the JSON inside a context manager so the file handle is closed, and
# decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(TRAIN_JSON_PATH, encoding="utf-8") as train_file:
    train_json = json.load(train_file)

# Flatten the loaded JSON records into a DataFrame.
train = pd.json_normalize(train_json)

In [2]:
# Preview the first rows of the normalized training frame.
train.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [3]:
# Sanity check: the number of distinct document ids must equal the number of
# rows, i.e. every document appears exactly once in train.
n_rows = train.shape[0]
n_documents = train["document"].nunique()
assert n_documents == n_rows

In [4]:
# Every entity label in BIO form: for each entity type, the "B-" (begin) tag
# immediately followed by its "I-" (inside) tag.
full_ner_labels = [
    f"{prefix}-{entity}"
    for entity in (
        "NAME_STUDENT",
        "URL_PERSONAL",
        "ID_NUM",
        "EMAIL",
        "STREET_ADDRESS",
        "PHONE_NUM",
        "USERNAME",
    )
    for prefix in ("B", "I")
]

In [5]:
train_eda = train.copy()

# Find documents with a high number of entities: keep only the non-"O" labels
# of each document, then count them in total and by distinct label.
train_eda["ner_labels"] = train_eda["labels"].apply(
    lambda doc_labels: [lab for lab in doc_labels if lab != "O"]
)
train_eda["count_ner_labels"] = train_eda["ner_labels"].apply(len)
train_eda["count_distinct_ner_labels"] = train_eda["ner_labels"].apply(
    lambda doc_labels: len(set(doc_labels))
)
train_eda = train_eda.sort_values(by="count_distinct_ner_labels", ascending=False)

# One indicator row per entity token; reset_index() keeps the originating
# row label of train_eda in an "index" column.
exploded_df = train_eda["ner_labels"].explode()
dummies = pd.get_dummies(exploded_df).reset_index()

# Label columns ordered from most to least frequent; the "index" helper
# column is summed along with them and then left out of the ordering.
frequency = dummies.sum().sort_values(ascending=False)
ordered_columns = [col for col in frequency.index.tolist() if col != "index"]

# Per-document counts for every label in full_ner_labels (labels that never
# occur get a 0 column), with the observed labels first, by frequency.
counted = dummies.groupby("index").sum().reindex(columns=full_ner_labels, fill_value=0)
counted = counted[
    ordered_columns + [lab for lab in full_ner_labels if lab not in ordered_columns]
]

train_eda = train_eda.join(counted)

In [6]:
# CSS color used for each entity label. Every value must be a valid CSS
# `rgb(r, g, b)` expression because it is interpolated verbatim into inline
# `style` attributes.
ner_colors = {
    "B-NAME_STUDENT": "rgb(255, 179, 179)",
    "I-NAME_STUDENT": "rgb(139, 0, 0)",
    "B-URL_PERSONAL": "rgb(255, 223, 179)",
    "I-URL_PERSONAL": "rgb(255, 140, 0)",
    "B-ID_NUM": "rgb(255, 43, 46)",  # was "rgb((255, 43, 46)": invalid CSS, so no color was applied
    "I-ID_NUM": "rgb(204, 153, 89)",
    "B-EMAIL": "rgb(179, 255, 179)",
    "I-EMAIL": "rgb(0, 100, 0)",
    "B-STREET_ADDRESS": "rgb(179, 223, 255)",
    "I-STREET_ADDRESS": "rgb(204, 51, 255)",
    "B-PHONE_NUM": "rgb(223, 179, 255)",
    "I-PHONE_NUM": "rgb(75, 0, 130)",
    "B-USERNAME": "rgb(255, 179, 223)",
    "I-USERNAME": "rgb(231, 84, 128)",
}


def generate_legend():
    """Build the HTML legend for the entity colors.

    Returns:
        str: A ``<div class='legend'>`` holding one ``<span>`` per entry of
        ``ner_colors``, in dictionary order, whose text is the label name and
        whose text color is the label's color.
    """
    entries = [
        f"<span style='color: {color}; margin-right: 10px;'>{entity}</span>"
        for entity, color in ner_colors.items()
    ]
    return "<div class='legend' style='margin-bottom: 10px;'>" + "".join(entries) + "</div>"


def highlight_entities(text, tokens, labels):
    """Render a document as HTML with every labelled token highlighted.

    The tokens are located in ``text`` one after another, each search starting
    where the previous match ended. Text between matches is copied through
    unchanged; a token whose label is not ``"O"`` is wrapped in a colored
    ``<span>`` followed by a second span showing ``[label]``.

    Args:
        text: Full document text.
        tokens: Token strings, in document order.
        labels: One label per token, paired with ``tokens`` via ``zip``.

    Returns:
        str: The legend followed by the annotated document, with every
        newline replaced by ``<br/>``.
    """
    soup = BeautifulSoup(generate_legend(), "html.parser")  # Start with the legend

    last_idx = 0
    for token, label in zip(tokens, labels):
        start = text.find(token, last_idx)
        if start == -1:
            # Token not found after the cursor: skip it and leave the cursor
            # where it is, so the following tokens can still be matched.
            continue
        end = start + len(token)

        # Append the text between matches as a plain string. bs4 stores it as
        # text and escapes "<", ">" and "&" on output; parsing it as HTML
        # would let markup-like characters in the document be treated as tags.
        if start > last_idx:
            soup.append(text[last_idx:start])

        if label != "O":
            color = ner_colors.get(label, "black")

            token_span = soup.new_tag("span", style=f"background-color: {color}; font-family: 'Tahoma'; padding: 0 2px; border-radius: 3px;")
            token_span.string = token
            soup.append(token_span)

            label_span = soup.new_tag("span", style=f"background-color: {color}; font-family: 'Tahoma'; font-weight: bold; padding: 0 2px; border-radius: 3px;")
            label_span.string = f" [{label}]"
            soup.append(label_span)
        else:
            soup.append(token)

        last_idx = end

    # Whatever follows the last matched token, again as escaped text.
    if last_idx < len(text):
        soup.append(text[last_idx:])
    return str(soup).replace("\n", "<br/>")


def label_color(label):
    """Return the ``ner_colors`` entry for ``label``, or ``"black"`` if it has none."""
    fallback = "black"  # Default color
    return ner_colors.get(label, fallback)

In [7]:
# Work on a copy so the column added to sample_train in the next cell does not
# end up in train_eda.
sample_train = train_eda.copy()

In [8]:
# Render each document once and store the resulting markup in an "html" column.
sample_train.loc[:, "html"] = sample_train.apply(
    lambda row: highlight_entities(row["full_text"], row["tokens"], row["labels"]),
    axis=1,
)

In [9]:
# Long format: one row per (document, token). "token_str" keeps the token
# text, while "token" becomes the 0-based position of the token within its
# document.
df = (
    train[["document", "tokens", "labels"]]
    .explode(["tokens", "labels"])
    .reset_index(drop=True)
    .rename(columns={"tokens": "token", "labels": "label"})
    .assign(
        token_str=lambda frame: frame["token"],
        token=lambda frame: frame.groupby("document").cumcount(),
    )
)

label_list = df["label"].unique().tolist()

# Keep only the entity tokens (label other than "O"); "row_id" is the row's
# position in the long-format frame above.
reference_df = (
    df.loc[df["label"].ne("O")]
    .rename_axis("row_id")
    .reset_index()[["row_id", "document", "token", "label", "token_str"]]
    .copy()
)

In [10]:
# List every entity token labelled I-ID_NUM.
reference_df.query("label == 'I-ID_NUM'")

Unnamed: 0,row_id,document,token,label,token_str
2737,3889615,19280,55,I-ID_NUM,30407059


In [11]:
def f(doc):
    """Display the highlighted HTML rendering of one document.

    Args:
        doc: Value of the "document" column identifying the row of
            ``sample_train`` to show. If several rows match, the first one
            is displayed.

    Raises:
        IndexError: If no row of ``sample_train`` has that document id.
    """
    matches = sample_train.query("document == @doc")
    if matches.empty:
        # Same exception type as the bare `.index[0]` lookup it replaces, but
        # with a message that names the missing document.
        raise IndexError(f"document {doc!r} not found in sample_train")
    display(HTML(matches["html"].iloc[0]))

In [12]:
# Render document 19280, the document of the I-ID_NUM row listed by the query above.
f(19280)