In [16]:
from bs4 import BeautifulSoup, Tag
import re
import json

# Corpus file to convert. Keep exactly one of the two assignments active.
# test input file
# input_path = 'named_ent_dtest.html'

# train input file
input_path = 'named_ent_train.html'

# Read the whole HTML document into memory as text.
with open(input_path, 'r', encoding='utf-8') as input_file:
    content = input_file.read()

# Show only the beginning of the file: printing the whole corpus floods the
# notebook output and bloats the saved .ipynb.
print(content[:2000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>
       Czech Named Entity Corpus 2.0
    </title>
    <style type="text/css"><!--
      span { color: #000; font-family: sans-serif; font-weight: bold; }

      span.namedent_A { color: #600; border-bottom: 1px solid #600; }
      span.namedent_C { color: #f50; border-bottom: 1px solid #f50; }
      span.namedent_T { color: #939; border-bottom: 1px solid #939; }
      span.namedent_P { color: #090; border-bottom: 1px solid #090; }

      span.namedent_ah, span.namedent_at, span.namedent_az
        { color:#00f; }

      span.namedent_g_, span.namedent_gc, span.namedent_gh, span.namedent_gl,
      span.namedent_gq, span.namedent_gr, span.namedent_gs, span.namedent_gt,
      span.namedent_gu
        { color:#600; }

      span.namedent_i_,

In [17]:
## Extract the content after the <hr> tag - remove the header

soup = BeautifulSoup(content, 'html.parser')
hr_tag = soup.find('hr')
if hr_tag is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.next_siblings`.
    raise ValueError("No <hr> tag found in `content`; cannot locate the end of the header")

# Re-serialise everything that follows the <hr> at the same nesting level.
content_html = ''.join(str(sibling) for sibling in hr_tag.next_siblings)

# Preview only; the full body is far too long for notebook output.
print(content_html[:2000])


<p>
Jste světa znalý muž a víte stejně dobře jako já , že souvislost mezi současnými krutostmi v <span class="namedent_lower">Jihovýchodní</span> <span class="namedent_gt">Asii</span> a tou novou bankovní pobočkou hned vedle obchoďáku <span class="namedent_if">Zátoka</span> je přímá a bezprostřední ;<br/>
byl z toho už vzteklý jak uvázaný pes , protože zájemci o hodiny mu úplně narušili jeho denní režim a on si nemohl po obědě ani zdřímnout .<br/>
I s <span class="namedent_p_">Dubenkou</span> , na kterou <span class="namedent_if">U tygra</span> teď myslím . . .<br/>
Hodil si kulovnici přes rameno a vydal se s význačným loveckým hostem do stráně , do kopce na krytou kazatelnu , aby s ním alespoň nemusel moknout .<br/>
Já je normálně nosím tak " - a ukázal hřbetem dlaně na krajinu břišní .<br/>
Když zakrátko <span class="namedent_p_">Magora</span> zavřeli , šli <span class="namedent_ps">Němec</span> a <span class="namedent_ps">Jirousová</span> za <span class="namedent_pf">Václavem</span

In [18]:
def process_namedent_P(span):
    """
    Flatten a namedent_P span into a new span that holds plain text only.

    Each direct child of `span` contributes one piece of text:
    - a Tag child contributes its text with surrounding whitespace stripped;
    - any other child (e.g. a text node) is kept verbatim, unless it is empty.

    The pieces are concatenated without a separator. The returned span is a
    new Tag carrying the same `class` attribute as `span`; `span` itself is
    not modified.
    """
    pieces = []
    for child in span.contents:
        if not isinstance(child, Tag):
            # Plain text between the inner spans (typically the spaces).
            if child:
                pieces.append(child)
            continue
        pieces.append(child.get_text().strip())

    # Build the replacement span with the merged text.
    flattened = Tag(name='span')
    flattened['class'] = span['class']
    flattened.string = ''.join(pieces)
    return flattened

def should_keep_span(tag):
    """
    Decide whether an element should stay in the document as a tag.

    Returns False for anything that is not a Tag, True for every Tag that is
    not a <span>, and for a <span> returns True only if at least one of its
    classes starts with 'namedent_p' or 'namedent_P' (person-related classes).
    A <span> without classes is not kept.
    """
    person_prefixes = ('namedent_p', 'namedent_P')

    if isinstance(tag, Tag):
        if tag.name == 'span':
            # A missing or empty class attribute means nothing to match.
            span_classes = tag.get('class', []) or ()
            for css_class in span_classes:
                if css_class.startswith(person_prefixes):
                    return True
            return False
        # Non-span tags are always kept.
        return True
    return False

soup = BeautifulSoup(content_html, 'html.parser')

# First flatten every namedent_P container span into a single text-only span.
for span in soup.find_all('span', class_='namedent_P'):
    new_span = process_namedent_P(span)
    span.replace_with(new_span)

# Then strip the markup of every span that is not person-related, keeping its
# contents in place.
spans_to_process = soup.find_all('span')
for span in spans_to_process:
    if not should_keep_span(span):
        # Replace span with its contents
        span.unwrap()

# Preview only; `soup` itself is what the following cells consume.
print(str(soup)[:2000])



<p>
Jste světa znalý muž a víte stejně dobře jako já , že souvislost mezi současnými krutostmi v Jihovýchodní Asii a tou novou bankovní pobočkou hned vedle obchoďáku Zátoka je přímá a bezprostřední ;<br/>
byl z toho už vzteklý jak uvázaný pes , protože zájemci o hodiny mu úplně narušili jeho denní režim a on si nemohl po obědě ani zdřímnout .<br/>
I s <span class="namedent_p_">Dubenkou</span> , na kterou U tygra teď myslím . . .<br/>
Hodil si kulovnici přes rameno a vydal se s význačným loveckým hostem do stráně , do kopce na krytou kazatelnu , aby s ním alespoň nemusel moknout .<br/>
Já je normálně nosím tak " - a ukázal hřbetem dlaně na krajinu břišní .<br/>
Když zakrátko <span class="namedent_p_">Magora</span> zavřeli , šli <span class="namedent_ps">Němec</span> a <span class="namedent_ps">Jirousová</span> za <span class="namedent_pf">Václavem</span> a rozhodli se , že případ budou publikovat .<br/>
Když vyhraju , což je tutovka , dostanu od každého z vás litr slivovice já .<br/>
V

In [19]:
def _find_word_sequence(words, target, start):
    """Return the first index >= start where `target` occurs contiguously in `words`, or None."""
    width = len(target)
    for index in range(start, len(words) - width + 1):
        if words[index:index + width] == target:
            return index
    return None


def get_entity_positions(soup, clean_text):
    """
    Get positions of entities in the clean text based on spans in the soup.

    Every <span> returned by `soup.find_all('span')` is treated as one entity.
    Its whitespace-separated words are searched for as a contiguous run in
    `clean_text.split()`, starting right after the previously matched entity,
    so repeated names are assigned to successive occurrences in reading order.

    Matching whole word runs in order (rather than counting occurrences of the
    first and last word independently) keeps a multi-word entity from being
    dropped when its last word also occurs earlier in the line, and keeps a
    single-word entity from being mapped onto a word inside an earlier entity.

    Parameters:
        soup: parsed line; only `find_all('span')` and each span's
            `get_text()` are used.
        clean_text: the tag-free text of the same line.

    Returns:
        A list of (start_word_index, end_word_index) tuples, both inclusive,
        in the order the spans were matched. Spans that are empty or whose
        words cannot be found after the previous match are skipped.

    NOTE(review): an untagged occurrence of the same word run that sits
    between the previous entity and the real one is still matched first;
    resolving that would require walking the parse tree.
    """
    words = clean_text.split()
    positions = []
    search_from = 0  # first word index not yet consumed by a matched entity

    for span in soup.find_all('span'):
        entity_words = span.get_text().split()
        if not entity_words:
            continue

        start_pos = _find_word_sequence(words, entity_words, search_from)
        if start_pos is None:
            continue

        end_pos = start_pos + len(entity_words) - 1
        positions.append((start_pos, end_pos))
        search_from = end_pos + 1

    return positions



In [20]:
def process_line(line, qas_id):
    """
    Turn one HTML line into a single person-entity record.

    The line is parsed, its text is whitespace-normalised, and the word
    positions of its spans are obtained from get_entity_positions. Returns
    None when the line has no text; otherwise returns a dict holding the
    text, the start/end word indices of each entity, and fixed label/query
    fields. "impossible" is the string "true" when no entity was found and
    "false" otherwise; "qas_id" is `qas_id` followed by ".1".
    """
    parsed_line = BeautifulSoup(line, 'html.parser')

    # Collapse all whitespace runs so word indices are well defined.
    clean_text = ' '.join(parsed_line.get_text().split())
    if not clean_text:
        return None

    starts = []
    ends = []
    span_labels = []
    for start, end in get_entity_positions(parsed_line, clean_text):
        starts.append(start)
        ends.append(end)
        span_labels.append(f"{start};{end}")

    has_entities = bool(starts)
    return {
        "context": clean_text,
        "end_position": ends,
        "entity_label": "PER",
        "impossible": "false" if has_entities else "true",
        "qas_id": f"{qas_id}.1",
        "query": "person entities are named persons or family.",
        "span_position": span_labels,
        "start_position": starts,
    }

# Serialise the filtered document under its own name: overwriting `content`
# would make the header-stripping cell fail if it were re-run afterwards.
processed_html = str(soup)

# Each corpus sentence ends with a <br/>; split on it to get one line each.
lines = re.split(r'<br\s*?/>', processed_html)

results = []
for line in lines:
    line = line.strip()
    if not line:
        continue

    # Ids are consecutive over the records actually kept, so the next id is
    # simply the number of records collected so far (avoids shadowing `id`).
    json_entry = process_line(line, len(results))
    if json_entry:
        results.append(json_entry)
        


In [None]:
# Split the data into two parts: the first test_part_amount records form the test set, the rest the training set.
# The whole parsed file is in the `results` variable.

def renumber_qas_ids(data):
    """
    Return copies of the records in `data` with consecutive qas_ids.

    The i-th record (0-based) gets "qas_id" set to "<i>.1". The input records
    are not modified: each one is shallow-copied first, so a list that shares
    its dicts with `data` (for example the list `data` was sliced from) keeps
    its original ids instead of ending up with duplicates.

    Parameters:
        data: iterable of dict records.

    Returns:
        A new list of new dicts, in the same order, with the same keys in the
        same order as the input records.
    """
    return [{**record, "qas_id": f"{i}.1"} for i, record in enumerate(data)]

# Number of leading records that go to the test part.
test_part_amount = 159

# Slice off the test part, keep the remainder for training, and restart the
# qas_id numbering from 0 inside each part.
test_part = renumber_qas_ids(results[:test_part_amount])
train_part = renumber_qas_ids(results[test_part_amount:])

print(f"First part: {len(test_part)} records")
print(f"Second part: {len(train_part)} records")

# Show the last record of the test part as a sanity check.
print("First part sample:")
print(test_part[-1])

First part: 159 records
Second part: 7034 records
First part sample:
{'context': 'Opakované zápasy 3 . kola anglického Ligového poháru : Norwich - Arsenal 0 : 3 , Crystal Palace - Everton 1 : 4', 'end_position': [], 'entity_label': 'PER', 'impossible': 'true', 'qas_id': '158.1', 'query': 'person entities are named persons or family.', 'span_position': [], 'start_position': []}


In [None]:
output_path = 'named_ent_train.json'

# Write the parsed records as pretty-printed UTF-8 JSON. The file handle gets
# its own name: reusing one name for both the data and the handle made
# json.dump try to serialise the handle itself, which raises TypeError.
# NOTE(review): this writes every record in `results`, including those the
# split cell assigns to `test_part` -- confirm whether `train_part` was meant.
with open(output_path, 'w', encoding='utf-8') as output_file:
    json.dump(results, output_file, ensure_ascii=False, indent=2)