In [1]:
import pandas as pd

In [25]:
# Preview the dev-set ground-truth labels: one row per labelled span, with
# `start`/`end` character offsets into the address held in `input:text`.
pd.read_csv("enhance_ocod/data/training_data/ground_truth_dev_set_labels.csv")

Unnamed: 0,label,start,end,text,input:text,input:datapoint_id
0,street_number,0,3,207,"207 sloane street, london (sw1x 9qx)",53574
1,street_name,4,17,sloane street,"207 sloane street, london (sw1x 9qx)",53574
2,city,19,25,london,"207 sloane street, london (sw1x 9qx)",53574
3,postcode,27,35,sw1x 9qx,"207 sloane street, london (sw1x 9qx)",53574
4,unit_type,0,9,apartment,"apartment 533, block 11 spectrum, blackfriars ...",17501
...,...,...,...,...,...,...
73294,street_number,8,10,49,"flat a, 49 norcott road, stoke newington, (n16...",76011
73295,street_name,11,23,norcott road,"flat a, 49 norcott road, stoke newington, (n16...",76011
73296,street_number,25,30,stoke,"flat a, 49 norcott road, stoke newington, (n16...",76011
73297,street_number,31,40,newington,"flat a, 49 norcott road, stoke newington, (n16...",76011


In [22]:
# Load the span-level annotations as a list of row dicts (`to_dict('records')`
# yields one dict per CSV row, i.e. one per labelled span).
# The commented-out lines point at other label files; swap one in to process
# that file instead of the dev set.
#test = pd.read_csv("enhance_ocod/data/training_data/parsed_ground_truth_complete.csv")
data = pd.read_csv("enhance_ocod/data/training_data/ground_truth_dev_set_labels.csv").to_dict('records')
#data = pd.read_csv("enhance_ocod/data/training_data/ground_truth_test_set_labels.csv").to_dict('records')

In [23]:
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# Initialize tokenizer
# NOTE(review): the address samples previewed earlier are all lower-case while
# this checkpoint name says "cased" — confirm a cased vocabulary is intended.
# NOTE(review): the later `return_offsets_mapping=True` call presumably needs
# the fast (Rust-backed) tokenizer that AutoTokenizer loads by default — verify.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


# Gather every labelled span under the datapoint it belongs to, so each
# address text appears once together with the full list of its entities.
grouped_data = {}
for item in data:
    # setdefault creates the record the first time a datapoint id is seen
    # and returns the existing record on every later row.
    record = grouped_data.setdefault(
        item['input:datapoint_id'],
        {"text": item['input:text'], "entities": []},
    )
    record["entities"].append(
        {field: item[field] for field in ("label", "start", "end")}
    )

# Entity label names present in the data, in alphabetical order
unique_labels = sorted({item['label'] for item in data})

# BIO tag inventory: "O" (outside any entity) first, then every B- (beginning)
# tag, then every I- (inside) tag, each group following unique_labels order
bio_labels = ["O"] + [
    f"{prefix}-{label}" for prefix in ("B", "I") for label in unique_labels
]

# Two-way mapping between tag strings and integer ids (id == position in bio_labels)
id2label = dict(enumerate(bio_labels))
label2id = {tag: idx for idx, tag in id2label.items()}

# Convert to the format needed for token classification

# Label id given to special and padding tokens. -100 is the default
# `ignore_index` of PyTorch's cross-entropy loss, so these positions do not
# contribute to the training loss. Labelling them "O" (id 0) instead would make
# the padding added by `padding="max_length"` count as "O" training targets.
IGNORE_INDEX = -100


def _tag_tokens(offset_mapping, entities):
    """Project character-level entity spans onto tokens using the BIO scheme.

    Args:
        offset_mapping: One ``(start_char, end_char)`` pair per token, as
            returned by the tokenizer with ``return_offsets_mapping=True``.
        entities: Dicts with ``"label"``, ``"start"`` and ``"end"`` keys, where
            start/end are character offsets into the tokenized text.

    Returns:
        A list with one entry per token: a BIO tag string for real tokens, or
        ``None`` for tokens whose offsets are ``(0, 0)`` (the special/padding
        tokens this code skips).
    """
    tags = [None if start == end == 0 else "O" for start, end in offset_mapping]

    for entity in entities:
        entity_start = entity["start"]
        entity_end = entity["end"]
        first_token = None
        last_token = None

        for index, (token_start_char, token_end_char) in enumerate(offset_mapping):
            # Skip special tokens
            if token_start_char == token_end_char == 0:
                continue

            # Token containing the entity's first character
            if token_start_char <= entity_start < token_end_char:
                first_token = index

            # Token containing the entity's last character; nothing after it matters
            if token_start_char < entity_end <= token_end_char:
                last_token = index
                break

        # An entity whose boundaries do not fall inside any kept token (for
        # example because it was truncated away) is left untagged.
        if first_token is None or last_token is None:
            continue

        # First token gets B- prefix (Beginning)
        tags[first_token] = f"B-{entity['label']}"

        # Any additional tokens get I- prefix (Inside)
        for index in range(first_token + 1, last_token + 1):
            tags[index] = f"I-{entity['label']}"

    return tags


processed_examples = []

for datapoint in grouped_data.values():
    # Tokenize text; offsets are needed to map character spans onto tokens
    tokenized = tokenizer(
        datapoint["text"],
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
    )
    tags = _tag_tokens(tokenized.pop("offset_mapping"), datapoint["entities"])

    processed_examples.append({
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        # Convert string tags to ids; special/padding positions are masked out
        "labels": [
            IGNORE_INDEX if tag is None else label2id.get(tag, 0) for tag in tags
        ],
    })

# Create dataset
dataset = Dataset.from_pandas(pd.DataFrame(processed_examples))

In [24]:
# Show the dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

In [1]:
from dataclasses import dataclass
from typing import List, Dict, Optional
import re

@dataclass
class Span:
    """A labelled character range inside a parsed text.

    ``start`` and ``end`` are taken directly from the regex match, so
    ``text[start:end]`` is the matched substring.
    """

    start: int  # Index of the first matched character
    end: int  # Index just past the last matched character
    label: str  # Label type the matching pattern is registered under
    source: str  # Which pattern matched


class AddressParser:
    """Tag building, street and unit mentions in an address using regexes."""

    def __init__(self):
        # Compile once so repeated find_spans() calls reuse the same patterns.
        self.patterns = self._build_patterns()

    def _build_patterns(self) -> Dict[str, List[tuple]]:
        """Define and compile regex patterns for each label type.

        Returns:
            A dict keyed by label type. Each value is a list of
            ``(compiled_pattern, source_name)`` tuples; every pattern is
            compiled case-insensitively.
        """
        raw_patterns = {
            'BUILDING': [
                ('building_general', r'\b(house|building|tower)s?\b'),
                ('company', r'\b(ltd|limited|plc)\b'),
            ],
            'STREET': [
                ('road_street', r'\b(road|street|lane)\b'),
                ('avenue', r'\b(avenue|boulevard)\b'),
            ],
            'UNIT': [
                ('flat', r'\b(flat|apartment)\s+([A-Z0-9]+)\b'),
                # The optional whitespace sits inside the optional "no"/"number"
                # prefix group. With a free-standing \s* the match could begin
                # at the word boundary *before* the gap, so "Road 25" produced
                # the span ' 25' with a start offset one character too early.
                ('number', r'\b(?:(no|number)\s*)?(\d+[A-Z]?)\b'),
            ],
        }
        return {
            label_type: [
                (re.compile(regex, re.IGNORECASE), name) for name, regex in specs
            ]
            for label_type, specs in raw_patterns.items()
        }

    def find_spans(self, text: str) -> List[Span]:
        """Find all spans in the text, with conflict resolution.

        Args:
            text: The address string to scan.

        Returns:
            Non-overlapping spans ordered by start offset. When matches
            overlap, the one that starts first is kept; among matches that
            start at the same offset, the longest is kept.
        """
        # Find all matches across every label type and pattern
        spans = [
            Span(
                start=match.start(),
                end=match.end(),
                label=label_type,
                source=source,
            )
            for label_type, pattern_list in self.patterns.items()
            for pattern, source in pattern_list
            for match in pattern.finditer(text)
        ]

        # Earliest start first; for equal starts the longer span sorts first,
        # so the greedy sweep below prefers it.
        spans.sort(key=lambda x: (x.start, -x.end))

        filtered = []
        last_end = 0
        for span in spans:
            # Keep a span only if it begins at or after the end of the last kept one
            if span.start >= last_end:
                filtered.append(span)
                last_end = span.end

        return filtered


# Quick demonstration of the regex parser on a sample address
parser = AddressParser()
text = "Flat 3, 25 Test Road, London"

print(f"Parsing: {text}")
found_spans = parser.find_spans(text)
# `span` stays the loop variable: it remains defined afterwards for inspection
for span in found_spans:
    print(f"- {span.label} ({span.source}): '{text[span.start:span.end]}'")

Parsing: Flat 3, 25 Test Road, London
- UNIT (flat): 'Flat 3'
- UNIT (number): '25'
- STREET (road_street): 'Road'


In [2]:
# `span` is the loop variable left over from the `for` loop in the previous
# cell, so this displays the last span that was printed there.
span

Span(start=16, end=20, label='STREET', source='road_street')