In [1]:
from enhance_ocod.inference_utils import parse_addresses_from_csv, convert_to_entity_dataframe
from enhance_ocod.address_parsing_helper_functions import (load_and_prep_OCOD_data, parsing_and_expansion_process, post_process_expanded_data)
from enhance_ocod.locate_and_classify_helper_functions import (load_postcode_district_lookup, preprocess_expandaded_ocod_data, 
                                                  add_missing_lads_ocod, load_voa_ratinglist, street_and_building_matching, substreet_matching,
                                                  counts_of_businesses_per_oa_lsoa, voa_address_match_all_data, classification_type1, classification_type2,
                                                  contract_ocod_after_classification)
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
import os
import pandas as pd


model_path = "models/address_parser_full/checkpoint-35286" #checkpoint-750"
model_path = "models/address_parser/checkpoint-750" 
csv_path = "data/training_data/ground_truth_test_set_labels.csv"

In [2]:
from enhance_ocod.address_parsing_helper_functions import load_cleaned_labels
temp = load_cleaned_labels("data/training_data/full_dataset_no_overlaps.json")

In [3]:
ground_truth_df = pd.read_csv(csv_path)

#I only need a small number of the columns to be able to calculate the F1 score
#Everything else just makes it confusing. 
#renaming is for consistancy
ground_truth_df = ground_truth_df.rename(
    columns = {'input:text':'property_address',
              'input:datapoint_id':'datapoint_id',
              'text':'label_text'})

ground_truth_df

Unnamed: 0,label,start,end,label_text,property_address,datapoint_id
0,street_number,0,3,161,"161, 163, 165, 167 and 169 uxbridge road, ealing",13755
1,street_number,5,8,163,"161, 163, 165, 167 and 169 uxbridge road, ealing",13755
2,street_number,10,13,165,"161, 163, 165, 167 and 169 uxbridge road, ealing",13755
3,street_number,15,18,167,"161, 163, 165, 167 and 169 uxbridge road, ealing",13755
4,street_number,23,26,169,"161, 163, 165, 167 and 169 uxbridge road, ealing",13755
...,...,...,...,...,...,...
33343,street_name,20,24,john,"118 metcalfe court, john harrison way, london ...",67996
33344,street_number,25,33,harrison,"118 metcalfe court, john harrison way, london ...",67996
33345,street_name,34,37,way,"118 metcalfe court, john harrison way, london ...",67996
33346,city,39,45,london,"118 metcalfe court, john harrison way, london ...",67996


In [6]:

results = parse_addresses_from_csv(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=2048
)

Processing 33348 addresses in batches of 2048


W0615 06:07:58.803000 19446 /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/_inductor/utils.py:1250] [1/0] Not enough SMs to use max_autotune_gemm mode
Processing batches: 100%|██████████| 17/17 [01:32<00:00,  5.45s/batch]


In [5]:

results = parse_addresses_from_csv(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=6144
)

Processing 33348 addresses in batches of 6144


Processing batches: 100%|██████████| 6/6 [01:28<00:00, 14.82s/batch]


In [9]:
import pandas as pd
import torch
import zipfile
import json
from pathlib import Path
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset
from typing import Dict, List, Optional, Union, Callable
import numpy as np
from tqdm import tqdm
from enhance_ocod.inference_utils import AddressParserInference
from itertools import islice
import json
import time
import torch.multiprocessing as mp

import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
import time
from tqdm import tqdm
from typing import Dict, Optional
import pandas as pd


In [5]:

def parse_addresses_from_csv_with_profiling(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address",
    index_column: Optional[str] = None,
    batch_size: int = 64,
    stream_results_path: Optional[str] = None
) -> Dict:
    """
    Complete function with timing to identify bottlenecks
    """

    
    parser = AddressParserInference(model_path)
    
    # Validation
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found. Available columns: {list(df.columns)}")
    
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    if index_column:
        if index_column not in df.columns:
            raise ValueError(f"Index column '{index_column}' not found. Available columns: {list(df.columns)}")
    
    # Use numpy arrays
    addresses = df[target_column].fillna("").astype(str).values
    indices = df[index_column].values if index_column else df.index.values
    has_datapoint_id = "datapoint_id" in df.columns
    datapoint_ids = df["datapoint_id"].values if has_datapoint_id else None
    
    total_addresses = len(addresses)
    print(f"Processing {total_addresses} addresses in batches of {batch_size}")
    
    # Streaming setup
    all_results = [] if stream_results_path is None else None
    result_file = open(stream_results_path, "w") if stream_results_path else None
    successful_parses = 0
    
    # Timing variables
    time_tokenization = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    time_result_building = 0
    time_cleanup = 0
    
    # Clean batching
    def batch_arrays(*arrays):
        for i in range(0, len(arrays[0]), batch_size):
            yield tuple(arr[i:i+batch_size] for arr in arrays)
    
    arrays_to_batch = [addresses, indices]
    if has_datapoint_id:
        arrays_to_batch.append(datapoint_ids)
    
    for batch in tqdm(batch_arrays(*arrays_to_batch), 
                     total=(total_addresses + batch_size - 1) // batch_size,
                     desc="Processing batches", unit="batch"):
        
        if has_datapoint_id:
            batch_addresses, batch_indices, batch_datapoint_ids = batch
        else:
            batch_addresses, batch_indices = batch
            batch_datapoint_ids = None
        
        # TIME: Tokenization
        start_time = time.time()
        inputs = parser.tokenizer(
            list(batch_addresses),
            padding=True,
            truncation=True,
            max_length=128,  
            return_tensors="pt",
            return_offsets_mapping=True
        )
        model_inputs = {k: v.to(parser.device) for k, v in inputs.items() if k != 'offset_mapping'}
        time_tokenization += time.time() - start_time
        
        # TIME: GPU Inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # TIME: CPU Transfer
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = inputs["input_ids"].cpu()
        offset_mapping_cpu = inputs["offset_mapping"]
        
        # Cache frequently accessed objects
        id2label = parser.id2label
        tokenizer = parser.tokenizer
        time_cpu_transfer += time.time() - start_time
        
        # TIME: Entity Extraction (individual processing)
        start_time = time.time()
        batch_results = []
        for j, address in enumerate(batch_addresses):
            index = batch_indices[j]
            
            try:
                input_ids = input_ids_cpu[j]
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                predicted_labels = [id2label[pred.item()] 
                                  for pred in predicted_token_class_ids_cpu[j]]
                offset_mapping = offset_mapping_cpu[j]
                
                # This is likely the bottleneck
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
                    
            except Exception as e:
                entities = []
                print(f"Warning: Failed to parse address at index {index}: {str(e)}")
            
            batch_results.append((index, address, entities, batch_datapoint_ids[j] if has_datapoint_id else None))
        
        time_entity_extraction += time.time() - start_time
        
        # TIME: Result Building
        start_time = time.time()
        for index, address, entities, datapoint_id in batch_results:
            result = {
                "row_index": index,
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if has_datapoint_id:
                result["datapoint_id"] = datapoint_id
            
            # Store or stream result
            if all_results is not None:
                all_results.append(result)
            if result_file:
                result_file.write(json.dumps(result) + "\n")
        
        time_result_building += time.time() - start_time
        
        # TIME: GPU Cleanup
        start_time = time.time()
        del outputs, predictions, predicted_token_class_ids, predicted_token_class_ids_cpu, input_ids_cpu
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        time_cleanup += time.time() - start_time
    
    if result_file:
        result_file.close()
    
    # Print timing results
    total_time = time_tokenization + time_gpu_inference + time_cpu_transfer + time_entity_extraction + time_result_building + time_cleanup
    print(f"\n=== TIMING BREAKDOWN ===")
    print(f"Tokenization:      {time_tokenization:.2f}s ({time_tokenization/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Result Building:   {time_result_building:.2f}s ({time_result_building/total_time*100:.1f}%)")
    print(f"Cleanup:           {time_cleanup:.2f}s ({time_cleanup/total_time*100:.1f}%)")
    print(f"Total:             {total_time:.2f}s")
    
    # Build summary
    summary = {
        "total_addresses": total_addresses,
        "successful_parses": successful_parses,
        "failed_parses": total_addresses - successful_parses,
        "success_rate": successful_parses / total_addresses if total_addresses > 0 else 0,
        "timing": {
            "tokenization": time_tokenization,
            "gpu_inference": time_gpu_inference,
            "cpu_transfer": time_cpu_transfer,
            "entity_extraction": time_entity_extraction,
            "result_building": time_result_building,
            "cleanup": time_cleanup,
            "total": total_time
        }
    }
    
    return {
        "summary": summary,
        "results": all_results if all_results is not None else f"Streamed to {stream_results_path}"
    }

In [15]:

results = parse_addresses_from_csv_with_profiling(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=6144
)

Processing 33348 addresses in batches of 6144


Processing batches: 100%|██████████| 6/6 [01:29<00:00, 14.87s/batch]


=== TIMING BREAKDOWN ===
Tokenization:      3.74s (4.2%)
GPU Inference:     0.25s (0.3%)
CPU Transfer:      73.73s (82.7%)
Entity Extraction: 11.30s (12.7%)
Result Building:   0.06s (0.1%)
Cleanup:           0.12s (0.1%)
Total:             89.20s





In [5]:
def parse_addresses_from_csv_gpu_optimized(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address",
    index_column: Optional[str] = None,
    batch_size: int = 64,
    stream_results_path: Optional[str] = None
) -> Dict:
    """
    GPU-optimized version - minimal CPU transfers
    """
    from itertools import islice
    import json
    import time
    
    parser = AddressParserInference(model_path)
    
    # Validation (same as before)
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found. Available columns: {list(df.columns)}")
    
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    if index_column and index_column not in df.columns:
        raise ValueError(f"Index column '{index_column}' not found. Available columns: {list(df.columns)}")
    
    # Setup (same as before)
    addresses = df[target_column].fillna("").astype(str).values
    indices = df[index_column].values if index_column else df.index.values
    has_datapoint_id = "datapoint_id" in df.columns
    datapoint_ids = df["datapoint_id"].values if has_datapoint_id else None
    
    total_addresses = len(addresses)
    print(f"Processing {total_addresses} addresses in batches of {batch_size}")
    
    all_results = [] if stream_results_path is None else None
    result_file = open(stream_results_path, "w") if stream_results_path else None
    successful_parses = 0
    
    # Timing
    time_tokenization = 0
    time_gpu_inference = 0
    time_individual_processing = 0
    time_result_building = 0
    
    def batch_arrays(*arrays):
        for i in range(0, len(arrays[0]), batch_size):
            yield tuple(arr[i:i+batch_size] for arr in arrays)
    
    arrays_to_batch = [addresses, indices]
    if has_datapoint_id:
        arrays_to_batch.append(datapoint_ids)
    
    for batch in tqdm(batch_arrays(*arrays_to_batch), 
                     total=(total_addresses + batch_size - 1) // batch_size,
                     desc="Processing batches", unit="batch"):
        
        if has_datapoint_id:
            batch_addresses, batch_indices, batch_datapoint_ids = batch
        else:
            batch_addresses, batch_indices = batch
            batch_datapoint_ids = None
        
        # Tokenization
        start_time = time.time()
        inputs = parser.tokenizer(
            list(batch_addresses),
            padding=True,
            truncation=True,
            max_length=128,  
            return_tensors="pt",
            return_offsets_mapping=True
        )
        model_inputs = {k: v.to(parser.device) for k, v in inputs.items() if k != 'offset_mapping'}
        time_tokenization += time.time() - start_time
        
        # GPU Inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # CRITICAL: Process one item at a time to avoid massive CPU transfers
        start_time = time.time()
        for j, address in enumerate(batch_addresses):
            index = batch_indices[j]
            
            try:
                # Transfer only single item tensors to CPU
                input_ids_single = inputs["input_ids"][j].cpu()
                predicted_labels_single = predicted_token_class_ids[j].cpu()
                offset_mapping_single = inputs["offset_mapping"][j]
                
                # Convert to tokens and labels
                tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_single)
                predicted_labels = [parser.id2label[pred.item()] for pred in predicted_labels_single]
                
                # Extract entities
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping_single)
                if len(entities) > 0:
                    successful_parses += 1
                    
            except Exception as e:
                entities = []
                print(f"Warning: Failed to parse address at index {index}: {str(e)}")
            
            # Build and store result immediately
            result = {
                "row_index": index,
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if has_datapoint_id:
                result["datapoint_id"] = batch_datapoint_ids[j]
            
            if all_results is not None:
                all_results.append(result)
            if result_file:
                result_file.write(json.dumps(result) + "\n")
        
        time_individual_processing += time.time() - start_time
        
        # Cleanup GPU memory
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    if result_file:
        result_file.close()
    
    # Timing results
    total_time = time_tokenization + time_gpu_inference + time_individual_processing
    print(f"\n=== OPTIMIZED TIMING ===")
    print(f"Tokenization:         {time_tokenization:.2f}s ({time_tokenization/total_time*100:.1f}%)")
    print(f"GPU Inference:        {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"Individual Processing: {time_individual_processing:.2f}s ({time_individual_processing/total_time*100:.1f}%)")
    print(f"Total:                {total_time:.2f}s")
    
    summary = {
        "total_addresses": total_addresses,
        "successful_parses": successful_parses,
        "failed_parses": total_addresses - successful_parses,
        "success_rate": successful_parses / total_addresses if total_addresses > 0 else 0
    }
    
    return {
        "summary": summary,
        "results": all_results if all_results is not None else f"Streamed to {stream_results_path}"
    }


results = parse_addresses_from_csv_gpu_optimized(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=6144
)

NameError: name 'Optional' is not defined

In [8]:
def parse_addresses_from_csv_streaming(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address",
    index_column: Optional[str] = None,
    batch_size: int = 512,  # Much smaller!
    stream_results_path: Optional[str] = None
) -> Dict:
    """
    Streaming approach with smaller batches
    """
    from itertools import islice
    import json
    import time
    
    parser = AddressParserInference(model_path)
    
    # Validation
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found. Available columns: {list(df.columns)}")
    
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    if index_column and index_column not in df.columns:
        raise ValueError(f"Index column '{index_column}' not found. Available columns: {list(df.columns)}")
    
    # Setup
    addresses = df[target_column].fillna("").astype(str).values
    indices = df[index_column].values if index_column else df.index.values
    has_datapoint_id = "datapoint_id" in df.columns
    datapoint_ids = df["datapoint_id"].values if has_datapoint_id else None
    
    total_addresses = len(addresses)
    print(f"Processing {total_addresses} addresses in batches of {batch_size}")
    
    all_results = [] if stream_results_path is None else None
    result_file = open(stream_results_path, "w") if stream_results_path else None
    successful_parses = 0
    
    # Process in smaller chunks
    def batch_arrays(*arrays):
        for i in range(0, len(arrays[0]), batch_size):
            yield tuple(arr[i:i+batch_size] for arr in arrays)
    
    arrays_to_batch = [addresses, indices]
    if has_datapoint_id:
        arrays_to_batch.append(datapoint_ids)
    
    for batch in tqdm(batch_arrays(*arrays_to_batch), 
                     total=(total_addresses + batch_size - 1) // batch_size,
                     desc="Processing batches", unit="batch"):
        
        if has_datapoint_id:
            batch_addresses, batch_indices, batch_datapoint_ids = batch
        else:
            batch_addresses, batch_indices = batch
            batch_datapoint_ids = None
        
        # Tokenize smaller batch
        inputs = parser.tokenizer(
            list(batch_addresses),
            padding=True,
            truncation=True,
            max_length=128,  
            return_tensors="pt",
            return_offsets_mapping=True
        )
        
        model_inputs = {k: v.to(parser.device) for k, v in inputs.items() if k != 'offset_mapping'}
        
        # GPU inference on smaller batch
        with torch.no_grad():
            outputs = parser.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        
        # Now CPU transfer is much smaller and manageable
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = inputs["input_ids"].cpu()
        offset_mapping_cpu = inputs["offset_mapping"]
        
        # Process the smaller batch
        for j, address in enumerate(batch_addresses):
            index = batch_indices[j]
            
            try:
                input_ids = input_ids_cpu[j]
                tokens = parser.tokenizer.convert_ids_to_tokens(input_ids)
                predicted_labels = [parser.id2label[pred.item()] 
                                  for pred in predicted_token_class_ids_cpu[j]]
                offset_mapping = offset_mapping_cpu[j]
                
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
                    
            except Exception as e:
                entities = []
                print(f"Warning: Failed to parse address at index {index}: {str(e)}")
            
            # Build result
            result = {
                "row_index": index,
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if has_datapoint_id:
                result["datapoint_id"] = batch_datapoint_ids[j]
            
            # Stream immediately
            if all_results is not None:
                all_results.append(result)
            if result_file:
                result_file.write(json.dumps(result) + "\n")
        
        # Cleanup smaller tensors
        del outputs, predictions, predicted_token_class_ids, predicted_token_class_ids_cpu, input_ids_cpu
        torch.cuda.empty_cache()
    
    if result_file:
        result_file.close()
    
    summary = {
        "total_addresses": total_addresses,
        "successful_parses": successful_parses,
        "failed_parses": total_addresses - successful_parses,
        "success_rate": successful_parses / total_addresses if total_addresses > 0 else 0
    }
    
    return {
        "summary": summary,
        "results": all_results if all_results is not None else f"Streamed to {stream_results_path}"
    }

results = parse_addresses_from_csv_streaming(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=512
)

Processing 33348 addresses in batches of 4096


Processing batches: 100%|██████████| 9/9 [01:30<00:00, 10.11s/batch]


In [9]:
def parse_addresses_from_csv_streaming_with_profiling(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address",
    index_column: Optional[str] = None,
    batch_size: int = 512,
    stream_results_path: Optional[str] = None
) -> Dict:
    """
    Streaming approach with detailed timing
    """
    from itertools import islice
    import json
    import time
    
    parser = AddressParserInference(model_path)
    
    # Validation
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found. Available columns: {list(df.columns)}")
    
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    if index_column and index_column not in df.columns:
        raise ValueError(f"Index column '{index_column}' not found. Available columns: {list(df.columns)}")
    
    # Setup
    addresses = df[target_column].fillna("").astype(str).values
    indices = df[index_column].values if index_column else df.index.values
    has_datapoint_id = "datapoint_id" in df.columns
    datapoint_ids = df["datapoint_id"].values if has_datapoint_id else None
    
    total_addresses = len(addresses)
    print(f"Processing {total_addresses} addresses in batches of {batch_size}")
    
    all_results = [] if stream_results_path is None else None
    result_file = open(stream_results_path, "w") if stream_results_path else None
    successful_parses = 0
    
    # Timing variables
    time_tokenization = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    time_result_building = 0
    time_cleanup = 0
    
    # Process in smaller chunks
    def batch_arrays(*arrays):
        for i in range(0, len(arrays[0]), batch_size):
            yield tuple(arr[i:i+batch_size] for arr in arrays)
    
    arrays_to_batch = [addresses, indices]
    if has_datapoint_id:
        arrays_to_batch.append(datapoint_ids)
    
    for batch in tqdm(batch_arrays(*arrays_to_batch), 
                     total=(total_addresses + batch_size - 1) // batch_size,
                     desc="Processing batches", unit="batch"):
        
        if has_datapoint_id:
            batch_addresses, batch_indices, batch_datapoint_ids = batch
        else:
            batch_addresses, batch_indices = batch
            batch_datapoint_ids = None
        
        # TIME: Tokenization
        start_time = time.time()
        inputs = parser.tokenizer(
            list(batch_addresses),
            padding=True,
            truncation=True,
            max_length=128,  
            return_tensors="pt",
            return_offsets_mapping=True
        )
        model_inputs = {k: v.to(parser.device) for k, v in inputs.items() if k != 'offset_mapping'}
        time_tokenization += time.time() - start_time
        
        # TIME: GPU inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # TIME: CPU transfer (now smaller)
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = inputs["input_ids"].cpu()
        offset_mapping_cpu = inputs["offset_mapping"]
        
        # Cache frequently accessed objects
        id2label = parser.id2label
        tokenizer = parser.tokenizer
        time_cpu_transfer += time.time() - start_time
        
        # TIME: Entity extraction
        start_time = time.time()
        batch_results = []
        for j, address in enumerate(batch_addresses):
            index = batch_indices[j]
            
            try:
                input_ids = input_ids_cpu[j]
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                predicted_labels = [id2label[pred.item()] 
                                  for pred in predicted_token_class_ids_cpu[j]]
                offset_mapping = offset_mapping_cpu[j]
                
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
                    
            except Exception as e:
                entities = []
                print(f"Warning: Failed to parse address at index {index}: {str(e)}")
            
            batch_results.append((index, address, entities, batch_datapoint_ids[j] if has_datapoint_id else None))
        
        time_entity_extraction += time.time() - start_time
        
        # TIME: Result building
        start_time = time.time()
        for index, address, entities, datapoint_id in batch_results:
            result = {
                "row_index": index,
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if has_datapoint_id:
                result["datapoint_id"] = datapoint_id
            
            # Stream immediately
            if all_results is not None:
                all_results.append(result)
            if result_file:
                result_file.write(json.dumps(result) + "\n")
        
        time_result_building += time.time() - start_time
        
        # TIME: Cleanup
        start_time = time.time()
        del outputs, predictions, predicted_token_class_ids, predicted_token_class_ids_cpu, input_ids_cpu
        torch.cuda.empty_cache()
        time_cleanup += time.time() - start_time
    
    if result_file:
        result_file.close()
    
    # Print timing results
    total_time = time_tokenization + time_gpu_inference + time_cpu_transfer + time_entity_extraction + time_result_building + time_cleanup
    print(f"\n=== STREAMING TIMING BREAKDOWN ===")
    print(f"Tokenization:      {time_tokenization:.2f}s ({time_tokenization/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Result Building:   {time_result_building:.2f}s ({time_result_building/total_time*100:.1f}%)")
    print(f"Cleanup:           {time_cleanup:.2f}s ({time_cleanup/total_time*100:.1f}%)")
    print(f"Total:             {total_time:.2f}s")
    print(f"Average per batch: {total_time/((total_addresses + batch_size - 1) // batch_size):.3f}s")
    print(f"Items per second:  {total_addresses/total_time:.1f}")
    
    summary = {
        "total_addresses": total_addresses,
        "successful_parses": successful_parses,
        "failed_parses": total_addresses - successful_parses,
        "success_rate": successful_parses / total_addresses if total_addresses > 0 else 0,
        "timing": {
            "tokenization": time_tokenization,
            "gpu_inference": time_gpu_inference,
            "cpu_transfer": time_cpu_transfer,
            "entity_extraction": time_entity_extraction,
            "result_building": time_result_building,
            "cleanup": time_cleanup,
            "total": total_time,
            "items_per_second": total_addresses/total_time
        }
    }
    
    return {
        "summary": summary,
        "results": all_results if all_results is not None else f"Streamed to {stream_results_path}"
    }

In [12]:
results = parse_addresses_from_csv_streaming_with_profiling(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128
)

Processing 33348 addresses in batches of 128


Processing batches: 100%|██████████| 261/261 [00:40<00:00,  6.48batch/s]


=== STREAMING TIMING BREAKDOWN ===
Tokenization:      2.55s (6.4%)
GPU Inference:     8.12s (20.3%)
CPU Transfer:      21.65s (54.2%)
Entity Extraction: 6.82s (17.1%)
Result Building:   0.06s (0.2%)
Cleanup:           0.77s (1.9%)
Total:             39.98s
Average per batch: 0.153s
Items per second:  834.1





In [13]:
def parse_addresses_from_csv_streaming_with_cpu_transfer_optimization(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address",
    index_column: Optional[str] = None,
    batch_size: int = 512,
    stream_results_path: Optional[str] = None
) -> Dict:
    """
    Streaming approach with optimized CPU transfer.
    """
    import json
    import time

    parser = AddressParserInference(model_path)

    # Validation
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found. Available columns: {list(df.columns)}")

    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"

    if index_column and index_column not in df.columns:
        raise ValueError(f"Index column '{index_column}' not found. Available columns: {list(df.columns)}")

    addresses = df[target_column].fillna("").astype(str).values
    indices = df[index_column].values if index_column else df.index.values
    has_datapoint_id = "datapoint_id" in df.columns
    datapoint_ids = df["datapoint_id"].values if has_datapoint_id else None

    total_addresses = len(addresses)
    print(f"Processing {total_addresses} addresses in batches of {batch_size}")

    all_results = [] if stream_results_path is None else None
    result_file = open(stream_results_path, "w") if stream_results_path else None
    successful_parses = 0

    # Timing variables
    time_tokenization = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    time_result_building = 0
    time_cleanup = 0

    def batch_arrays(*arrays):
        for i in range(0, len(arrays[0]), batch_size):
            yield tuple(arr[i:i+batch_size] for arr in arrays)

    arrays_to_batch = [addresses, indices]
    if has_datapoint_id:
        arrays_to_batch.append(datapoint_ids)

    for batch in tqdm(batch_arrays(*arrays_to_batch),
                      total=(total_addresses + batch_size - 1) // batch_size,
                      desc="Processing batches", unit="batch"):

        if has_datapoint_id:
            batch_addresses, batch_indices, batch_datapoint_ids = batch
        else:
            batch_addresses, batch_indices = batch
            batch_datapoint_ids = None

        # TIME: Tokenization
        start_time = time.time()
        inputs = parser.tokenizer(
            list(batch_addresses),
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
            return_offsets_mapping=True
        )
        model_inputs = {k: v.to(parser.device) for k, v in inputs.items() if k != 'offset_mapping'}
        time_tokenization += time.time() - start_time

        # TIME: GPU inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(**model_inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time

        # Cache frequently accessed objects
        id2label = parser.id2label
        tokenizer = parser.tokenizer

        # TIME: Entity extraction + CPU transfer (optimized)
        start_time = time.time()
        batch_results = []
        for j, address in enumerate(batch_addresses):
            index = batch_indices[j]
            try:
                # Transfer only the relevant slices to CPU, not the whole batch
                input_ids = inputs["input_ids"][j].cpu()
                offset_mapping = inputs["offset_mapping"][j]  # already on CPU
                pred_token_class_ids = predicted_token_class_ids[j].cpu()
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                predicted_labels = [id2label[pred.item()] for pred in pred_token_class_ids]
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
                print(f"Warning: Failed to parse address at index {index}: {str(e)}")
            batch_results.append((index, address, entities, batch_datapoint_ids[j] if has_datapoint_id else None))
        time_cpu_transfer += time.time() - start_time  # now includes both transfer and extraction

        # TIME: Result building
        start_time = time.time()
        for index, address, entities, datapoint_id in batch_results:
            result = {
                "row_index": index,
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            if has_datapoint_id:
                result["datapoint_id"] = datapoint_id
            if all_results is not None:
                all_results.append(result)
            if result_file:
                result_file.write(json.dumps(result) + "\n")
        time_result_building += time.time() - start_time

        # TIME: Cleanup
        start_time = time.time()
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
        time_cleanup += time.time() - start_time

    if result_file:
        result_file.close()

    total_time = time_tokenization + time_gpu_inference + time_cpu_transfer + time_result_building + time_cleanup
    print(f"\n=== STREAMING TIMING BREAKDOWN (CPU Transfer Optimized) ===")
    print(f"Tokenization:      {time_tokenization:.2f}s ({time_tokenization/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer+Entity Extraction: {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Result Building:   {time_result_building:.2f}s ({time_result_building/total_time*100:.1f}%)")
    print(f"Cleanup:           {time_cleanup:.2f}s ({time_cleanup/total_time*100:.1f}%)")
    print(f"Total:             {total_time:.2f}s")
    print(f"Average per batch: {total_time/((total_addresses + batch_size - 1) // batch_size):.3f}s")
    print(f"Items per second:  {total_addresses/total_time:.1f}")

    summary = {
        "total_addresses": total_addresses,
        "successful_parses": successful_parses,
        "failed_parses": total_addresses - successful_parses,
        "success_rate": successful_parses / total_addresses if total_addresses > 0 else 0,
        "timing": {
            "tokenization": time_tokenization,
            "gpu_inference": time_gpu_inference,
            "cpu_transfer_and_extraction": time_cpu_transfer,
            "result_building": time_result_building,
            "cleanup": time_cleanup,
            "total": total_time,
            "items_per_second": total_addresses/total_time
        }
    }

    return {
        "summary": summary,
        "results": all_results if all_results is not None else f"Streamed to {stream_results_path}"
    }

In [15]:
results = parse_addresses_from_csv_streaming_with_cpu_transfer_optimization(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128
)

Processing 33348 addresses in batches of 128


Processing batches: 100%|██████████| 261/261 [00:41<00:00,  6.32batch/s]


=== STREAMING TIMING BREAKDOWN (CPU Transfer Optimized) ===
Tokenization:      2.56s (6.3%)
GPU Inference:     8.08s (19.7%)
CPU Transfer+Entity Extraction: 29.57s (72.1%)
Result Building:   0.06s (0.1%)
Cleanup:           0.73s (1.8%)
Total:             40.99s
Average per batch: 0.157s
Items per second:  813.5





In [18]:
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
import time
from tqdm import tqdm
from typing import Dict, Optional
import pandas as pd

def parse_addresses_with_dataloader(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 512,
    num_workers: int = 4
) -> Dict:
    """
    Using HuggingFace Dataset + DataLoader approach with optimized processing
    """
    # Set multiprocessing start method to avoid CUDA issues
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass  # Already set
    
    parser = AddressParserInference(model_path)
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create HuggingFace Dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function that preserves original data
    def tokenize_function(examples):
        # Create a CPU-only tokenizer to avoid CUDA issues in multiprocessing
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        tokenized = tokenizer(
            examples["address"],
            padding=False,  # DataLoader will handle padding
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        # Keep the original address and metadata
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Apply tokenization with fallback for multiprocessing issues
    try:
        print("Attempting parallel tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=2,  # Reduced to avoid issues
            remove_columns=[]  # Keep all columns
        )
        print("Parallel tokenization successful!")
    except Exception as e:
        print(f"Parallel tokenization failed: {e}")
        print("Falling back to single-process tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=None,  # Single process
            remove_columns=[]  # Keep all columns
        )
        print("Single-process tokenization complete!")
    
    # Custom collate function for efficient batching
    def collate_fn(batch):
        # Extract data from batch items
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]  # Direct access from batch
        
        # Efficient padding
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader with conservative settings to avoid multiprocessing issues
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,  # Disable multiprocessing for data loading
        pin_memory=True,  # Faster GPU transfers
        shuffle=False  # Maintain order
    )
    
    print(f"Processing {len(dataset)} addresses in batches of {batch_size}")
    
    # Initialize results and timing
    all_results = []
    successful_parses = 0
    
    time_data_loading = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    
    # Process batches
    for batch in tqdm(dataloader, desc="Processing batches"):
        # Data loading timing
        start_time = time.time()
        
        # Move tensors to GPU
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        time_data_loading += time.time() - start_time
        
        # GPU Inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # CPU Transfer (Fixed: removed non_blocking argument from cpu())
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()  # Ensure GPU operations are complete
        time_cpu_transfer += time.time() - start_time
        
        # Entity Extraction
        start_time = time.time()
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        time_entity_extraction += time.time() - start_time
        
        # Memory cleanup
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    # Print detailed timing information
    total_time = time_data_loading + time_gpu_inference + time_cpu_transfer + time_entity_extraction
    print(f"\n=== DATALOADER PERFORMANCE ANALYSIS ===")
    print(f"Data Loading:      {time_data_loading:.2f}s ({time_data_loading/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Total Time:        {total_time:.2f}s")
    print(f"Processing Speed:  {len(dataset)/total_time:.1f} addresses/second")
    print(f"Success Rate:      {successful_parses/len(dataset)*100:.1f}%")
    
    return {
        "summary": {
            "total_addresses": len(dataset),
            "successful_parses": successful_parses,
            "failed_parses": len(dataset) - successful_parses,
            "success_rate": successful_parses / len(dataset) if len(dataset) > 0 else 0,
            "processing_time": total_time,
            "addresses_per_second": len(dataset) / total_time if total_time > 0 else 0
        },
        "results": all_results
    }

# Usage
results = parse_addresses_with_dataloader(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128
)

Attempting parallel tokenization...


Map (num_proc=2):   0%|          | 0/33348 [00:00<?, ? examples/s]

Parallel tokenization failed: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.
Falling back to single-process tokenization...


Map:   0%|          | 0/33348 [00:00<?, ? examples/s]

Single-process tokenization complete!
Processing 33348 addresses in batches of 128


Processing batches: 100%|██████████| 261/261 [00:45<00:00,  5.70it/s]


=== DATALOADER PERFORMANCE ANALYSIS ===
Data Loading:      0.04s (0.1%)
GPU Inference:     8.14s (24.6%)
CPU Transfer:      20.09s (60.8%)
Entity Extraction: 4.77s (14.4%)
Total Time:        33.04s
Processing Speed:  1009.4 addresses/second
Success Rate:      0.0%





In [19]:
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
import time
from tqdm import tqdm
from typing import Dict, Optional
import pandas as pd

def parse_addresses_with_dataloader_debug(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 512,
    num_workers: int = 4,
    debug_first_batch: bool = True
) -> Dict:
    """
    Debug version to identify entity extraction issues
    """
    # Set multiprocessing start method to avoid CUDA issues
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass  # Already set
    
    parser = AddressParserInference(model_path)
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create HuggingFace Dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function that preserves original data
    def tokenize_function(examples):
        # IMPORTANT: Use the same tokenizer as the parser!
        tokenized = parser.tokenizer(  # Changed from creating new tokenizer
            examples["address"],
            padding=False,
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        # Keep the original address and metadata
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Apply tokenization (disable parallel for debugging)
    print("Applying tokenization...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        batch_size=1000,
        num_proc=None,  # Single process for debugging
        remove_columns=[]
    )
    print("Tokenization complete!")
    
    # Custom collate function
    def collate_fn(batch):
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]
        
        # Efficient padding
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True,
        shuffle=False
    )
    
    print(f"Processing {len(dataset)} addresses in batches of {batch_size}")
    
    all_results = []
    successful_parses = 0
    batch_count = 0
    
    for batch in tqdm(dataloader, desc="Processing batches"):
        batch_count += 1
        
        # Move tensors to GPU
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        # GPU Inference
        with torch.no_grad():
            outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        
        # CPU Transfer
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()
        
        # Debug first batch in detail
        if debug_first_batch and batch_count == 1:
            print("\n=== DEBUGGING FIRST BATCH ===")
            
            # Check first address in detail
            first_idx = 0
            address = batch["addresses"][first_idx]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[first_idx])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[first_idx]]
            offset_mapping = batch["offset_mapping"][first_idx]
            
            print(f"Address: {address}")
            print(f"Tokens: {tokens[:20]}...")  # First 20 tokens
            print(f"Labels: {predicted_labels[:20]}...")  # First 20 labels
            print(f"Unique labels in prediction: {set(predicted_labels)}")
            print(f"Label distribution: {dict(zip(*np.unique(predicted_labels, return_counts=True)))}")
            print(f"Offset mapping length: {len(offset_mapping)}")
            print(f"Model id2label keys: {list(parser.id2label.keys())[:10]}...")
            print(f"Model id2label values: {list(parser.id2label.values())[:10]}...")
            
            # Try entity extraction with debug
            try:
                print("\nAttempting entity extraction...")
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                print(f"Extracted entities: {entities}")
                if len(entities) == 0:
                    print("WARNING: No entities extracted!")
                    
                    # Check if all predictions are "O" (outside)
                    non_o_labels = [label for label in predicted_labels if label != "O"]
                    print(f"Non-O labels: {non_o_labels[:10]}...")
                    print(f"Total non-O labels: {len(non_o_labels)}")
                    
            except Exception as e:
                print(f"Entity extraction failed with error: {e}")
                import traceback
                traceback.print_exc()
            
            print("=== END DEBUG ===\n")
        
        # Entity Extraction for all items in batch
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
                if batch_count == 1 and j < 3:  # Debug first few items
                    print(f"Entity extraction error for item {j}: {e}")
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities)
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        # Memory cleanup
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
        
        # Stop after first batch for debugging
        if debug_first_batch and batch_count == 1:
            print(f"Debug mode: Stopping after first batch. Found {successful_parses} successful parses out of {len(batch['addresses'])} addresses.")
            break
    
    return {
        "summary": {
            "total_addresses": len(all_results),
            "successful_parses": successful_parses,
            "failed_parses": len(all_results) - successful_parses,
            "success_rate": successful_parses / len(all_results) if len(all_results) > 0 else 0
        },
        "results": all_results
    }

# Run debug version
debug_results = parse_addresses_with_dataloader_debug(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128,
    debug_first_batch=True
)


Applying tokenization...


Map:   0%|          | 0/33348 [00:00<?, ? examples/s]

Tokenization complete!
Processing 33348 addresses in batches of 128


Processing batches:   0%|          | 0/261 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/tmp/ipykernel_2027/3151257145.py", line 160, in parse_addresses_with_dataloader_debug
    entities = parser._extract_entities(address, tokens, predicted_labels, offset_mapping)
  File "/teamspace/studios/this_studio/enhance_ocod/src/enhance_ocod/inference_utils.py", line 129, in _extract_entities
    start_pos, end_pos = offset.tolist()
AttributeError: 'list' object has no attribute 'tolist'
Processing batches:   0%|          | 0/261 [00:00<?, ?it/s]


=== DEBUGGING FIRST BATCH ===
Address: 161, 163, 165, 167 and 169 uxbridge road, ealing
Tokens: ['[CLS]', '161', ',', 'Ġ163', ',', 'Ġ165', ',', 'Ġ167', 'Ġand', 'Ġ169', 'Ġu', 'x', 'bridge', 'Ġroad', ',', 'Ġe', 'aling', '[SEP]', '[PAD]', '[PAD]']...
Labels: ['O', 'B-street_number', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-street_name', 'I-street_name', 'I-street_name', 'O', 'O', 'O', 'O', 'O', 'O']...
Unique labels in prediction: {'B-street_number', 'I-street_name', 'O'}
Label distribution: {'B-street_number': 1, 'I-street_name': 3, 'O': 22}
Offset mapping length: 18
Model id2label keys: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]...
Model id2label values: ['O', 'B-building_name', 'I-building_name', 'B-street_name', 'I-street_name', 'B-street_number', 'I-street_number', 'B-filter_type', 'I-filter_type', 'B-unit_id']...

Attempting entity extraction...
Entity extraction failed with error: 'list' object has no attribute 'tolist'
=== END DEBUG ===

Entity extraction error for item 0: 'list' obje




In [10]:
def parse_addresses_with_dataloader_fixed(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 512,
    num_workers: int = 4
) -> Dict:
    """
    Fixed version with proper offset mapping handling
    """
    # Set multiprocessing start method to avoid CUDA issues
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass  # Already set
    
    parser = AddressParserInference(model_path)
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create HuggingFace Dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function
    def tokenize_function(examples):
        tokenized = parser.tokenizer(
            examples["address"],
            padding=False,
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        # Keep the original address and metadata
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Apply tokenization
    try:
        print("Attempting parallel tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=2,
            remove_columns=[]
        )
        print("Parallel tokenization successful!")
    except Exception as e:
        print(f"Parallel tokenization failed: {e}")
        print("Falling back to single-process tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=None,
            remove_columns=[]
        )
        print("Single-process tokenization complete!")
    
    # Fixed entity extraction function
    def extract_entities_fixed(address, tokens, predicted_labels, offset_mapping):
        """
        Fixed version that handles both tensor and list offset mappings
        """
        entities = []
        current_entity = None
        
        for i, (token, label, offset) in enumerate(zip(tokens, predicted_labels, offset_mapping)):
            # Skip special tokens
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
                
            # Handle offset mapping (could be tensor or list)
            if hasattr(offset, 'tolist'):
                start_pos, end_pos = offset.tolist()  # It's a tensor
            else:
                start_pos, end_pos = offset  # It's already a list
            
            # Skip tokens with no mapping (like padding)
            if start_pos == end_pos:
                continue
                
            if label.startswith('B-'):
                # Beginning of new entity
                if current_entity:
                    entities.append(current_entity)
                
                entity_type = label[2:]  # Remove 'B-' prefix
                current_entity = {
                    'type': entity_type,
                    'start': start_pos,
                    'end': end_pos,
                    'text': address[start_pos:end_pos]
                }
                
            elif label.startswith('I-') and current_entity:
                # Inside entity (continuation)
                entity_type = label[2:]  # Remove 'I-' prefix
                if entity_type == current_entity['type']:
                    # Extend current entity
                    current_entity['end'] = end_pos
                    current_entity['text'] = address[current_entity['start']:end_pos]
                else:
                    # Different entity type, start new one
                    entities.append(current_entity)
                    current_entity = {
                        'type': entity_type,
                        'start': start_pos,
                        'end': end_pos,
                        'text': address[start_pos:end_pos]
                    }
            else:
                # 'O' label or mismatched I- label
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        
        # Don't forget the last entity
        if current_entity:
            entities.append(current_entity)
            
        return entities
    
    # Custom collate function
    def collate_fn(batch):
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]
        
        # Efficient padding
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True,
        shuffle=False
    )
    
    print(f"Processing {len(dataset)} addresses in batches of {batch_size}")
    
    # Initialize results and timing
    all_results = []
    successful_parses = 0
    
    time_data_loading = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    
    # Process batches
    for batch in tqdm(dataloader, desc="Processing batches"):
        # Data loading timing
        start_time = time.time()
        
        # Move tensors to GPU
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        time_data_loading += time.time() - start_time
        
        # GPU Inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # CPU Transfer
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()
        time_cpu_transfer += time.time() - start_time
        
        # Entity Extraction
        start_time = time.time()
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                # Use our fixed entity extraction
                entities = extract_entities_fixed(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
                print(f"Entity extraction error: {e}")
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities) if hasattr(parser, '_group_entities_by_type') else {}
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        time_entity_extraction += time.time() - start_time
        
        # Memory cleanup
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    # Print detailed timing information
    total_time = time_data_loading + time_gpu_inference + time_cpu_transfer + time_entity_extraction
    print(f"\n=== DATALOADER PERFORMANCE ANALYSIS ===")
    print(f"Data Loading:      {time_data_loading:.2f}s ({time_data_loading/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Total Time:        {total_time:.2f}s")
    print(f"Processing Speed:  {len(dataset)/total_time:.1f} addresses/second")
    print(f"Success Rate:      {successful_parses/len(dataset)*100:.1f}%")
    
    return {
        "summary": {
            "total_addresses": len(dataset),
            "successful_parses": successful_parses,
            "failed_parses": len(dataset) - successful_parses,
            "success_rate": successful_parses / len(dataset) if len(dataset) > 0 else 0,
            "processing_time": total_time,
            "addresses_per_second": len(dataset) / total_time if total_time > 0 else 0
        },
        "results": all_results
    }

# Test the fixed version
results = parse_addresses_with_dataloader_fixed(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128
)

Attempting parallel tokenization...


Map (num_proc=2):   0%|          | 0/33348 [00:00<?, ? examples/s]

Process ForkPoolWorker-5:
Traceback (most recent call last):
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/multiprocess/pool.py", line 114, in worker
    task = get()
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/multiprocess/queues.py", line 370, in get
    return _ForkingPickler.loads(res)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/dill/_dill.py", line 303, in loads
    return load(file, ignore, **kwds)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/dill/_dill.py", line 289, in load
    return Unpickler(file, ignore=ignore, **kwds).load()
  File "/home/zeus/miniconda3/e

Parallel tokenization failed: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.
Falling back to single-process tokenization...


Map:   0%|          | 0/33348 [00:00<?, ? examples/s]

Single-process tokenization complete!
Processing 33348 addresses in batches of 128


Processing batches: 100%|██████████| 261/261 [00:48<00:00,  5.38it/s]


=== DATALOADER PERFORMANCE ANALYSIS ===
Data Loading:      0.04s (0.1%)
GPU Inference:     11.50s (31.5%)
CPU Transfer:      19.77s (54.1%)
Entity Extraction: 5.23s (14.3%)
Total Time:        36.54s
Processing Speed:  912.6 addresses/second
Success Rate:      98.8%





In [11]:
def parse_addresses_with_dataloader_final(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 512,
    num_workers: int = 4
) -> Dict:
    """
    Final optimized version with working multiprocessing
    """
    # Set multiprocessing start method to avoid CUDA issues
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass  # Already set
    
    parser = AddressParserInference(model_path)
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create HuggingFace Dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function that creates a fresh tokenizer (no CUDA dependencies)
    def tokenize_function(examples):
        # Import here to avoid issues with multiprocessing
        from transformers import AutoTokenizer
        
        # Create a completely fresh tokenizer in each worker process
        # This avoids CUDA state from the main process
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        tokenized = tokenizer(
            examples["address"],
            padding=False,
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        # Keep the original address and metadata
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Apply tokenization with better error handling
    try:
        print("Attempting parallel tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=4,  # Try more workers since we fixed the CUDA issue
            remove_columns=[],
            desc="Tokenizing"  # Progress bar for tokenization
        )
        print("Parallel tokenization successful!")
    except Exception as e:
        print(f"Parallel tokenization failed: {e}")
        print("Falling back to single-process tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=None,
            remove_columns=[],
            desc="Tokenizing (single-process)"
        )
        print("Single-process tokenization complete!")
    
    # Fixed entity extraction function
    def extract_entities_fixed(address, tokens, predicted_labels, offset_mapping):
        """
        Fixed version that handles both tensor and list offset mappings
        """
        entities = []
        current_entity = None
        
        for i, (token, label, offset) in enumerate(zip(tokens, predicted_labels, offset_mapping)):
            # Skip special tokens
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
                
            # Handle offset mapping (could be tensor or list)
            if hasattr(offset, 'tolist'):
                start_pos, end_pos = offset.tolist()  # It's a tensor
            else:
                start_pos, end_pos = offset  # It's already a list
            
            # Skip tokens with no mapping (like padding)
            if start_pos == end_pos:
                continue
                
            if label.startswith('B-'):
                # Beginning of new entity
                if current_entity:
                    entities.append(current_entity)
                
                entity_type = label[2:]  # Remove 'B-' prefix
                current_entity = {
                    'type': entity_type,
                    'start': start_pos,
                    'end': end_pos,
                    'text': address[start_pos:end_pos]
                }
                
            elif label.startswith('I-') and current_entity:
                # Inside entity (continuation)
                entity_type = label[2:]  # Remove 'I-' prefix
                if entity_type == current_entity['type']:
                    # Extend current entity
                    current_entity['end'] = end_pos
                    current_entity['text'] = address[current_entity['start']:end_pos]
                else:
                    # Different entity type, start new one
                    entities.append(current_entity)
                    current_entity = {
                        'type': entity_type,
                        'start': start_pos,
                        'end': end_pos,
                        'text': address[start_pos:end_pos]
                    }
            else:
                # 'O' label or mismatched I- label
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        
        # Don't forget the last entity
        if current_entity:
            entities.append(current_entity)
            
        return entities
    
    # Custom collate function
    def collate_fn(batch):
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]
        
        # Efficient padding
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,  # Keep this at 0 since tokenization is the bottleneck
        pin_memory=True,
        shuffle=False
    )
    
    print(f"Processing {len(dataset)} addresses in batches of {batch_size}")
    
    # Initialize results and timing
    all_results = []
    successful_parses = 0
    
    time_data_loading = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    
    # Process batches
    for batch in tqdm(dataloader, desc="Processing batches"):
        # Data loading timing
        start_time = time.time()
        
        # Move tensors to GPU
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        time_data_loading += time.time() - start_time
        
        # GPU Inference
        start_time = time.time()
        with torch.no_grad():
            outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # CPU Transfer
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()
        time_cpu_transfer += time.time() - start_time
        
        # Entity Extraction
        start_time = time.time()
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                # Use our fixed entity extraction
                entities = extract_entities_fixed(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities) if hasattr(parser, '_group_entities_by_type') else {}
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        time_entity_extraction += time.time() - start_time
        
        # Memory cleanup
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    # Print detailed timing information
    total_time = time_data_loading + time_gpu_inference + time_cpu_transfer + time_entity_extraction
    print(f"\n=== FINAL PERFORMANCE ANALYSIS ===")
    print(f"Data Loading:      {time_data_loading:.2f}s ({time_data_loading/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Total Time:        {total_time:.2f}s")
    print(f"Processing Speed:  {len(dataset)/total_time:.1f} addresses/second")
    print(f"Success Rate:      {successful_parses/len(dataset)*100:.1f}%")
    
    return {
        "summary": {
            "total_addresses": len(dataset),
            "successful_parses": successful_parses,
            "failed_parses": len(dataset) - successful_parses,
            "success_rate": successful_parses / len(dataset) if len(dataset) > 0 else 0,
            "processing_time": total_time,
            "addresses_per_second": len(dataset) / total_time if total_time > 0 else 0
        },
        "results": all_results
    }

# Test the final version
results = parse_addresses_with_dataloader_final(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128
)

Attempting parallel tokenization...


Tokenizing (num_proc=4):   0%|          | 0/33348 [00:00<?, ? examples/s]

Parallel tokenization successful!
Processing 33348 addresses in batches of 128


Processing batches: 100%|██████████| 261/261 [00:45<00:00,  5.69it/s]


=== FINAL PERFORMANCE ANALYSIS ===
Data Loading:      0.04s (0.1%)
GPU Inference:     8.14s (24.5%)
CPU Transfer:      20.03s (60.2%)
Entity Extraction: 5.08s (15.3%)
Total Time:        33.29s
Processing Speed:  1001.7 addresses/second
Success Rate:      98.8%





In [15]:
import torch.multiprocessing as mp
from transformers import pipeline
import torch
import time
from tqdm import tqdm
from typing import Dict, Optional
import pandas as pd

def parse_addresses_with_optimized_pipeline(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 256,
    use_fp16: bool = True
) -> Dict:
    """
    Optimized version using HuggingFace Pipeline + Mixed Precision
    Let the pipeline handle all batching internally!
    """
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create optimized pipeline with mixed precision
    print(f"Loading model with mixed precision: {use_fp16}")
    
    device = 0 if torch.cuda.is_available() else -1
    torch_dtype = torch.float16 if use_fp16 and torch.cuda.is_available() else torch.float32
    
    # The key optimization: let the pipeline handle batching internally
    classifier = pipeline(
        "token-classification",
        model=model_path,
        tokenizer=model_path,
        device=device,
        torch_dtype=torch_dtype,
        aggregation_strategy="simple",
        batch_size=batch_size  # This tells the pipeline its internal batch size
    )
    
    # Prepare data
    addresses = df[target_column].fillna("").astype(str).tolist()
    row_indices = df[index_column].tolist() if index_column else df.index.tolist()
    datapoint_ids = df["datapoint_id"].tolist() if "datapoint_id" in df.columns else None
    
    print(f"Processing {len(addresses)} addresses with optimized pipeline (internal batch_size={batch_size})")
    print("Let the pipeline handle all batching and optimization...")
    
    # Initialize timing
    total_start = time.time()
    pipeline_start = time.time()
    
    # The magic: let the pipeline process ALL addresses at once!
    # It will handle batching, GPU optimization, everything internally
    try:
        all_predictions = classifier(addresses)
        pipeline_time = time.time() - pipeline_start
        print(f"Pipeline completed processing {len(addresses)} addresses in {pipeline_time:.2f}s")
    except Exception as e:
        print(f"Full batch processing failed: {e}")
        print("This might be due to memory constraints, falling back to manual batching...")
        
        # Fallback to manual batching if memory is an issue
        all_predictions = []
        pipeline_time = 0
        
        # Process in smaller chunks
        chunk_size = batch_size
        for i in tqdm(range(0, len(addresses), chunk_size), desc="Processing chunks"):
            chunk_start = time.time()
            chunk = addresses[i:i + chunk_size]
            chunk_predictions = classifier(chunk)
            all_predictions.extend(chunk_predictions)
            pipeline_time += time.time() - chunk_start
    
    # Post-process results
    postprocess_start = time.time()
    all_results = []
    successful_parses = 0
    
    for i, (address, entities) in enumerate(zip(addresses, all_predictions)):
        # Convert pipeline output to our format
        converted_entities = []
        
        if entities and isinstance(entities, list):
            for entity in entities:
                if isinstance(entity, dict):
                    converted_entities.append({
                        'type': entity.get('entity_group', entity.get('entity', 'UNKNOWN')),
                        'text': entity.get('word', ''),
                        'start': entity.get('start', 0),
                        'end': entity.get('end', 0),
                        'confidence': entity.get('score', 0.0)
                    })
        
        # Group entities by type
        parsed_components = {}
        for entity in converted_entities:
            entity_type = entity['type']
            if entity_type not in parsed_components:
                parsed_components[entity_type] = []
            parsed_components[entity_type].append(entity)
        
        if len(converted_entities) > 0:
            successful_parses += 1
        
        result = {
            "row_index": row_indices[i],
            "original_address": address,
            "entities": converted_entities,
            "parsed_components": parsed_components
        }
        
        if datapoint_ids and datapoint_ids[i]:
            result["datapoint_id"] = datapoint_ids[i]
        
        all_results.append(result)
    
    postprocess_time = time.time() - postprocess_start
    total_time = time.time() - total_start
    
    # Calculate metrics
    total_entities = sum(len(r['entities']) for r in all_results)
    avg_confidence = (
        sum(sum(e['confidence'] for e in r['entities']) for r in all_results if r['entities']) / max(total_entities, 1)
        if total_entities > 0 else 0.0
    )
    
    # Print detailed timing information
    print(f"\n=== OPTIMIZED PIPELINE PERFORMANCE ANALYSIS ===")
    print(f"Pipeline Processing:   {pipeline_time:.2f}s ({pipeline_time/total_time*100:.1f}%)")
    print(f"Post-processing:       {postprocess_time:.2f}s ({postprocess_time/total_time*100:.1f}%)")
    print(f"Other overhead:        {(total_time - pipeline_time - postprocess_time):.2f}s ({(total_time - pipeline_time - postprocess_time)/total_time*100:.1f}%)")
    print(f"Total Time:            {total_time:.2f}s")
    print(f"Processing Speed:      {len(addresses)/total_time:.1f} addresses/second")
    print(f"Success Rate:          {successful_parses/len(addresses)*100:.1f}%")
    print(f"Mixed Precision:       {'Enabled (FP16)' if use_fp16 else 'Disabled (FP32)'}")
    print(f"Average confidence:    {avg_confidence:.3f}")
    print(f"Pipeline batch size:   {batch_size}")
    
    return {
        "summary": {
            "total_addresses": len(addresses),
            "successful_parses": successful_parses,
            "failed_parses": len(addresses) - successful_parses,
            "success_rate": successful_parses / len(addresses) if len(addresses) > 0 else 0,
            "processing_time": total_time,
            "addresses_per_second": len(addresses) / total_time if total_time > 0 else 0,
            "pipeline_time": pipeline_time,
            "postprocess_time": postprocess_time,
            "mixed_precision_enabled": use_fp16,
            "average_confidence": avg_confidence
        },
        "results": all_results
    }

# Test with proper pipeline usage
results_optimized = parse_addresses_with_optimized_pipeline(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=128,  # Can be much larger now since pipeline handles it efficiently
    use_fp16=True
)

# You can also try different batch sizes to see what works best:
# batch_size=256, 512, 1024, etc.

Loading model with mixed precision: True


Device set to use cuda:0


Processing 33348 addresses with optimized pipeline (internal batch_size=128)
Let the pipeline handle all batching and optimization...
Pipeline completed processing 33348 addresses in 74.53s

=== OPTIMIZED PIPELINE PERFORMANCE ANALYSIS ===
Pipeline Processing:   74.53s (99.7%)
Post-processing:       0.25s (0.3%)
Other overhead:        0.00s (0.0%)
Total Time:            74.78s
Processing Speed:      446.0 addresses/second
Success Rate:          100.0%
Mixed Precision:       Enabled (FP16)
Average confidence:    0.925
Pipeline batch size:   128


In [24]:
def parse_addresses_with_custom_optimizations_final(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 256,
    use_fp16: bool = True
) -> Dict:
    """
    Final optimized version with deprecation warnings fixed
    """
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass
    
    parser = AddressParserInference(model_path)
    print(f"Using mixed precision: {use_fp16}")
    
    # Setup index handling
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Create HuggingFace Dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function
    def tokenize_function(examples):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        tokenized = tokenizer(
            examples["address"],
            padding=False,
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Tokenize with parallel processing
    try:
        print("Attempting parallel tokenization...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=4,
            remove_columns=[],
            desc="Tokenizing"
        )
        print("Parallel tokenization successful!")
    except Exception as e:
        print(f"Parallel tokenization failed, using single process...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            remove_columns=[]
        )
    
    # Entity extraction function
    def extract_entities_optimized(address, tokens, predicted_labels, offset_mapping):
        entities = []
        current_entity = None
        
        for token, label, offset in zip(tokens, predicted_labels, offset_mapping):
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
                
            if hasattr(offset, 'tolist'):
                start_pos, end_pos = offset.tolist()
            else:
                start_pos, end_pos = offset
            
            if start_pos == end_pos:
                continue
                
            if label.startswith('B-'):
                if current_entity:
                    entities.append(current_entity)
                
                entity_type = label[2:]
                current_entity = {
                    'type': entity_type,
                    'start': start_pos,
                    'end': end_pos,
                    'text': address[start_pos:end_pos]
                }
                
            elif label.startswith('I-') and current_entity:
                entity_type = label[2:]
                if entity_type == current_entity['type']:
                    current_entity['end'] = end_pos
                    current_entity['text'] = address[current_entity['start']:end_pos]
                else:
                    entities.append(current_entity)
                    current_entity = {
                        'type': entity_type,
                        'start': start_pos,
                        'end': end_pos,
                        'text': address[start_pos:end_pos]
                    }
            else:
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        
        if current_entity:
            entities.append(current_entity)
            
        return entities
    
    # Collate function
    def collate_fn(batch):
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]
        
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True,
        shuffle=False
    )
    
    print(f"Processing {len(dataset)} addresses in batches of {batch_size}")
    
    all_results = []
    successful_parses = 0
    
    time_data_loading = 0
    time_gpu_inference = 0
    time_cpu_transfer = 0
    time_entity_extraction = 0
    
    # Process batches with FIXED mixed precision (no deprecation warning)
    for batch in tqdm(dataloader, desc="Processing batches"):
        start_time = time.time()
        
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        time_data_loading += time.time() - start_time
        
        # GPU Inference with FIXED mixed precision syntax
        start_time = time.time()
        with torch.no_grad():
            if use_fp16:
                # FIXED: Use the new autocast syntax
                with torch.amp.autocast('cuda'):
                    outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            else:
                outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        time_gpu_inference += time.time() - start_time
        
        # CPU Transfer
        start_time = time.time()
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()
        time_cpu_transfer += time.time() - start_time
        
        # Entity Extraction
        start_time = time.time()
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                entities = extract_entities_optimized(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception as e:
                entities = []
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities) if hasattr(parser, '_group_entities_by_type') else {}
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        time_entity_extraction += time.time() - start_time
        
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    total_time = time_data_loading + time_gpu_inference + time_cpu_transfer + time_entity_extraction
    print(f"\n=== FINAL OPTIMIZED PERFORMANCE ===")
    print(f"Data Loading:      {time_data_loading:.2f}s ({time_data_loading/total_time*100:.1f}%)")
    print(f"GPU Inference:     {time_gpu_inference:.2f}s ({time_gpu_inference/total_time*100:.1f}%)")
    print(f"CPU Transfer:      {time_cpu_transfer:.2f}s ({time_cpu_transfer/total_time*100:.1f}%)")
    print(f"Entity Extraction: {time_entity_extraction:.2f}s ({time_entity_extraction/total_time*100:.1f}%)")
    print(f"Total Time:        {total_time:.2f}s")
    print(f"Processing Speed:  {len(dataset)/total_time:.1f} addresses/second")
    print(f"Success Rate:      {successful_parses/len(dataset)*100:.1f}%")
    print(f"Mixed Precision:   {'Enabled' if use_fp16 else 'Disabled'}")
    
    return {
        "summary": {
            "total_addresses": len(dataset),
            "successful_parses": successful_parses,
            "failed_parses": len(dataset) - successful_parses,
            "success_rate": successful_parses / len(dataset) if len(dataset) > 0 else 0,
            "processing_time": total_time,
            "addresses_per_second": len(dataset) / total_time if total_time > 0 else 0
        },
        "results": all_results
    }

# Test the final clean version
results_final = parse_addresses_with_custom_optimizations_final(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=256,
    use_fp16=True
)

Using mixed precision: True
Attempting parallel tokenization...


Tokenizing (num_proc=4):   0%|          | 0/33348 [00:00<?, ? examples/s]

Parallel tokenization successful!
Processing 33348 addresses in batches of 256


Processing batches: 100%|██████████| 131/131 [00:27<00:00,  4.75it/s]


=== FINAL OPTIMIZED PERFORMANCE ===
Data Loading:      0.02s (0.1%)
GPU Inference:     4.78s (32.1%)
CPU Transfer:      4.48s (30.2%)
Entity Extraction: 5.58s (37.6%)
Total Time:        14.86s
Processing Speed:  2244.0 addresses/second
Success Rate:      98.8%
Mixed Precision:   Enabled





In [26]:
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
from tqdm import tqdm
from typing import Dict, Optional
import pandas as pd

def parse_addresses_batch(
    df: pd.DataFrame,
    model_path: str,
    target_column: str = "address", 
    index_column: Optional[str] = None,
    batch_size: int = 256,
    use_fp16: bool = True,
    show_progress: bool = True
) -> Dict:
    """
    Production-ready batch address parsing with mixed precision optimization.
    
    Args:
        df: DataFrame containing addresses
        model_path: Path to the trained model
        target_column: Column name containing addresses
        index_column: Column to use as index (optional)
        batch_size: Batch size for processing
        use_fp16: Enable mixed precision for speed
        show_progress: Show progress bar
    
    Returns:
        Dict with summary and results
    """
    # Initialize multiprocessing
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass
    
    parser = AddressParserInference(model_path)
    
    # Setup indexing
    if index_column is None and "datapoint_id" in df.columns:
        index_column = "datapoint_id"
    
    # Prepare dataset
    dataset_dict = {
        "address": df[target_column].fillna("").astype(str).tolist(),
        "row_index": df[index_column].tolist() if index_column else df.index.tolist()
    }
    
    if "datapoint_id" in df.columns:
        dataset_dict["datapoint_id"] = df["datapoint_id"].tolist()
    
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenization function
    def tokenize_function(examples):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        tokenized = tokenizer(
            examples["address"],
            padding=False,
            truncation=True,
            max_length=128,
            return_offsets_mapping=True
        )
        
        tokenized["original_address"] = examples["address"]
        tokenized["row_index"] = examples["row_index"]
        
        if "datapoint_id" in examples:
            tokenized["datapoint_id"] = examples["datapoint_id"]
        
        return tokenized
    
    # Apply tokenization
    try:
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            num_proc=4,
            remove_columns=[]
        )
    except Exception:
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1000,
            remove_columns=[]
        )
    
    # Entity extraction
    def extract_entities(address, tokens, predicted_labels, offset_mapping):
        entities = []
        current_entity = None
        
        for token, label, offset in zip(tokens, predicted_labels, offset_mapping):
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
                
            if hasattr(offset, 'tolist'):
                start_pos, end_pos = offset.tolist()
            else:
                start_pos, end_pos = offset
            
            if start_pos == end_pos:
                continue
                
            if label.startswith('B-'):
                if current_entity:
                    entities.append(current_entity)
                
                entity_type = label[2:]
                current_entity = {
                    'type': entity_type,
                    'start': start_pos,
                    'end': end_pos,
                    'text': address[start_pos:end_pos]
                }
                
            elif label.startswith('I-') and current_entity:
                entity_type = label[2:]
                if entity_type == current_entity['type']:
                    current_entity['end'] = end_pos
                    current_entity['text'] = address[current_entity['start']:end_pos]
                else:
                    entities.append(current_entity)
                    current_entity = {
                        'type': entity_type,
                        'start': start_pos,
                        'end': end_pos,
                        'text': address[start_pos:end_pos]
                    }
            else:
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        
        if current_entity:
            entities.append(current_entity)
            
        return entities
    
    # Collate function
    def collate_fn(batch):
        input_ids = [item["input_ids"] for item in batch]
        attention_mask = [item["attention_mask"] for item in batch]
        offset_mapping = [item["offset_mapping"] for item in batch]
        row_indices = [item["row_index"] for item in batch]
        addresses = [item["original_address"] for item in batch]
        
        max_len = max(len(ids) for ids in input_ids)
        padded_input_ids = []
        padded_attention_mask = []
        
        for ids, mask in zip(input_ids, attention_mask):
            pad_len = max_len - len(ids)
            padded_input_ids.append(ids + [parser.tokenizer.pad_token_id] * pad_len)
            padded_attention_mask.append(mask + [0] * pad_len)
        
        return {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "offset_mapping": offset_mapping,
            "addresses": addresses,
            "row_indices": row_indices,
            "datapoint_ids": [item.get("datapoint_id") for item in batch] if "datapoint_id" in batch[0] else None
        }
    
    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True,
        shuffle=False
    )
    
    # Process batches
    all_results = []
    successful_parses = 0
    
    iterator = tqdm(dataloader, desc="Processing addresses") if show_progress else dataloader
    
    for batch in iterator:
        # Move to GPU
        input_ids = batch["input_ids"].to(parser.device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(parser.device, non_blocking=True)
        
        # Inference
        with torch.no_grad():
            if use_fp16:
                with torch.amp.autocast('cuda'):
                    outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            else:
                outputs = parser.model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_token_class_ids = predictions.argmax(dim=-1)
        
        # Move to CPU
        predicted_token_class_ids_cpu = predicted_token_class_ids.cpu()
        input_ids_cpu = input_ids.cpu()
        torch.cuda.synchronize()
        
        # Extract entities
        for j in range(len(batch["addresses"])):
            address = batch["addresses"][j]
            tokens = parser.tokenizer.convert_ids_to_tokens(input_ids_cpu[j])
            predicted_labels = [parser.id2label[pred.item()] for pred in predicted_token_class_ids_cpu[j]]
            offset_mapping = batch["offset_mapping"][j]
            
            try:
                entities = extract_entities(address, tokens, predicted_labels, offset_mapping)
                if len(entities) > 0:
                    successful_parses += 1
            except Exception:
                entities = []
            
            result = {
                "row_index": batch["row_indices"][j],
                "original_address": address,
                "entities": entities,
                "parsed_components": parser._group_entities_by_type(entities) if hasattr(parser, '_group_entities_by_type') else {}
            }
            
            if batch["datapoint_ids"] and batch["datapoint_ids"][j]:
                result["datapoint_id"] = batch["datapoint_ids"][j]
            
            all_results.append(result)
        
        # Cleanup
        del outputs, predictions, predicted_token_class_ids
        torch.cuda.empty_cache()
    
    return {
        "summary": {
            "total_addresses": len(dataset),
            "successful_parses": successful_parses,
            "failed_parses": len(dataset) - successful_parses,
            "success_rate": successful_parses / len(dataset) if len(dataset) > 0 else 0
        },
        "results": all_results
    }



In [27]:
results_final = parse_addresses_batch(
    df=ground_truth_df,
    model_path=str(model_path),
    target_column="property_address",
    batch_size=256,
    use_fp16=True
)

Map (num_proc=4):   0%|          | 0/33348 [00:00<?, ? examples/s]

Processing addresses: 100%|██████████| 131/131 [00:28<00:00,  4.66it/s]
