In [3]:
import pandas as pd
import numpy as np
import os
import pickle  # For catching UnpicklingError
from zipfile import BadZipFile 
from tqdm.auto import tqdm

# -------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------

# --- Overrides for testing (optional) ---
# The main loop iterates every (freq, tx_id) combination of these two lists.
# Each freq string is used verbatim as a sub-directory name under base_dir and
# as a file-name prefix; each tx_id is formatted into the "Tx_{tx_id}" part of
# the file and directory names.
# NOTE(review): labelled as test overrides, but no other definition of
# freqs/txids is visible in this part of the file -- confirm before a full run.
freqs = ["7GHz"]
txids = [1]

# Directories
# base_dir:   contains <freq>/<freq>_Tx_<tx_id>.csv, read for the RX coordinates.
# save_dir:   contains <freq>_Tx_<tx_id>_train_data.csv; each file is read,
#             extended with the new feature columns and written back to the
#             same path.
# region_dir: contains rx_sphere_Tx_<tx_id>/pair#####_rx_sphere50.npz archives.
base_dir = r"C:\Users\mkrishne\OneDrive - purdue.edu\ECNDATA\Desktop\PL_competition\c_evaluation_propagation_loss"
save_dir = r"C:\Users\mkrishne\OneDrive - purdue.edu\ECNDATA\Desktop\PL_competition\catboost_training\for_catboost_train"
region_dir = r"C:\Users\mkrishne\OneDrive - purdue.edu\ECNDATA\Desktop\PL_competition\extracted_regions\train"

# --- Master CSV directory is constant ---
# Contains 800MHz_Tx_<tx_id>.csv. The row position of an RX coordinate in that
# file is the pair id used to build the .npz file name, whatever freq is being
# processed.
MASTER_CSV_DIR = r"C:\Users\mkrishne\OneDrive - purdue.edu\ECNDATA\Desktop\PL_competition\c_evaluation_propagation_loss\800MHz"

# Coordinate shift
# Subtracted from the first (xmin) and second (ymin) RX coordinate columns
# before the coordinates are rounded and matched against the master file.
xmin = 384592.1875
ymin = 3944795.0

# -------------------------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------------------------

def calculate_polygon_area(polygon):
    """Return the area of a single polygon given as 3D vertices.

    The polygon is split into a triangle fan anchored at the first vertex.
    The cross products of consecutive fan edges are summed as *vectors* and
    the norm is taken once at the end, so fan triangles that fall outside a
    concave polygon subtract from the total instead of being added to it.

    Args:
        polygon: Ordered sequence of (x, y, z) vertices; an ``(N, 3)`` array
            or any nested sequence that ``np.asarray`` can convert to one.

    Returns:
        float: The polygon area, or ``0.0`` when fewer than three vertices
        are supplied. For vertices that are not coplanar the result is the
        magnitude of the polygon's vector area (the area of its projection
        onto the plane perpendicular to the summed normal).
    """
    # Accept plain lists as well as ndarrays; vertex arithmetic needs floats.
    vertices = np.asarray(polygon, dtype=float)
    if len(vertices) < 3:
        return 0.0

    # Edge vectors from the anchor vertex to every other vertex.
    fan_edges = vertices[1:] - vertices[0]
    # One cross product per fan triangle (v0, v_i, v_{i+1}); its direction
    # encodes the triangle's orientation, which must survive the summation.
    triangle_normals = np.cross(fan_edges[:-1], fan_edges[1:])
    return float(0.5 * np.linalg.norm(triangle_normals.sum(axis=0)))

def get_sphere_features(npz_file_path):
    """
    Calculates polygon count, total area, and average max height
    from a single ..._sphere50.npz file.

    Args:
        npz_file_path: Path of the ``.npz`` archive; its ``'polys'`` entry is
            read and converted to a Python list of polygons.

    Returns:
        A 3-tuple ``(poly_count, total_area, avg_max_height)``.
        On success ``poly_count`` is an ``int`` (``0`` for an archive with no
        polygons). On failure the first element is a status string instead
        and the two floats are ``0.0``:

        * ``"MISSING"`` -- the file does not exist.
        * ``"CORRUPT"`` -- the archive is unreadable or has no ``'polys'`` key.
        * ``"ERROR: <message>"`` -- any other failure while loading or while
          computing the features.

        Callers tell the two cases apart with ``isinstance(result[0], str)``.
    """
    try:
        # np.load returns a lazily-read NpzFile that keeps the archive open;
        # the context manager releases the handle as soon as 'polys' is read.
        # NOTE(security): allow_pickle=True unpickles object arrays, which can
        # execute arbitrary code -- only use on archives from a trusted source.
        with np.load(npz_file_path, allow_pickle=True) as data:
            polygons = data['polys'].tolist()
    except FileNotFoundError:
        return "MISSING", 0.0, 0.0
    # Catches all known file corruption errors
    except (BadZipFile, EOFError, pickle.UnpicklingError, KeyError):
        # KeyError handles missing 'polys' key
        return "CORRUPT", 0.0, 0.0
    except Exception as e:
        # Catch any other unexpected error during loading
        return f"ERROR: {e}", 0.0, 0.0

    if not polygons:
        return 0, 0.0, 0.0

    try:
        poly_count = len(polygons)
        obstr_poly_area = 0.0
        max_heights_per_polygon = []

        for poly in polygons:
            obstr_poly_area += calculate_polygon_area(poly)
            # Polygons without vertices contribute no height sample
            if len(poly) > 0:
                # Column 2 is the third vertex coordinate
                max_heights_per_polygon.append(np.max(poly[:, 2]))

        avg_clutter_height = np.mean(max_heights_per_polygon) if max_heights_per_polygon else 0.0

        return poly_count, obstr_poly_area, avg_clutter_height
    except Exception as e:
        # Report calculation failures (e.g. malformed polygon data) as a
        # status string so one bad archive does not abort the whole run
        return f"ERROR: {e}", 0.0, 0.0

# -------------------------------------------------------------------
# Main Loop (Corrected Logic)
# -------------------------------------------------------------------
# For every (freq, tx_id) pair: load the training CSV, look up each RX point
# in the 800MHz master CSV to obtain its pair id, read the matching
# rx_sphere50 .npz archive, and insert three feature columns
# (rx_sphere_poly_count, rx_sphere_obstr_poly_area, avg_rx_clutter_height)
# immediately before "measured_pathloss_dB". RX points that cannot be matched,
# or whose archive is missing/corrupt, get 0 / 0.0 / 0.0 and are reported.
print("--- Starting Script to Add RX Sphere Features ---")
for freq in freqs:
    for tx_id in txids:
        print(f"\n▶ Processing {freq} | Tx {tx_id}")

        # 1. Define and load the final CSV file to be augmented
        input_file = os.path.join(save_dir, f"{freq}_Tx_{tx_id}_train_data.csv")
        if not os.path.exists(input_file):
            print(f"⚠️ Skipping missing CSV file: {input_file}")
            continue
        df = pd.read_csv(input_file)
        print(f"  -> Loaded CSV with {len(df)} rows.")

        # A previous run already augmented this file; do not insert twice.
        if "avg_rx_clutter_height" in df.columns:
            print(f"  -> Columns already exist. Skipping this file.")
            continue

        # 2. Load the RX coordinates from the *base* CSV
        orig_csv_path = os.path.join(base_dir, freq, f"{freq}_Tx_{tx_id}.csv")
        if not os.path.exists(orig_csv_path):
            print(f"⚠️ Missing base file for RX coords: {orig_csv_path}. Skipping.")
            continue
        df_orig = pd.read_csv(orig_csv_path)

        # Rows of the two files are paired by position, so the counts must match
        if len(df_orig) != len(df):
            print(f"  -> ❌ ERROR: Mismatch in row count between {input_file} ({len(df)}) and {orig_csv_path} ({len(df_orig)}). Skipping.")
            continue
        # Cast to float so the in-place shift below cannot fail on columns
        # that pandas parsed as integers.
        rx_all = df_orig.iloc[:, [6, 7, 8]].to_numpy(dtype=float)
        rx_all_shifted = rx_all.copy()
        rx_all_shifted[:, 0] -= xmin
        rx_all_shifted[:, 1] -= ymin

        # 3. Load the RX coordinates from the *master* CSV (for lookup)
        master_csv_path = os.path.join(MASTER_CSV_DIR, f"800MHz_Tx_{tx_id}.csv")
        if not os.path.exists(master_csv_path):
            print(f"⚠️ Missing master file for lookup: {master_csv_path}. Skipping.")
            continue
        df_master = pd.read_csv(master_csv_path)
        rx_all_master = df_master.iloc[:, [6, 7, 8]].to_numpy(dtype=float)
        rx_all_master_shifted = rx_all_master.copy()
        rx_all_master_shifted[:, 0] -= xmin
        rx_all_master_shifted[:, 1] -= ymin

        # 4. Build the fast lookup map: { (x,y,z) -> pair_id }
        # The pair id is the row position in the master file. Coordinates are
        # rounded to 5 decimals so equal points compare equal as dict keys;
        # if a coordinate occurs more than once, the last row wins.
        print("  -> Building coordinate lookup map from 800MHz master file...")
        master_lookup = {
            tuple(coord): i
            for i, coord in enumerate(np.round(rx_all_master_shifted, 5))
        }
        print(f"  -> Map built with {len(master_lookup)} unique locations.")

        # 5. Loop through training RX points, find match, and get features
        all_rx_poly_counts = []
        all_rx_obstr_areas = []
        all_rx_clutter_heights = []

        # Detailed error logging
        lookup_failures = []   # row positions with no match in the master file
        corrupt_files = []     # pair ids whose archive could not be read
        missing_files = []     # pair ids whose archive does not exist
        other_errors = {}      # pair id -> "ERROR: ..." message

        npz_dir = os.path.join(region_dir, f"rx_sphere_Tx_{tx_id}")

        print("  -> Matching RX points and processing NPZ files...")
        # Round all keys once, then walk the rows by position (the row-count
        # check above guarantees one coordinate per CSV row).
        rx_keys = np.round(rx_all_shifted, 5)
        for i, rx_key in enumerate(tqdm(rx_keys, desc="  Processing RX spheres")):
            # Defaults used for every failure path
            features = (0, 0.0, 0.0)
            rx_sphere_pair_id = master_lookup.get(tuple(rx_key))

            if rx_sphere_pair_id is None:
                lookup_failures.append(i)  # Log the row index
            else:
                npz_file = os.path.join(npz_dir, f"pair{rx_sphere_pair_id:05d}_rx_sphere50.npz")
                poly_count, total_area, avg_height = get_sphere_features(npz_file)

                if not isinstance(poly_count, str):
                    # Success
                    features = (poly_count, total_area, avg_height)
                elif poly_count == "CORRUPT":
                    corrupt_files.append(rx_sphere_pair_id)
                elif poly_count == "MISSING":
                    missing_files.append(rx_sphere_pair_id)
                else:  # Catches "ERROR: ..."
                    other_errors[rx_sphere_pair_id] = poly_count

            all_rx_poly_counts.append(features[0])
            all_rx_obstr_areas.append(features[1])
            all_rx_clutter_heights.append(features[2])

        # Detailed error reporting
        if lookup_failures:
            print(f"  -> ⚠️ WARNING: {len(lookup_failures)} RX points not found in master list. (First 10 indices: {lookup_failures[:10]})")
        if corrupt_files:
            print(f"  -> ⚠️ WARNING: {len(corrupt_files)} corrupt .npz files detected. (First 10 pair_ids: {corrupt_files[:10]})")
        if missing_files:
            print(f"  -> ⚠️ WARNING: {len(missing_files)} missing .npz files detected. (First 10 pair_ids: {missing_files[:10]})")
        if other_errors:
            print(f"  -> ❌ ERROR: {len(other_errors)} other processing errors. (First 5):")
            for k, v in list(other_errors.items())[:5]:
                print(f"     Pair {k}: {v}")

        # 6. Insert new columns into the main DataFrame
        try:
            insert_loc = df.columns.get_loc("measured_pathloss_dB")
        except KeyError:
            print(f"  -> ❌ ERROR: Column 'measured_pathloss_dB' not found. Skipping save.")
            continue

        # 7. Save the augmented file
        # Write to a sibling temp file and swap it in with os.replace, so a
        # failure while writing cannot leave a truncated training CSV behind.
        tmp_file = input_file + ".tmp"
        try:
            df.insert(insert_loc, "rx_sphere_poly_count", all_rx_poly_counts)
            df.insert(insert_loc + 1, "rx_sphere_obstr_poly_area", all_rx_obstr_areas)
            df.insert(insert_loc + 2, "avg_rx_clutter_height", all_rx_clutter_heights)

            df.to_csv(tmp_file, index=False)
            os.replace(tmp_file, input_file)
        except Exception as e:
            # Best effort: drop the partial temp file, keep the original CSV,
            # and carry on with the next (freq, tx_id) pair.
            try:
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
            except OSError:
                pass
            print(f"  -> ❌ ERROR: An unexpected error occurred: {e}. Skipping save.")
        else:
            print(f"  -> ✅ Successfully added features and saved file: {input_file}")

print("\n\nAll files processed!")

--- Starting Script to Add RX Sphere Features ---

▶ Processing 7GHz | Tx 5
⚠️ Skipping missing CSV file: /home/mkrishne/PL_competition/catboost_training/for_catboost_test/7GHz_Tx_5_test_data.csv

▶ Processing 7GHz | Tx 9
⚠️ Skipping missing CSV file: /home/mkrishne/PL_competition/catboost_training/for_catboost_test/7GHz_Tx_9_test_data.csv

▶ Processing 7GHz | Tx 12
⚠️ Skipping missing CSV file: /home/mkrishne/PL_competition/catboost_training/for_catboost_test/7GHz_Tx_12_test_data.csv

▶ Processing 7GHz | Tx 14
⚠️ Skipping missing CSV file: /home/mkrishne/PL_competition/catboost_training/for_catboost_test/7GHz_Tx_14_test_data.csv

▶ Processing 7GHz | Tx 20
⚠️ Skipping missing CSV file: /home/mkrishne/PL_competition/catboost_training/for_catboost_test/7GHz_Tx_20_test_data.csv


All files processed!
