In [None]:
'''Cell 1: Imports'''
# ==============================================================================
import os
import re
import pandas as pd
import numpy as np
from scipy import stats
from scipy.interpolate import griddata
from sklearn.cluster import DBSCAN
from skimage.feature import graycomatrix, graycoprops
from tqdm import tqdm
import warnings

# Suppress every RuntimeWarning for the rest of the session.
# NOTE(review): presumably aimed at numpy warnings raised on NaN-only or
# degenerate inputs — confirm; the filter is global, so it also hides
# unrelated RuntimeWarnings.
warnings.filterwarnings("ignore", category=RuntimeWarning)
print("Imports complete.")

In [None]:
'''Cell 2: Configuration and Path Setup'''
# ==============================================================================
# --- Paths -------------------------------------------------------------------
# The project root is taken to be the parent of the current working directory,
# so every path below depends on where the kernel was started.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Root folder that is walked recursively for per-layer CSV files.
INTERIM_ROOT_DIR = os.path.join(PROJECT_ROOT, 'data', 'interim')
# Folder that receives the final feature table (created below if missing).
PROCESSED_DIR = os.path.join(PROJECT_ROOT, 'data', 'processed')
# Per-coupon metadata table (process parameters, label, coupon centre).
METADATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'raw', 'Coupon_metadata.csv')
# Destination of the generated feature dataset.
OUTPUT_CSV_PATH = os.path.join(PROCESSED_DIR, 'dataset.csv')

# --- Gridding ----------------------------------------------------------------
# Number of cells along each side of the square interpolation grid.
GRID_SIZE = 300
# Sensor columns read from every layer CSV.
SENSOR_CHANNELS = ['MeltVIEW plasma', 'MeltVIEW melt pool', 'LaserVIEW']

# --- GLCM texture features ---------------------------------------------------
# Number of gray levels the grid is quantised to before building the GLCM.
GLCM_LEVELS = 64
# Pixel-pair distances passed to graycomatrix (in grid cells).
GLCM_DISTANCES = [4, 8]
# Pixel-pair directions passed to graycomatrix (in radians).
GLCM_ANGLES = [0, np.pi/4, np.pi/2, 3*np.pi/4]

# --- Hot/cold spot clustering ------------------------------------------------
# Grid values above this percentile of the non-zero cells are hot-spot candidates.
DBSCAN_HOTSPOT_PERCENTILE = 95
# Grid values below this percentile of the non-zero cells are cold-spot candidates.
DBSCAN_COLDSPOT_PERCENTILE = 5
# DBSCAN neighbourhood radius; clustering runs on grid indices, so this is in grid cells.
DBSCAN_EPS = 10.0
# DBSCAN core-point threshold; also used as the minimum amount of data to attempt clustering.
DBSCAN_MIN_SAMPLES = 50

# Side effect at cell execution: make sure the output folder exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"Reading interim data from: {INTERIM_ROOT_DIR}")
print(f"Saving final dataset to: {OUTPUT_CSV_PATH}")


In [None]:
'''Cell 3: Feature Calculation Helper Functions '''
# ==============================================================================
def safe_divide(a, b):
    """Return ``a / b``, falling back to ``0.0`` for a (near-)zero divisor.

    The quotient is only computed when ``abs(b)`` exceeds ``1e-9``; any other
    divisor (including NaN, for which the comparison is false) yields ``0.0``.
    """
    if abs(b) > 1e-9:
        return a / b
    return 0.0

# Shannon entropy helper used by calculate_basic_stats.
def calculate_entropy(data, bins=32):
    """Calculate the Shannon entropy (in bits) of a 1D array's histogram.

    Args:
        data: Array-like of numeric samples. NaN and +/-inf entries are ignored.
        bins: Number of equal-width histogram bins (forwarded to ``np.histogram``).

    Returns:
        float: Entropy of the binned distribution, always ``>= 0``. ``0.0`` is
        returned when fewer than two finite samples are available.
    """
    values = np.asarray(data, dtype=float)
    # Non-finite samples carry no usable magnitude, and an infinite value would
    # make np.histogram fail when it derives the bin range.
    finite = values[np.isfinite(values)]
    if finite.size < 2:
        return 0.0

    counts, _ = np.histogram(finite, bins=bins)
    total = counts.sum()
    if total == 0:
        return 0.0

    # Only non-empty bins contribute (p * log2(p) -> 0 as p -> 0). Skipping them
    # avoids log(0) without an epsilon, which would bias the result and make
    # constant data come out slightly negative.
    probs = counts[counts > 0] / total
    # "+ 0.0" normalises a possible -0.0 to 0.0.
    return float(-np.sum(probs * np.log2(probs))) + 0.0

def calculate_extreme_counts(data):
    """Count the samples lying more than three standard deviations from the mean.

    Mean and standard deviation are computed NaN-aware; NaN samples never
    satisfy either comparison and are therefore not counted. Returns 0 for
    arrays with fewer than two elements or with zero spread.
    """
    if data.size < 2:
        return 0

    centre = np.nanmean(data)
    spread = np.nanstd(data)
    if spread == 0:
        return 0

    upper_bound = centre + 3*spread
    lower_bound = centre - 3*spread
    is_extreme = np.logical_or(data > upper_bound, data < lower_bound)
    return np.sum(is_extreme)

def calculate_basic_stats(data, prefix):
    """Calculate basic statistical features from a 1D sensor array.

    Every statistic ignores NaN samples.

    Args:
        data: 1D numpy array of sensor readings.
        prefix: Channel name embedded in each feature key.

    Returns:
        dict: ``'(St)_{prefix}_<stat>'`` -> value for mean, std, min, max, ptp,
        skew, kurtosis, mad (median absolute deviation), extreme_count and
        entropy. An empty dict is returned for an empty array.
    """
    if data.size == 0:
        return {}

    minimum = np.nanmin(data)
    maximum = np.nanmax(data)
    median = np.nanmedian(data)

    return {
        f'(St)_{prefix}_mean': np.nanmean(data),
        f'(St)_{prefix}_std': np.nanstd(data),
        f'(St)_{prefix}_min': minimum,
        f'(St)_{prefix}_max': maximum,
        # np.ptp propagates NaN; derive the range from the NaN-aware extrema so
        # it stays consistent with the min/max columns.
        f'(St)_{prefix}_ptp': maximum - minimum,
        f'(St)_{prefix}_skew': stats.skew(data, nan_policy='omit'),
        f'(St)_{prefix}_kurtosis': stats.kurtosis(data, nan_policy='omit'),
        f'(St)_{prefix}_mad': np.nanmedian(np.abs(data - median)),
        f'(St)_{prefix}_extreme_count': calculate_extreme_counts(data),
        f'(St)_{prefix}_entropy': calculate_entropy(data)
    }

def calculate_glcm_features(grid, prefix):
    """Compute gray-level co-occurrence (GLCM) texture features for one grid.

    Non-zero cells are min-max scaled to ``GLCM_LEVELS`` gray levels; cells
    equal to zero stay at level 0. For each property (contrast, homogeneity,
    energy, correlation, entropy) the mean and standard deviation over all
    ``GLCM_DISTANCES`` x ``GLCM_ANGLES`` combinations are returned under the
    keys ``'(Sp)_{prefix}_glcm_{property}_mean'`` / ``'..._std'``.

    When fewer than two non-zero cells exist, all ten features are ``0.0``.
    """
    skimage_props = ['contrast', 'homogeneity', 'energy', 'correlation']
    all_props = skimage_props + ['entropy']
    key_stem = f'(Sp)_{prefix}_glcm_'

    occupied = grid != 0
    samples = grid[occupied]
    if samples.size < 2:
        # Default feature set: every mean first, then every std.
        fallback = {}
        for suffix in ('mean', 'std'):
            for name in all_props:
                fallback[f'{key_stem}{name}_{suffix}'] = 0.0
        return fallback

    # Quantise the occupied cells to integer gray levels 0 .. GLCM_LEVELS - 1.
    # NOTE(review): zero (unfilled) cells share level 0 with the lowest valid
    # values, so they take part in the co-occurrence counts.
    lowest = samples.min()
    span = samples.max() - lowest + 1e-8
    quantised = np.zeros_like(grid, dtype=np.uint8)
    quantised[occupied] = np.uint8((samples - lowest) / span * (GLCM_LEVELS - 1))

    cooccurrence = graycomatrix(
        quantised,
        GLCM_DISTANCES,
        GLCM_ANGLES,
        levels=GLCM_LEVELS,
        symmetric=True,
        normed=True,
    )

    results = {}
    for name in skimage_props:
        per_offset = graycoprops(cooccurrence, name)
        results[f'{key_stem}{name}_mean'] = per_offset.mean()
        results[f'{key_stem}{name}_std'] = per_offset.std()

    # graycoprops has no entropy property here, so sum -p*log2(p) over the two
    # leading (gray-level) axes, leaving one value per distance/angle pair.
    per_offset_entropy = -np.sum(cooccurrence * np.log2(cooccurrence + 1e-10), axis=(0, 1))
    results[f'{key_stem}entropy_mean'] = per_offset_entropy.mean()
    results[f'{key_stem}entropy_std'] = per_offset_entropy.std()

    return results

def calculate_hotspot_features(grid, valid_data, prefix, mode, percentile):
    """Compute cluster-based features for hot or cold spots.

    Cells beyond a percentile threshold are clustered with DBSCAN on their grid
    indices, and the resulting clusters are summarised.

    Args:
        grid: 2D array of interpolated sensor values; cells equal to 0 are
            treated as unmeasured and never selected.
        valid_data: 1D array of the non-zero values of ``grid``.
        prefix: Channel name embedded in each feature key.
        mode: ``'hot'`` selects cells above the threshold; any other value
            (``'cold'`` in this notebook) selects cells below it.
        percentile: Percentile of ``valid_data`` used as the threshold.

    Returns:
        dict with the keys ``'(Sp)_{prefix}_{mode}_<name>'`` for:
            clusters: number of DBSCAN clusters (noise excluded).
            abs_contrast: median of the cluster mean intensities minus the
                global mean (sign flipped for cold spots).
            rel_contrast: ``abs_contrast`` divided by the global mean.
            rel_size: largest cluster size divided by the number of selected cells.
        All values are zero when there is too little data or no cluster is found.
    """
    features = {
        f'(Sp)_{prefix}_{mode}_clusters': 0, f'(Sp)_{prefix}_{mode}_abs_contrast': 0.0,
        f'(Sp)_{prefix}_{mode}_rel_contrast': 0.0, f'(Sp)_{prefix}_{mode}_rel_size': 0.0,
    }
    if valid_data.size < DBSCAN_MIN_SAMPLES:
        return features

    global_intensity = valid_data.mean()
    threshold = np.percentile(valid_data, percentile)

    # Zero cells are unmeasured fill values. Exclude them in BOTH modes so that
    # a negative threshold cannot turn empty cells into "hot" candidates.
    measured = grid != 0
    if mode == 'hot':
        mask = (grid > threshold) & measured
    else:
        mask = (grid < threshold) & measured

    coords = np.argwhere(mask)
    if len(coords) < DBSCAN_MIN_SAMPLES:
        return features

    labels = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit_predict(coords)
    cluster_labels = np.unique(labels[labels != -1])  # -1 marks DBSCAN noise
    if cluster_labels.size == 0:
        return features

    # Intensities of the candidate cells, aligned with ``coords`` / ``labels``.
    candidate_values = grid[coords[:, 0], coords[:, 1]]
    cluster_sizes = []
    cluster_intensities = []
    for lbl in cluster_labels:
        members = labels == lbl
        cluster_sizes.append(int(members.sum()))
        cluster_intensities.append(candidate_values[members].mean())

    typical_intensity = np.median(cluster_intensities)
    if mode == 'hot':
        abs_contrast = typical_intensity - global_intensity
    else:
        abs_contrast = global_intensity - typical_intensity

    features[f'(Sp)_{prefix}_{mode}_clusters'] = len(cluster_labels)
    features[f'(Sp)_{prefix}_{mode}_abs_contrast'] = abs_contrast
    features[f'(Sp)_{prefix}_{mode}_rel_contrast'] = safe_divide(abs_contrast, global_intensity)
    features[f'(Sp)_{prefix}_{mode}_rel_size'] = safe_divide(max(cluster_sizes), mask.sum())

    return features

def calculate_derived_features(features):
    """Build cross-channel ratio features from already computed features.

    Reads the per-channel means (plus the LaserVIEW std) and the process
    parameters ``(PB)_VED`` / ``(PB)_laser_power`` from ``features``; a missing
    entry counts as 0. Every ratio goes through ``safe_divide``, so a zero
    denominator produces 0.0.

    Returns:
        dict: the ``(PD)_*`` sensor-only features and the ``(PDPB)_*`` features
        that relate sensor means to process parameters.
    """
    plasma_mean = features.get('(St)_MeltVIEW plasma_mean', 0)
    melt_pool_mean = features.get('(St)_MeltVIEW melt pool_mean', 0)
    laser_mean = features.get('(St)_LaserVIEW_mean', 0)
    laser_std = features.get('(St)_LaserVIEW_std', 0)
    ved = features.get('(PB)_VED', 0)
    laser_power = features.get('(PB)_laser_power', 0)

    total_sensor_energy = plasma_mean + melt_pool_mean

    derived = {}
    # Sensor-only features.
    derived['(PD)_Total_Energy'] = total_sensor_energy
    derived['(PD)_MVP/TE_ratio'] = safe_divide(plasma_mean, total_sensor_energy)
    derived['(PD)_MVP/MVMP_ratio'] = safe_divide(plasma_mean, melt_pool_mean)
    derived['(PD)_MVMP/LV'] = safe_divide(melt_pool_mean, laser_mean)
    derived['(PD)_MVP/LV'] = safe_divide(plasma_mean, laser_mean)
    derived['(PD)_LV_STD/LV'] = safe_divide(laser_std, laser_mean)
    # Sensor readings relative to process parameters.
    derived['(PDPB)_MVMP/VED'] = safe_divide(melt_pool_mean, ved)
    derived['(PDPB)_MVP/VED'] = safe_divide(plasma_mean, ved)
    derived['(PDPB)_TotalEnergy/VED'] = safe_divide(total_sensor_energy, ved)
    derived['(PDPB)_LV/LP'] = safe_divide(laser_mean, laser_power)
    return derived

In [None]:
'''Cell 4: Main Processing Function '''
# =============================================================================
def process_layer(layer_path, meta_row, half_extent=7.5):
    """Orchestrate the feature calculation for a single layer CSV file.

    Args:
        layer_path: Path to a layer CSV whose file name contains
            ``layer_<number>``. The file must provide the columns
            ``Demand X``, ``Demand Y`` and every entry of ``SENSOR_CHANNELS``.
        meta_row: Metadata record of the coupon (mapping / pandas Series) with
            the keys ``Coupon``, ``Label``, ``Identifier``, ``Center_X``,
            ``Center_Y``, ``Laser Power``, ``Scan Speed``, ``Hatch Distance``,
            ``Powder Thickness`` and ``VED``.
        half_extent: Half the side length of the square window around the
            coupon centre that is interpolated, in the units of
            ``Demand X`` / ``Demand Y``.

    Returns:
        pandas.Series: metadata, per-channel statistics, spatial (GLCM and
        hot/cold spot) features for the two MeltVIEW channels, and derived
        cross-channel features.

    Raises:
        ValueError: if no layer number can be parsed from ``layer_path``.
    """
    # Only these channels receive spatial features, so only they need a grid.
    spatial_channels = ('MeltVIEW plasma', 'MeltVIEW melt pool')

    # Look at the file name first so that a parent directory which happens to
    # contain "layer_<n>" cannot hijack the layer index; fall back to the full
    # path to stay compatible with other layouts.
    layer_pattern = r'layer_(\d+)'
    layer_match = (re.search(layer_pattern, os.path.basename(layer_path))
                   or re.search(layer_pattern, layer_path))
    if layer_match is None:
        raise ValueError(f"Cannot parse a layer number ('layer_<number>') from '{layer_path}'.")
    layer_num = int(layer_match.group(1))

    df = pd.read_csv(layer_path)

    # Initialize features with metadata
    features = {
        'coupon_id': meta_row['Coupon'],
        'layer_index': layer_num,
        'Label': meta_row['Label'],
        '(MT)_identifier': meta_row['Identifier'],
        '(MT)_center_x': meta_row['Center_X'],
        '(MT)_center_y': meta_row['Center_Y'],
        '(PB)_laser_power': meta_row['Laser Power'],
        '(PB)_scan_speed': meta_row['Scan Speed'],
        '(PB)_hatch_distance': meta_row['Hatch Distance'],
        '(PB)_powder_thickness': meta_row['Powder Thickness'],
        '(PB)_VED': meta_row['VED']
    }

    # Regular GRID_SIZE x GRID_SIZE lattice centred on the coupon. Y runs from
    # high to low so that row 0 of the grid is the top of the coupon.
    x_coords = np.linspace(meta_row['Center_X'] - half_extent, meta_row['Center_X'] + half_extent, GRID_SIZE)
    y_coords = np.linspace(meta_row['Center_Y'] + half_extent, meta_row['Center_Y'] - half_extent, GRID_SIZE)
    sample_points = (df["Demand X"], df["Demand Y"])
    query_points = (x_coords[None, :], y_coords[:, None])

    for channel in SENSOR_CHANNELS:
        data = df[channel].values
        features.update(calculate_basic_stats(data, channel))

        if channel not in spatial_channels:
            continue  # no spatial features -> skip the costly interpolation

        # Cells outside the convex hull of the samples are filled with 0, which
        # downstream helpers treat as "no data".
        grid_channel = griddata(
            sample_points, data, query_points, method='linear', fill_value=0
        ).astype(np.float32)
        valid_grid_data = grid_channel[grid_channel != 0]

        # Always call the helpers: they return zeroed defaults when there is
        # too little data, which keeps the set of feature keys identical for
        # every layer.
        features.update(calculate_glcm_features(grid_channel, channel))
        features.update(calculate_hotspot_features(grid_channel, valid_grid_data, channel, 'hot', DBSCAN_HOTSPOT_PERCENTILE))
        features.update(calculate_hotspot_features(grid_channel, valid_grid_data, channel, 'cold', DBSCAN_COLDSPOT_PERCENTILE))

    features.update(calculate_derived_features(features))

    return pd.Series(features)

In [None]:
'''Cell 5: Main Script Execution'''
# ==============================================================================
def _final_model_columns():
    """Return the ordered list of columns written to the output CSV."""
    stat_names = ['mean', 'std', 'min', 'max', 'ptp', 'skew', 'kurtosis',
                  'mad', 'extreme_count', 'entropy']
    glcm_props = ['contrast', 'homogeneity', 'energy', 'correlation', 'entropy']
    spot_metrics = ['clusters', 'abs_contrast', 'rel_contrast', 'rel_size']
    spatial_channels = ('MeltVIEW plasma', 'MeltVIEW melt pool')

    columns = ['coupon_id', 'layer_index', 'Label']
    for channel in ('MeltVIEW plasma', 'MeltVIEW melt pool', 'LaserVIEW'):
        columns += [f'(St)_{channel}_{stat}' for stat in stat_names]
        if channel in spatial_channels:
            for prop in glcm_props:
                columns += [f'(Sp)_{channel}_glcm_{prop}_mean', f'(Sp)_{channel}_glcm_{prop}_std']
            for mode in ('hot', 'cold'):
                columns += [f'(Sp)_{channel}_{mode}_{metric}' for metric in spot_metrics]
    columns += [
        '(PD)_Total_Energy', '(PD)_MVP/TE_ratio', '(PD)_MVP/MVMP_ratio', '(PD)_MVMP/LV', '(PD)_MVP/LV', '(PD)_LV_STD/LV',
        '(PDPB)_MVMP/VED', '(PDPB)_MVP/VED', '(PDPB)_TotalEnergy/VED', '(PDPB)_LV/LP'
    ]
    return columns


def main():
    """Generate the feature dataset from every layer CSV below INTERIM_ROOT_DIR.

    Loads the coupon metadata, processes each layer file (the name of the
    file's parent directory is used as the coupon id), and writes the selected
    model columns to OUTPUT_CSV_PATH. A layer that fails is reported and
    skipped; the run continues with the remaining files.
    """
    try:
        metadata_df = pd.read_csv(METADATA_PATH)
        metadata_df.columns = metadata_df.columns.str.strip()
    except Exception as e:
        print(f"FATAL: Could not load metadata file. Error: {e}")
        return

    if 'Coupon' not in metadata_df.columns:
        # Fail once here instead of once per layer file inside the loop.
        print(f"FATAL: Metadata file has no 'Coupon' column. Found: {list(metadata_df.columns)}")
        return
    # Directory names are strings; compare on a normalised string key so that
    # numeric or whitespace-padded coupon ids still match.
    coupon_keys = metadata_df['Coupon'].astype(str).str.strip()

    # Sorted so the row order of the output does not depend on the file system.
    layer_file_paths = sorted(
        os.path.join(dirpath, f)
        for dirpath, _, filenames in os.walk(INTERIM_ROOT_DIR)
        for f in filenames
        if f.endswith('.csv')
    )

    if not layer_file_paths:
        print("No segmented layer files found in 'data/interim/'. Please run the segmentation script first.")
        return

    all_features_list = []
    failed_count = 0
    for layer_path in tqdm(layer_file_paths, desc="Generating Features"):
        try:
            coupon_id = os.path.basename(os.path.dirname(layer_path))
            matches = metadata_df[coupon_keys == coupon_id]
            if matches.empty:
                raise LookupError(f"no metadata row for coupon '{coupon_id}'")
            features = process_layer(layer_path, matches.iloc[0])
            all_features_list.append(features)
        except Exception as e:
            # Deliberate best effort: report the layer and keep going.
            failed_count += 1
            print(f"⚠️ Could not process {layer_path}. Error: {e}")

    if not all_features_list:
        print("No features were generated.")
        return

    final_df = pd.DataFrame(all_features_list)

    final_model_columns = _final_model_columns()
    missing_columns = [c for c in final_model_columns if c not in final_df.columns]
    if missing_columns:
        # Keep the results that were computed instead of dying with a KeyError.
        print(f"⚠️ {len(missing_columns)} expected column(s) were never generated and will be empty: {missing_columns}")
    final_df = final_df.reindex(columns=final_model_columns)

    final_df.to_csv(OUTPUT_CSV_PATH, index=False)
    if failed_count:
        print(f"⚠️ {failed_count} of {len(layer_file_paths)} layer file(s) could not be processed.")
    print(f"\n✅ Processing complete. Saved {len(final_df)} rows and {len(final_df.columns)} columns to '{OUTPUT_CSV_PATH}'")



In [None]:
'''Cell 6: Run Main Script'''
# ==============================================================================
# Run the whole pipeline when this cell is executed directly (in a notebook or
# as a script) but not when the code is imported as a module.
if __name__ == '__main__':
    main()