# Member 2 — Feature Engineering (Signal Construction)

**Goal:** Turn company attributes into **numeric, comparable, explainable signals** for **PCA + clustering**.

**Scope (Member 2):**
- Start from **Member 1's cleaned & imputed dataset** (recommended: `data/processed/clean_base.csv`).
- Do **not** make global cleaning decisions (dropping rows/columns, global imputation strategies, etc.).
- You *may* create **presence indicators** and derived features that treat missingness as signal.

**Outputs (for handoff):**
- `df_features_raw`: numeric feature matrix (not scaled)
- `df_features_scaled`: scaled matrix for PCA/clustering
- `feature_dict`: short description of engineered features

---


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import shutil
import re

# Let pandas display up to 250 columns so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 250)


In [None]:
# ---------------------------------------------------
# Stage Member 1's cleaned dataset and read it in
# ---------------------------------------------------
# Resolve the project root: step up one level when launched from notebooks/.
ROOT = Path.cwd()
ROOT = ROOT.parent if ROOT.name.lower() == "notebooks" else ROOT

SOURCE_PATH = ROOT / "modified_data" / "cleaned_base.csv"
DEST_DIR = ROOT / "data" / "processed"
CLEAN_PATH = DEST_DIR / "clean_base.csv"

# The destination folder is created first, then the source file is checked.
DEST_DIR.mkdir(parents=True, exist_ok=True)

if not SOURCE_PATH.exists():
    raise FileNotFoundError(f"Missing source file: {SOURCE_PATH}")

# Overwrite any earlier copy so the working file always mirrors the source.
shutil.copyfile(SOURCE_PATH, CLEAN_PATH)
df = pd.read_csv(CLEAN_PATH)

print("Copied clean_base.csv to:", CLEAN_PATH)
print("clean_base shape:", df.shape)


Loaded: ../data/champions_group_data.csv
Shape: (8559, 72)


  df = pd.read_csv(RAW_FALLBACK)


## 1) Helper functions

We standardize "presence" checks so empty strings don't count as filled.
We also parse range strings like `"1 to 10"` into numeric midpoints.


In [None]:
def has_value(series: pd.Series) -> pd.Series:
    """Flag entries that carry content: not missing and not blank once trimmed."""
    trimmed = series.astype("string").str.strip()
    is_known = trimmed.notna()
    is_nonblank = trimmed.ne("")
    return is_known & is_nonblank

def normalize_text(series: pd.Series) -> pd.Series:
    """Trim and upper-case text values; strings left empty are turned into NA."""
    as_text = series.astype("string")
    cleaned = as_text.str.strip().str.upper()
    return cleaned.replace({"": pd.NA})

# Matches numbers written with thousands grouping, e.g. "1,000" or "12,345.5".
_GROUPED_THOUSANDS_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?")


def _parse_range_bound(text):
    """Parse one bound of a range; raises ValueError when it is not a number."""
    token = text.strip()
    # Only drop commas when they are unambiguous thousands separators, so
    # strings such as "1,5" are still rejected rather than read as 15.
    if _GROUPED_THOUSANDS_RE.fullmatch(token):
        token = token.replace(",", "")
    return float(token)


def range_to_midpoint(value):
    """Return the midpoint of a range string such as "1 to 10" or "10-20".

    The "to" separator is matched case-insensitively and each bound may use
    thousands separators ("1,000 to 4,999"). Only the first two pieces of the
    split are used.

    Returns:
        The midpoint as a float, or ``np.nan`` when ``value`` is not a string,
        contains no range separator, or has a bound that is not a number.
    """
    if not isinstance(value, str):
        return np.nan

    text = value.lower()
    if "to" in text:
        parts = text.split("to")
    elif "-" in text:
        parts = text.split("-")
    else:
        return np.nan

    # Splitting on a separator that is present always yields >= 2 pieces.
    try:
        low = _parse_range_bound(parts[0])
        high = _parse_range_bound(parts[1])
    except ValueError:
        return np.nan
    return (low + high) / 2

# Items may be separated by any run of semicolons, commas, or pipes.
_split_re = re.compile(r"[;,|]+")


def count_delimited_items(value) -> int:
    """Count the non-blank items in a delimiter-separated value.

    Missing values (``None``, ``pd.NA``, float NaN) and blank strings count as
    zero items. Without the explicit missing-value check they would be
    stringified to "<NA>" / "nan" and reported as one item. Any other
    non-string value is converted with ``str()`` before splitting.
    """
    if value is None or value is pd.NA:
        return 0
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return 0
    if not isinstance(value, str):
        value = str(value)
    text = value.strip()
    if not text:
        return 0
    return sum(1 for piece in _split_re.split(text) if piece.strip())

def safe_to_numeric(series: pd.Series) -> pd.Series:
    """Convert a Series to numeric, turning unparseable entries into NaN."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

def add_numeric_feature(features: pd.DataFrame, series: pd.Series, name: str, log: bool = True):
    """Write a numeric feature, its missing indicator and (optionally) its log1p into ``features``.

    Columns written, in order: ``{name}_missing``, ``name``, and ``log_{name}``
    when ``log`` is true. Missing values are filled with the median, or with 0
    when every value is missing. The log is taken after clipping negatives to 0.
    """
    numeric = safe_to_numeric(series)
    is_missing = numeric.isna()
    features[f"{name}_missing"] = is_missing.astype(int)

    # No observed value at all -> nothing to take a median of, fall back to 0.
    fill_value = 0 if is_missing.all() else numeric.median()
    filled = numeric.fillna(fill_value)
    features[name] = filled

    if not log:
        return
    features[f"log_{name}"] = np.log1p(filled.clip(lower=0))

def missing_ratio_for(cols):
    """Return, per row of ``df``, the share of the given columns that are empty.

    Columns absent from ``df`` are ignored; if none remain the result is a
    Series of zeros. Numeric columns count as present when not NaN, all other
    columns when ``has_value`` says so.
    """
    available = [c for c in cols if c in df.columns]
    if not available:
        return pd.Series(0, index=df.index)

    presence = pd.DataFrame({
        c: (df[c].notna() if pd.api.types.is_numeric_dtype(df[c]) else has_value(df[c]))
        for c in available
    })
    return 1 - presence.mean(axis=1)


## 2) Core transparency + structure signals (curated short names)

These are your **interpretable, low-risk** binary signals.


In [None]:
# Engineered features accumulate here, row-aligned with df.
features = pd.DataFrame(index=df.index)


def add_has(col, feature_name):
    """Add a 0/1 presence flag for ``col`` to ``features``; skip columns not in ``df``."""
    if col not in df.columns:
        return
    features[feature_name] = has_value(df[col]).astype(int)

# (source column, flag name) pairs, kept in the order the flags are created.
_presence_flag_specs = [
    # Transparency / traceability
    ("Website", "has_website"),
    ("Phone Number", "has_phone"),
    ("Address Line 1", "has_address"),
    ("City", "has_city"),
    ("State", "has_state"),
    ("State Or Province Abbreviation", "has_state_abbrev"),
    ("Postal Code", "has_postal_code"),
    ("Country", "has_country"),
    ("Region", "has_region"),
    # Company name (column appears labeled as Company Sites in this dataset)
    ("Company Sites", "has_company_name"),
    # Ownership / structure
    ("Parent Company", "has_parent"),
    ("Global Ultimate Company", "has_global_ultimate"),
    ("Domestic Ultimate Company", "has_domestic_ultimate"),
    # Verifiability extras
    ("Ticker", "has_ticker"),
    ("Registration Number", "has_registration_number"),
    ("Company Description", "has_company_description"),
    ("Legal Status", "has_legal_status"),
    ("Ownership Type", "has_ownership_type"),
    ("Entity Type", "has_entity_type"),
]
for _source_col, _flag_name in _presence_flag_specs:
    add_has(_source_col, _flag_name)

# Company status is value-coded: active -> 1, inactive -> 0, anything else unknown.
status_col = "Company Status (Active/Inactive)"
if status_col in df.columns:
    status_map = {"active": 1, "inactive": 0}
    status = df[status_col].astype("string").str.strip().str.lower()
    _status_code = status.map(status_map)
    # Unknown status is stored as 0 here; the companion flag records whether it was known.
    features["company_status_binary"] = _status_code.fillna(0)
    features["has_company_status"] = _status_code.notna().astype(int)


## 3) Credibility / completeness score (and missing_ratio)

We include **status-known** (not status value) so inactive firms aren't penalized.


In [None]:
# Flags that feed the credibility score; only those actually built are used.
_credibility_candidates = (
    "has_website", "has_address", "has_phone",
    "has_ticker", "has_parent", "has_global_ultimate", "has_domestic_ultimate",
    "has_registration_number", "has_company_description",
    "has_company_status",
)
credibility_flag_cols = [flag for flag in _credibility_candidates if flag in features.columns]

if credibility_flag_cols:
    _flag_total = features[credibility_flag_cols].sum(axis=1)
    features["credibility_score"] = _flag_total
    features["credibility_score_norm"] = _flag_total / len(credibility_flag_cols)
    features["missing_ratio_credibility"] = 1 - features["credibility_score_norm"]

# Source columns grouped by theme for the per-group missingness ratios
contact_cols = [
    "Website", "Phone Number", "Address Line 1", "City", "State",
    "State Or Province Abbreviation", "Postal Code", "Country", "Region"
]
ownership_cols = [
    "Parent Company", "Parent Country/Region",
    "Global Ultimate Company", "Global Ultimate Country Name",
    "Domestic Ultimate Company"
]
financial_cols = [
    "Employees Single Site", "Employees Total", "Revenue (USD)", "Market Value (USD)",
    "Corporate Family Members", "Year Found"
]
it_cols = [
    "No. of PC", "No. of Desktops", "No. of Laptops", "No. of Routers",
    "No. of Servers", "No. of Storage Devices", "IT Budget", "IT spend"
]
code_cols = [
    "SIC Code", "8-Digit SIC Code", "NAICS Code", "NACE Rev 2 Code",
    "ANZSIC Code", "ISIC Rev 4 Code"
]

_missing_ratio_groups = {
    "contact": contact_cols,
    "ownership": ownership_cols,
    "financial": financial_cols,
    "it": it_cols,
    "codes": code_cols,
}
for _group_name, _group_cols in _missing_ratio_groups.items():
    features[f"missing_ratio_{_group_name}"] = missing_ratio_for(_group_cols)

# Overall ratio spans every grouped column at once.
all_missing_cols = [c for _group_cols in _missing_ratio_groups.values() for c in _group_cols]
features["missing_ratio_overall"] = missing_ratio_for(all_missing_cols)


## 4) Organisational complexity (group size by global ultimate)

Companies sharing the same global ultimate are treated as belonging to the same group.


In [None]:
# Group-size signal: for each row, how many rows share its normalized Global Ultimate name.
if "Global Ultimate Company" in df.columns:
    # Trimmed, upper-cased name is the grouping key; blank names become NA.
    key = normalize_text(df["Global Ultimate Company"])
    # Per-row size of the group the row falls into.
    group_sizes = key.groupby(key).transform("size")
    # Any NaN sizes (presumably rows whose key is missing — TODO confirm) are stored as 0.
    features["org_complexity_count"] = group_sizes.fillna(0).astype(int)
    features["log_org_complexity_count"] = np.log1p(features["org_complexity_count"])


## 5) Scale + market signals

These control for size so clustering isn't just "big vs small".


In [None]:
# Source column -> feature name for the size/market fields (each also gets a log1p twin)
numeric_cols = {
    "Employees Total": "employees_total",
    "Employees Single Site": "employees_single_site",
    "Revenue (USD)": "revenue_usd",
    "Market Value (USD)": "market_value_usd",
    "Corporate Family Members": "corporate_family_members",
}

for col in numeric_cols:
    if col not in df.columns:
        continue
    add_numeric_feature(features, df[col], numeric_cols[col], log=True)

# Company age, measured against a fixed reference year
CURRENT_YEAR = 2026
if "Year Found" in df.columns:
    year_found = safe_to_numeric(df["Year Found"])
    company_age = CURRENT_YEAR - year_found
    # Ages outside [0, 300] are treated as missing.
    company_age = company_age.where(company_age.between(0, 300))
    features["company_age_missing"] = company_age.isna().astype(int)
    fill_value = company_age.median()
    fill_value = 0 if pd.isna(fill_value) else fill_value
    company_age_filled = company_age.fillna(fill_value)
    features["company_age"] = company_age_filled
    features["log_company_age"] = np.log1p(company_age_filled.clip(lower=0))

# Per-employee ratios; a zero headcount yields 0 rather than a division error.
if "employees_total" in features.columns:
    denom = features["employees_total"].replace(0, np.nan)
    for _ratio_name, _numerator in (
        ("employee_concentration", "employees_single_site"),
        ("revenue_per_employee", "revenue_usd"),
        ("market_value_per_employee", "market_value_usd"),
    ):
        if _numerator in features.columns:
            features[_ratio_name] = (features[_numerator] / denom).fillna(0)


## 6) Geography + multinational heuristics

We avoid one-hot encoding high-cardinality city names. Use country/region + parent/ultimate country comparisons.


In [None]:
# Coordinates ("Lattitude" is the column spelling used by the dataset)
if "Lattitude" in df.columns:
    add_numeric_feature(features, df["Lattitude"], "latitude", log=False)

if "Longitude" in df.columns:
    add_numeric_feature(features, df["Longitude"], "longitude", log=False)


def _normalized_or_na(col):
    """Return the normalized text of ``df[col]``, or an all-NA string Series if absent."""
    if col in df.columns:
        return normalize_text(df[col])
    # Same dtype as normalize_text's output so later comparisons behave uniformly.
    return pd.Series(pd.NA, index=df.index, dtype="string")


def _present_or_false(col):
    """Return the presence mask of ``df[col]``, or all-False if the column is absent."""
    if col in df.columns:
        return has_value(df[col])
    return pd.Series(False, index=df.index)


def _foreign_flag(owner_present, owner_country):
    """1 where the owner exists, its country is known, and it differs from the entity's.

    Comparing against a missing entity country yields <NA>; that is treated as
    "not foreign" so the integer cast cannot fail on missing values.
    """
    differs = (owner_country != entity_country).fillna(False)
    return (owner_present & owner_country.notna() & differs).astype(int)


entity_country = _normalized_or_na("Country")
parent_country = _normalized_or_na("Parent Country/Region")
global_country = _normalized_or_na("Global Ultimate Country Name")

parent_present = _present_or_false("Parent Company")
global_present = _present_or_false("Global Ultimate Company")

features["parent_foreign_flag"] = _foreign_flag(parent_present, parent_country)
features["global_ultimate_foreign_flag"] = _foreign_flag(global_present, global_country)
features["multinational_flag"] = ((features["parent_foreign_flag"] == 1) | (features["global_ultimate_foreign_flag"] == 1)).astype(int)

# Distinct non-missing countries among entity / parent / global ultimate
countries_df = pd.concat([entity_country, parent_country, global_country], axis=1)
features["num_countries_reported"] = countries_df.nunique(axis=1, dropna=True)


## 7) IT / operational footprint signals

We parse IT asset ranges into midpoints, then build intensity & composition signals.


In [None]:
def parse_range_or_numeric(value):
    """Turn a raw cell into a float: plain numbers pass through, strings are parsed.

    Strings are first tried as a range (midpoint via ``range_to_midpoint``) and
    then as a plain number. Anything else, including NaN, yields ``np.nan``.
    """
    is_number = isinstance(value, (int, float))
    if is_number and not pd.isna(value):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    midpoint = range_to_midpoint(value)
    if pd.isna(midpoint):
        # Not a range: fall back to reading the text as a single number.
        try:
            return float(value.strip())
        except ValueError:
            return np.nan
    return midpoint

# IT asset source column -> midpoint feature name
asset_cols_map = {
    "No. of PC": "pc_midpoint",
    "No. of Desktops": "desktops_midpoint",
    "No. of Laptops": "laptops_midpoint",
    "No. of Routers": "routers_midpoint",
    "No. of Servers": "servers_midpoint",
    "No. of Storage Devices": "storage_devices_midpoint",
}

for col in asset_cols_map:
    if col not in df.columns:
        continue
    series = df[col].apply(parse_range_or_numeric)
    add_numeric_feature(features, series, asset_cols_map[col], log=True)

# Total footprint = sum of whichever midpoint features were built.
asset_feature_cols = [feat for feat in asset_cols_map.values() if feat in features.columns]
if asset_feature_cols:
    features["it_assets_total"] = features[asset_feature_cols].sum(axis=1)
    features["log_it_assets_total"] = np.log1p(features["it_assets_total"])

for col, name in (("IT Budget", "it_budget"), ("IT spend", "it_spend")):
    if col in df.columns:
        add_numeric_feature(features, df[col], name, log=True)

if {"it_budget", "it_spend"}.issubset(features.columns):
    denom = features["it_budget"].replace(0, np.nan)
    _spend_rate = (features["it_spend"] / denom).fillna(0)
    # Spend rate is bounded to [0, 3].
    features["it_spend_rate"] = _spend_rate.clip(lower=0, upper=3)
    features["it_budget_gap"] = features["it_budget"] - features["it_spend"]
    features["log_abs_it_budget_gap"] = np.log1p(features["it_budget_gap"].abs())

# Per-employee intensities; a zero headcount yields 0.
if "employees_total" in features.columns:
    denom = features["employees_total"].replace(0, np.nan)
    for _numerator, _ratio_name in (
        ("it_assets_total", "it_assets_per_employee"),
        ("it_spend", "it_spend_per_employee"),
    ):
        if _numerator in features.columns:
            features[_ratio_name] = (features[_numerator] / denom).fillna(0)


## 8) Industry code features (low-cardinality sector buckets)

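Full industry codes can be high-cardinality. For clustering/PCA, we use **2-digit buckets** (industry sectors). Note: the cell below currently derives only a per-scheme **code count** and a **presence flag** for each code column; the 2-digit bucketing itself is not built in this cell.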


In [None]:
# Source column -> feature prefix for each industry-code scheme.
# Named separately from the `code_cols` list used for the missingness ratios so
# that re-running that earlier cell still finds a list, not this dict.
code_prefix_map = {
    "SIC Code": "sic_code",
    "8-Digit SIC Code": "sic8_code",
    "NAICS Code": "naics_code",
    "NACE Rev 2 Code": "nace2_code",
    "ANZSIC Code": "anzsic_code",
    "ISIC Rev 4 Code": "isic4_code",
}


def _count_items(column):
    """Count delimited items per row; missing cells count as zero items.

    Missing cells are blanked before counting so they are never stringified
    (e.g. to "<NA>") and mistaken for a single code.
    """
    return column.astype("string").fillna("").apply(count_delimited_items)


for col, prefix in code_prefix_map.items():
    if col in df.columns:
        counts = _count_items(df[col])
        features[f"{prefix}_count"] = counts
        features[f"has_{prefix}"] = (counts > 0).astype(int)

if "Registration Number" in df.columns:
    reg_counts = _count_items(df["Registration Number"])
    features["registration_number_count"] = reg_counts


## 9) Categorical encoding (PCA/clustering ready)

We one-hot encode selected low-cardinality categoricals + sector buckets.


In [None]:
# Lower-cased text tokens recognised as true (1) or false (0)
bool_map = {
    **dict.fromkeys(("true", "yes", "y", "1"), 1),
    **dict.fromkeys(("false", "no", "n", "0"), 0),
}


def add_boolean_feature(col, name):
    """Add a 0/1 feature plus a ``{name}_missing`` flag for a boolean-like column.

    Values are trimmed, lower-cased and looked up in ``bool_map``; anything not
    recognised is stored as 0 and flagged as missing. Columns not in ``df`` are skipped.
    """
    if col in df.columns:
        tokens = df[col].astype("string").str.strip().str.lower()
        mapped = tokens.map(bool_map)
        features[name] = mapped.fillna(0).astype(int)
        features[f"{name}_missing"] = mapped.isna().astype(int)

add_boolean_feature("Is Headquarters", "is_headquarters")
add_boolean_feature("Is Domestic Ultimate", "is_domestic_ultimate")

candidate_categoricals = [
    "Region",
    "Entity Type",
    "Ownership Type",
    "Legal Status",
    "Franchise Status",
    "Manufacturing Status",
    "Registration Number Type",
]

# Keep only candidates that exist and have at most 20 distinct non-missing values.
categorical_cols = [
    col for col in candidate_categoricals
    if col in df.columns and df[col].nunique(dropna=True) <= 20
]

if categorical_cols:
    df_cats = df[categorical_cols].copy()
    for col in categorical_cols:
        df_cats[col] = df_cats[col].astype("string").str.strip()
    # dtype=int pins the indicators to 0/1 integers; recent pandas defaults to
    # bool, which would put True/False into the "numeric" raw matrix and its CSV.
    df_dummies = pd.get_dummies(
        df_cats,
        prefix=categorical_cols,
        prefix_sep="__",
        dummy_na=True,
        dtype=int,
    )
else:
    df_dummies = pd.DataFrame(index=df.index)

print("Categorical columns encoded:", categorical_cols)
print("One-hot shape:", df_dummies.shape)


After encoding: (8559, 422)


## 10) Build final feature matrices (raw + scaled)

- Drop obvious identifiers & free-text
- Keep numeric + engineered features + one-hot columns
- Create scaled matrix for PCA/clustering


In [None]:
# Assemble the raw matrix: engineered features + one-hot columns, minus
# fully-empty columns, with every remaining column coerced to numeric.
df_features_raw = (
    pd.concat([features, df_dummies], axis=1)
    .dropna(axis=1, how="all")
    .apply(pd.to_numeric, errors="coerce")
)

print("Feature matrix shape (pre-impute):", df_features_raw.shape)


Feature matrix shape: (8559, 68)
Any NaNs in features? True


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
# Median-fill whatever NaNs are left (0 when a column has no median)
nan_cols = df_features_raw.columns[df_features_raw.isna().any()].tolist()
if nan_cols:
    print("NaNs found in raw features. Top columns:", nan_cols[:20])
    for col in nan_cols:
        series = df_features_raw[col]
        fill_value = series.median()
        fill_value = 0 if pd.isna(fill_value) else fill_value
        df_features_raw[col] = series.fillna(fill_value)

assert not df_features_raw.isna().any().any(), "NaNs remain in raw features"

# Standardize every column for PCA/clustering, keeping labels and row index.
scaler = StandardScaler()
_scaled_values = scaler.fit_transform(df_features_raw)
df_features_scaled = pd.DataFrame(
    _scaled_values,
    index=df_features_raw.index,
    columns=df_features_raw.columns,
)

assert not df_features_scaled.isna().any().any(), "NaNs remain in scaled features"


Saved: nan_report_member2.csv


Unnamed: 0,feature_name,nan_count,nan_pct
0,Company Sites,8559,1.0
1,log_company_sites,8559,1.0
2,Ticker,8555,0.999533
3,ANZSIC Code,7133,0.833392
4,NACE Rev 2 Code,7045,0.82311
5,ISIC Rev 4 Code,7045,0.82311
6,Lattitude,6649,0.776843
7,Longitude,6647,0.776609
8,NAICS Code,5387,0.629396
9,8-Digit SIC Code,5309,0.620283


In [None]:
def find_first_column(candidates):
    """Return the first candidate name that is a column of ``df``, else ``None``."""
    return next((name for name in candidates if name in df.columns), None)

# Find the DUNS column regardless of casing or surrounding whitespace.
duns_col = next(
    (candidate for candidate in df.columns if candidate.strip().lower() == "duns number"),
    None,
)

name_col = find_first_column(["Company Name", "Company", "Company Sites"])

# Map each feature-matrix row back to identifiers from the source frame.
row_id_map = pd.DataFrame({"row_index": df.index})
for _label, _source in (("DUNS Number", duns_col), ("Company Name", name_col)):
    if _source:
        row_id_map[_label] = df[_source]

print("Row id map columns:", row_id_map.columns.tolist())


Rows with at least one NaN: 8559
Saved: nan_rows_sample_member2.csv


Unnamed: 0,Company Sites,log_company_sites,Ticker,ANZSIC Code,NACE Rev 2 Code,ISIC Rev 4 Code,Lattitude,Longitude,NAICS Code,8-Digit SIC Code,storage_devices_midpoint,servers_midpoint,it_spend_rate,routers_midpoint,it_spend_per_employee,log_it_assets_per_employee,log_it_spend_per_employee,it_assets_per_employee,company_age,Year Found
0,,,,3322.0,4672.0,4662.0,,,423510.0,50510000.0,,,,5.5,0.0,3.135494,0.0,22.0,3.0,2023.0
1,,,,,,,,,,,5.5,5.5,0.619889,5.5,173.6,0.97456,5.162498,1.65,18.0,2008.0
2,,,,,,,47.34088,123.96045,311411.0,20370000.0,5.5,5.5,0.619999,5.5,605.404494,0.501796,6.407547,0.651685,13.0,2013.0
3,,,,,,,,,,,,,0.619948,,,,,,14.0,2012.0
4,,,,,,,,,,,5.5,5.5,0.619993,5.5,29314.5,2.862201,10.285872,16.5,15.0,2011.0


In [None]:
# Hand-written descriptions keyed by engineered feature name.
# Names not listed here (e.g. *_missing indicators, log_* twins, one-hot
# columns) get a generated description from describe_feature instead.
feature_descriptions = {
    "has_website": "1 if Website is present",
    "has_phone": "1 if Phone Number is present",
    "has_address": "1 if Address Line 1 is present",
    "has_city": "1 if City is present",
    "has_state": "1 if State is present",
    "has_state_abbrev": "1 if State Or Province Abbreviation is present",
    "has_postal_code": "1 if Postal Code is present",
    "has_country": "1 if Country is present",
    "has_region": "1 if Region is present",
    "has_company_name": "1 if Company Sites (company name) is present",
    "has_parent": "1 if Parent Company is present",
    "has_global_ultimate": "1 if Global Ultimate Company is present",
    "has_domestic_ultimate": "1 if Domestic Ultimate Company is present",
    "has_ticker": "1 if Ticker is present",
    "has_registration_number": "1 if Registration Number is present",
    "has_company_description": "1 if Company Description is present",
    "has_legal_status": "1 if Legal Status is present",
    "has_ownership_type": "1 if Ownership Type is present",
    "has_entity_type": "1 if Entity Type is present",
    "company_status_binary": "1 if Company Status is active, 0 if inactive or missing",
    "has_company_status": "1 if Company Status is present",
    "credibility_score": "Count of core transparency/ownership flags present",
    "credibility_score_norm": "Credibility score normalized by number of flags",
    "missing_ratio_credibility": "1 - credibility_score_norm",
    "missing_ratio_contact": "Missing ratio across contact fields",
    "missing_ratio_ownership": "Missing ratio across ownership fields",
    "missing_ratio_financial": "Missing ratio across financial fields",
    "missing_ratio_it": "Missing ratio across IT fields",
    "missing_ratio_codes": "Missing ratio across industry code fields",
    "missing_ratio_overall": "Missing ratio across key groups",
    "org_complexity_count": "Count of entities sharing the same global ultimate",
    "log_org_complexity_count": "log1p of org_complexity_count",
    "employees_total": "Employees total (imputed)",
    "employees_single_site": "Employees single site (imputed)",
    "revenue_usd": "Revenue USD (imputed)",
    "market_value_usd": "Market value USD (imputed)",
    "corporate_family_members": "Corporate family members (imputed)",
    "company_age": "Company age in years (imputed)",
    "employee_concentration": "Employees single site divided by employees total",
    "revenue_per_employee": "Revenue per employee (USD)",
    "market_value_per_employee": "Market value per employee (USD)",
    "latitude": "Latitude (imputed)",
    "longitude": "Longitude (imputed)",
    "parent_foreign_flag": "1 if parent country differs from entity country",
    "global_ultimate_foreign_flag": "1 if global ultimate country differs from entity country",
    "multinational_flag": "1 if parent or global ultimate is foreign",
    "num_countries_reported": "Count of unique countries reported (entity/parent/global)",
    "pc_midpoint": "Midpoint of PC count range (imputed)",
    "desktops_midpoint": "Midpoint of desktop count range (imputed)",
    "laptops_midpoint": "Midpoint of laptop count range (imputed)",
    "routers_midpoint": "Midpoint of router count range (imputed)",
    "servers_midpoint": "Midpoint of server count range (imputed)",
    "storage_devices_midpoint": "Midpoint of storage device count range (imputed)",
    "it_assets_total": "Sum of IT asset midpoints",
    "log_it_assets_total": "log1p of it_assets_total",
    "it_budget": "IT budget (imputed)",
    "it_spend": "IT spend (imputed)",
    "it_spend_rate": "IT spend divided by IT budget (clipped)",
    "it_budget_gap": "IT budget minus IT spend",
    "log_abs_it_budget_gap": "log1p of absolute IT budget gap",
    "it_assets_per_employee": "IT assets per employee",
    "it_spend_per_employee": "IT spend per employee",
    "sic_code_count": "Number of SIC codes (split on delimiters)",
    "sic8_code_count": "Number of 8-digit SIC codes (split on delimiters)",
    "naics_code_count": "Number of NAICS codes (split on delimiters)",
    "nace2_code_count": "Number of NACE Rev 2 codes (split on delimiters)",
    "anzsic_code_count": "Number of ANZSIC codes (split on delimiters)",
    "isic4_code_count": "Number of ISIC Rev 4 codes (split on delimiters)",
    "has_sic_code": "1 if SIC Code is present",
    "has_sic8_code": "1 if 8-digit SIC Code is present",
    "has_naics_code": "1 if NAICS Code is present",
    "has_nace2_code": "1 if NACE Rev 2 Code is present",
    "has_anzsic_code": "1 if ANZSIC Code is present",
    "has_isic4_code": "1 if ISIC Rev 4 Code is present",
    "registration_number_count": "Number of registration numbers (split on delimiters)",
}

def describe_feature(name):
    """Return a short description for a feature column.

    Curated entries in ``feature_descriptions`` win. Otherwise the text is
    derived from the naming pattern, checked in this order: ``*_missing``
    suffix, ``log_`` prefix, one-hot ``base__value``; anything else gets a
    generic label.
    """
    try:
        return feature_descriptions[name]
    except KeyError:
        pass

    missing_suffix = "_missing"
    if name.endswith(missing_suffix):
        return f"Missing indicator for {name[:-len(missing_suffix)]}"

    log_prefix = "log_"
    if name.startswith(log_prefix):
        return f"log1p of {name[len(log_prefix):]}"

    base, separator, val = name.partition("__")
    if not separator:
        return "Derived numeric feature"

    # One-hot columns for the NA category are named with "nan" or "<NA>".
    if val.strip().lower() in ("nan", "<na>"):
        return f"Missing indicator for {base}"
    return f"One-hot: {base} = {val}"

# One row per column of the raw matrix: feature name and its description.
_feature_names = df_features_raw.columns
feature_dict = pd.DataFrame({
    "feature": _feature_names,
    "description": list(map(describe_feature, _feature_names)),
})


Saved: feature_dtypes_member2.csv


Unnamed: 0,feature_name,dtype
0,Company Sites,float64
1,Employees Single Site,float64
2,Employees Total,int64
3,Revenue (USD),int64
4,SIC Code,int64


## 11) Export for Member 3 (optional)

The cell below writes the raw and scaled feature matrices, the row-id map, and the feature dictionary to disk. Run it once Member 1's clean_base.csv is ready.


In [None]:
# Matrices and the row map go to the processed-data folder, the dictionary to docs/.
OUTPUT_DIR = DEST_DIR
DOCS_DIR = ROOT / "docs"
DOCS_DIR.mkdir(parents=True, exist_ok=True)

features_raw_path = OUTPUT_DIR / "features_for_clustering_raw.csv"
features_scaled_path = OUTPUT_DIR / "features_for_clustering_scaled.csv"
row_id_map_path = OUTPUT_DIR / "row_id_map.csv"
feature_dict_path = DOCS_DIR / "feature_dictionary_member2.csv"

_exports = (
    (df_features_raw, features_raw_path),
    (df_features_scaled, features_scaled_path),
    (row_id_map, row_id_map_path),
    (feature_dict, feature_dict_path),
)

# Write every file first, then report the paths.
for _frame, _target in _exports:
    _frame.to_csv(_target, index=False)
for _frame, _target in _exports:
    print("Saved:", _target)

# Summary of what was handed off
print("clean_base shape:", df.shape)
print("Raw features shape:", df_features_raw.shape)
print("Scaled features shape:", df_features_scaled.shape)
print("Any NaNs in raw features?", df_features_raw.isna().any().any())
print("Any NaNs in scaled features?", df_features_scaled.isna().any().any())
print("Number of feature columns:", df_features_raw.shape[1])

# Both matrices must have exactly one row per source row.
_n_rows = df.shape[0]
assert df_features_raw.shape[0] == _n_rows
assert df_features_scaled.shape[0] == _n_rows
