# Member 2 Feature Engineering (Signal Construction)

**Goal:** Turn company attributes into **numeric, comparable, explainable signals** for PCA + clustering.

**Scope:** Start from Member 1's cleaned dataset and create features without global cleaning decisions.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import re


In [None]:
# Show wide frames in full when inspecting the engineered features.
pd.set_option("display.max_columns", 250)

# Project root: when the notebook is launched from a "notebooks" folder,
# step up one level so the data/ and docs/ folders resolve correctly.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name.lower() == "notebooks" else _cwd

# Input and output locations (outputs are written next to the cleaned input).
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
SOURCE_PATH = PROCESSED_DIR / "cleaned_base.csv"
OUTPUT_DIR = PROCESSED_DIR
DOCS_DIR = ROOT / "docs"

# Run-time switches: reference year for company age, and opt-in export/plots.
CURRENT_YEAR = 2026
EXPORT_FEATURES = False
PLOT_FEATURE_OVERVIEW = False


## Data Loading
Load the cleaned base dataset produced by Member 1.


In [None]:
# Load the cleaned base table, failing fast with a clear message when the
# expected input file has not been produced yet.
if SOURCE_PATH.exists():
    df = pd.read_csv(SOURCE_PATH)
else:
    raise FileNotFoundError(f"Missing source file: {SOURCE_PATH}")


## Cleaning
Apply lightweight, non-destructive cleanup to prepare for feature construction.


In [None]:
# Rebind df to a copy so the header edit below does not touch the loaded frame object.
df = df.copy()
# Trim stray whitespace from header names; later cells look columns up by exact name.
df.columns = df.columns.str.strip()


## Functions and Helpers
Reusable helpers for parsing, normalization, and feature construction.


In [None]:
def has_value(series: pd.Series) -> pd.Series:
    """Flag the entries of *series* that hold non-blank content.

    Values are cast to pandas' string dtype and stripped first, so missing
    entries and whitespace-only strings both come out as False.
    """
    stripped = series.astype("string").str.strip()
    is_present = stripped.notna()
    is_non_empty = stripped.ne("")
    return is_present & is_non_empty

def normalize_text(series: pd.Series) -> pd.Series:
    """Return *series* as stripped, upper-cased strings with blanks set to NA.

    Used to build comparison keys, so that values differing only in case or
    surrounding whitespace compare equal.
    """
    upper = series.astype("string").str.strip().str.upper()
    blank_to_na = {"": pd.NA}
    return upper.replace(blank_to_na)

def range_to_midpoint(value):
    """Return the midpoint of a textual range such as "1 to 10" or "10-20".

    The text is split on "to" when it contains it, otherwise on "-". The
    first two pieces are parsed as floats and averaged. Non-strings, strings
    without a separator, and pieces that do not parse all yield NaN.
    """
    if not isinstance(value, str):
        return np.nan

    # "to" takes precedence over "-" when both appear in the text.
    separator = next((sep for sep in ("to", "-") if sep in value), None)
    if separator is None:
        return np.nan

    pieces = value.split(separator)
    if len(pieces) < 2:
        return np.nan
    try:
        lower, upper = (float(piece.strip()) for piece in pieces[:2])
    except ValueError:
        return np.nan
    return (lower + upper) / 2

# Item lists may be separated by semicolons, commas, or pipes; a run of
# separators counts as a single split point.
_split_re = re.compile(r"[;,|]+")

def count_delimited_items(value) -> int:
    """Count the non-blank items in a delimited value.

    Args:
        value: Text such as "7371;7372, 7373". Non-string values are
            converted with ``str`` before counting.

    Returns:
        The number of non-empty pieces after splitting on ``;``, ``,`` or
        ``|``. Returns 0 for ``None``, missing scalars (``pd.NA``, NaN, NaT)
        and blank text.
    """
    if value is None:
        return 0
    if not isinstance(value, str):
        # Missing scalars are not None; without this check they would be
        # stringified to "<NA>" / "nan" and wrongly counted as one item.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 0
        value = str(value)
    text = value.strip()
    if not text:
        return 0
    return sum(1 for piece in _split_re.split(text) if piece.strip())

def safe_to_numeric(series: pd.Series) -> pd.Series:
    """Convert *series* to numbers, turning unparseable entries into NaN."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

def add_numeric_feature(features: pd.DataFrame, series: pd.Series, name: str, log: bool = True):
    """Add an imputed numeric column and its companions to *features* in place.

    Three columns may be written, in this order:
      * ``{name}_missing`` - 1 where the value was missing or non-numeric.
      * ``{name}`` - the numeric value, gaps filled with the median of the
        observed values (or 0 when nothing was observed).
      * ``log_{name}`` - ``log1p`` of the filled value with negatives clipped
        to 0; only written when *log* is true.

    Args:
        features: Frame that receives the new columns.
        series: Raw values; coerced to numeric before use.
        name: Base name for the generated columns.
        log: Whether to add the ``log_{name}`` column.
    """
    numeric = safe_to_numeric(series)
    is_missing = numeric.isna()
    features[f"{name}_missing"] = is_missing.astype(int)

    # Median of the observed values; fall back to 0 when every value is missing.
    fill_value = 0 if is_missing.all() else numeric.median()
    filled = numeric.fillna(fill_value)
    features[name] = filled

    if not log:
        return
    features[f"log_{name}"] = np.log1p(filled.clip(lower=0))

def missing_ratio_for(df: pd.DataFrame, cols):
    """Return, per row, the share of the given columns that are missing.

    Only the names in *cols* that exist in *df* are considered. Numeric
    columns count as present when not NaN; all other columns count as present
    when they hold non-blank text. When none of *cols* exists, a Series of
    zeros is returned.
    """
    available = [name for name in cols if name in df.columns]
    if not available:
        return pd.Series(0, index=df.index)

    def _presence(column: pd.Series) -> pd.Series:
        # Numeric columns cannot be "blank", so a plain NaN check is enough.
        if pd.api.types.is_numeric_dtype(column):
            return column.notna()
        return has_value(column)

    presence = pd.DataFrame({name: _presence(df[name]) for name in available})
    return 1 - presence.mean(axis=1)

def parse_range_or_numeric(value):
    """Parse a value that is either a number or a textual range.

    Strings are first tried as a range (midpoint returned) and then as a
    plain number. Non-missing ints and floats are returned as floats.
    Anything else, including missing values, yields NaN.
    """
    if isinstance(value, str):
        midpoint = range_to_midpoint(value)
        if pd.isna(midpoint):
            # Not a range; fall back to treating the text as a single number.
            try:
                return float(value.strip())
            except ValueError:
                return np.nan
        return midpoint

    if isinstance(value, (int, float)) and not pd.isna(value):
        return float(value)
    return np.nan

def add_has(df: pd.DataFrame, features: pd.DataFrame, col: str, feature_name: str):
    """Write a 0/1 presence flag for *col* into *features* in place.

    The flag is 1 where the source value holds non-blank content. Nothing is
    written when *col* does not exist in *df*.
    """
    if col not in df.columns:
        return
    present = has_value(df[col])
    features[feature_name] = present.astype(int)

# Lower-cased text spellings of true/false and their 0/1 encoding.
_bool_map = {
    "true": 1, "false": 0, "yes": 1, "no": 0, "y": 1, "n": 0, "1": 1, "0": 0,
    # A 0/1 column that contains gaps is loaded as float, so its values
    # stringify as "1.0" / "0.0" rather than "1" / "0".
    "1.0": 1, "0.0": 0,
}

def add_boolean_feature(df: pd.DataFrame, features: pd.DataFrame, col: str, name: str):
    """Encode a boolean-like column as 0/1 plus a missing indicator, in place.

    Values are compared case-insensitively after stripping whitespace. Two
    columns are written to *features*:
      * ``{name}`` - 1 for a recognized true value, otherwise 0.
      * ``{name}_missing`` - 1 where the value is missing or not recognized.

    Nothing is written when *col* does not exist in *df*.
    """
    if col not in df.columns:
        return
    s = df[col].astype("string").str.strip().str.lower()
    mapped = s.map(_bool_map)
    # Unrecognized and missing values default to 0; the indicator keeps them
    # distinguishable from a genuine "false".
    features[name] = mapped.fillna(0).astype(int)
    features[f"{name}_missing"] = mapped.isna().astype(int)

def find_first_column(df: pd.DataFrame, candidates):
    """Return the first name in *candidates* that is a column of *df*, else None."""
    return next((name for name in candidates if name in df.columns), None)


## Core Transparency and Structure Signals
Create low-risk, interpretable indicators for traceability and entity structure.


In [None]:
features = pd.DataFrame(index=df.index)

# Source column -> presence-flag name, listed in output-column order.
presence_flags = {
    # Transparency / traceability
    "Website": "has_website",
    "Phone Number": "has_phone",
    "Address Line 1": "has_address",
    "City": "has_city",
    "State": "has_state",
    "State Or Province Abbreviation": "has_state_abbrev",
    "Postal Code": "has_postal_code",
    "Country": "has_country",
    "Region": "has_region",
    # Company name (column appears labeled as Company Sites in this dataset)
    "Company Sites": "has_company_name",
    # Ownership / structure
    "Parent Company": "has_parent",
    "Global Ultimate Company": "has_global_ultimate",
    "Domestic Ultimate Company": "has_domestic_ultimate",
    # Verifiability extras
    "Ticker": "has_ticker",
    "Registration Number": "has_registration_number",
    "Company Description": "has_company_description",
    "Legal Status": "has_legal_status",
    "Ownership Type": "has_ownership_type",
    "Entity Type": "has_entity_type",
}
for source_col, flag_name in presence_flags.items():
    add_has(df, features, source_col, flag_name)

# Company status (value-coded): only the exact values "active" / "inactive"
# (case-insensitive) are recognized; anything else is treated as absent.
status_col = "Company Status (Active/Inactive)"
if status_col in df.columns:
    status_map = {"active": 1, "inactive": 0}
    status_text = df[status_col].astype("string").str.strip().str.lower()
    status_code = status_text.map(status_map)
    features["company_status_binary"] = status_code.fillna(0)
    features["has_company_status"] = status_code.notna().astype(int)


## Credibility and Missingness Signals
Summarize completeness of key fields and compute group-wise missingness ratios.


In [None]:
# Presence flags that feed the credibility score (only those actually built).
candidate_credibility_flags = (
    "has_website", "has_address", "has_phone",
    "has_ticker", "has_parent", "has_global_ultimate", "has_domestic_ultimate",
    "has_registration_number", "has_company_description",
    "has_company_status",
)
credibility_flag_cols = [flag for flag in candidate_credibility_flags if flag in features.columns]

if credibility_flag_cols:
    score = features[credibility_flag_cols].sum(axis=1)
    features["credibility_score"] = score
    # Normalize by the number of flags available so the score lies in [0, 1].
    features["credibility_score_norm"] = score / len(credibility_flag_cols)
    features["missing_ratio_credibility"] = 1 - features["credibility_score_norm"]

# Source columns grouped by theme for the per-group missingness ratios.
contact_cols = [
    "Website", "Phone Number", "Address Line 1", "City", "State",
    "State Or Province Abbreviation", "Postal Code", "Country", "Region"
]
ownership_cols = [
    "Parent Company", "Parent Country/Region",
    "Global Ultimate Company", "Global Ultimate Country Name",
    "Domestic Ultimate Company"
]
financial_cols = [
    "Employees Single Site", "Employees Total", "Revenue (USD)", "Market Value (USD)",
    "Corporate Family Members", "Year Found"
]
it_cols = [
    "No. of PC", "No. of Desktops", "No. of Laptops", "No. of Routers",
    "No. of Servers", "No. of Storage Devices", "IT Budget", "IT spend"
]
code_cols = [
    "SIC Code", "8-Digit SIC Code", "NAICS Code", "NACE Rev 2 Code",
    "ANZSIC Code", "ISIC Rev 4 Code"
]

missing_groups = {
    "contact": contact_cols,
    "ownership": ownership_cols,
    "financial": financial_cols,
    "it": it_cols,
    "codes": code_cols,
}
for group_name, group_cols in missing_groups.items():
    features[f"missing_ratio_{group_name}"] = missing_ratio_for(df, group_cols)

# Overall ratio across every grouped source column.
all_missing_cols = contact_cols + ownership_cols + financial_cols + it_cols + code_cols
features["missing_ratio_overall"] = missing_ratio_for(df, all_missing_cols)


## Organizational Complexity
Estimate group size based on global ultimate ownership.


In [None]:
# Group size is only computed when the global-ultimate column exists.
if "Global Ultimate Company" in df.columns:
    # Stripped, upper-cased names so case/whitespace variants share one key; blanks become NA.
    key = normalize_text(df["Global Ultimate Company"])
    # For every row, the number of rows that share its key.
    group_sizes = key.groupby(key).transform("size")
    # Rows whose size comes back missing get 0.
    # NOTE(review): presumably these are the rows with an NA key (no global ultimate) - confirm with the pandas version in use.
    features["org_complexity_count"] = group_sizes.fillna(0).astype(int)
    # log1p keeps a count of 0 at 0 while compressing very large groups.
    features["log_org_complexity_count"] = np.log1p(features["org_complexity_count"])


## Scale and Market Signals
Normalize size metrics and compute scale-related ratios.


In [None]:
# Source column -> feature name for the size metrics (imputed and log-scaled).
numeric_cols = {
    "Employees Total": "employees_total",
    "Employees Single Site": "employees_single_site",
    "Revenue (USD)": "revenue_usd",
    "Market Value (USD)": "market_value_usd",
    "Corporate Family Members": "corporate_family_members",
}

for source_col, feature_name in numeric_cols.items():
    if source_col not in df.columns:
        continue
    add_numeric_feature(features, df[source_col], feature_name, log=True)

if "Year Found" in df.columns:
    raw_age = CURRENT_YEAR - safe_to_numeric(df["Year Found"])
    # Ages outside 0-300 years are treated as missing.
    plausible = (raw_age >= 0) & (raw_age <= 300)
    company_age = raw_age.where(plausible)
    features["company_age_missing"] = company_age.isna().astype(int)

    median_age = company_age.median()
    company_age_filled = company_age.fillna(0 if pd.isna(median_age) else median_age)
    features["company_age"] = company_age_filled
    features["log_company_age"] = np.log1p(company_age_filled.clip(lower=0))

# Per-employee ratios share one denominator; a zero headcount yields ratio 0.
if "employees_total" in features.columns:
    denom = features["employees_total"].replace(0, np.nan)
    per_employee_ratios = (
        ("employees_single_site", "employee_concentration"),
        ("revenue_usd", "revenue_per_employee"),
        ("market_value_usd", "market_value_per_employee"),
    )
    for numerator, ratio_name in per_employee_ratios:
        if numerator in features.columns:
            features[ratio_name] = (features[numerator] / denom).fillna(0)


## Geography and Multinational Heuristics
Derive country-level comparisons and multinational indicators.


In [None]:
# Coordinates are imputed but kept on their raw scale (log=False).
# The lookup has always used the misspelled header "Lattitude"; the
# conventional spelling is accepted as a fallback so the feature is not
# silently skipped when the header is spelled correctly.
lat_col = next((c for c in ("Lattitude", "Latitude") if c in df.columns), None)
if lat_col is not None:
    add_numeric_feature(features, df[lat_col], "latitude", log=False)

if "Longitude" in df.columns:
    add_numeric_feature(features, df["Longitude"], "longitude", log=False)

# Fallback for absent country columns. It uses the same nullable string dtype
# as normalize_text so that the comparisons below behave identically whether
# or not a column exists (object-dtype NA values cannot be compared safely).
na_country = pd.Series(pd.NA, index=df.index, dtype="string")

entity_country = normalize_text(df["Country"]) if "Country" in df.columns else na_country
parent_country = normalize_text(df["Parent Country/Region"]) if "Parent Country/Region" in df.columns else na_country
global_country = normalize_text(df["Global Ultimate Country Name"]) if "Global Ultimate Country Name" in df.columns else na_country

parent_present = has_value(df["Parent Company"]) if "Parent Company" in df.columns else pd.Series(False, index=df.index)
global_present = has_value(df["Global Ultimate Company"]) if "Global Ultimate Company" in df.columns else pd.Series(False, index=df.index)

# A relation is "foreign" only when both countries are known and differ.
# Comparing against a missing entity country yields <NA>, which cannot be
# cast to int, so unknown countries are explicitly resolved to "not foreign".
entity_known = entity_country.notna()
parent_differs = parent_country.notna() & entity_known & (parent_country != entity_country)
global_differs = global_country.notna() & entity_known & (global_country != entity_country)

features["parent_foreign_flag"] = (parent_present & parent_differs).fillna(False).astype(int)
features["global_ultimate_foreign_flag"] = (global_present & global_differs).fillna(False).astype(int)
features["multinational_flag"] = ((features["parent_foreign_flag"] == 1) | (features["global_ultimate_foreign_flag"] == 1)).astype(int)

# Number of distinct, non-missing countries across entity, parent, and global ultimate.
countries_df = pd.concat([entity_country, parent_country, global_country], axis=1)
features["num_countries_reported"] = countries_df.nunique(axis=1, dropna=True)


## IT and Operational Footprint Signals
Parse IT asset ranges and create intensity ratios.


In [None]:
# Source column -> feature name for IT asset counts, which may be reported as
# ranges (e.g. "1 to 10") or as plain numbers.
asset_cols_map = {
    "No. of PC": "pc_midpoint",
    "No. of Desktops": "desktops_midpoint",
    "No. of Laptops": "laptops_midpoint",
    "No. of Routers": "routers_midpoint",
    "No. of Servers": "servers_midpoint",
    "No. of Storage Devices": "storage_devices_midpoint",
}

for source_col, feature_name in asset_cols_map.items():
    if source_col in df.columns:
        parsed = df[source_col].apply(parse_range_or_numeric)
        add_numeric_feature(features, parsed, feature_name, log=True)

asset_feature_cols = [c for c in asset_cols_map.values() if c in features.columns]
if asset_feature_cols:
    it_assets_total = features[asset_feature_cols].sum(axis=1)
    features["it_assets_total"] = it_assets_total
    features["log_it_assets_total"] = np.log1p(it_assets_total)

for source_col, feature_name in (("IT Budget", "it_budget"), ("IT spend", "it_spend")):
    if source_col in df.columns:
        add_numeric_feature(features, df[source_col], feature_name, log=True)

if "it_budget" in features.columns and "it_spend" in features.columns:
    budget = features["it_budget"]
    spend = features["it_spend"]
    # A zero budget yields a rate of 0; the rate is capped at 3.
    features["it_spend_rate"] = (spend / budget.replace(0, np.nan)).fillna(0).clip(lower=0, upper=3)
    budget_gap = budget - spend
    features["it_budget_gap"] = budget_gap
    features["log_abs_it_budget_gap"] = np.log1p(budget_gap.abs())

# Per-employee intensities share one denominator; a zero headcount yields 0.
if "employees_total" in features.columns:
    denom = features["employees_total"].replace(0, np.nan)
    it_intensity_ratios = (
        ("it_assets_total", "it_assets_per_employee"),
        ("it_spend", "it_spend_per_employee"),
    )
    for numerator, ratio_name in it_intensity_ratios:
        if numerator in features.columns:
            features[ratio_name] = (features[numerator] / denom).fillna(0)


## Industry Code Features
Convert high-cardinality industry codes into compact counts and buckets.


In [None]:
# Source column -> feature prefix for each industry classification system.
code_cols = {
    "SIC Code": "sic_code",
    "8-Digit SIC Code": "sic8_code",
    "NAICS Code": "naics_code",
    "NACE Rev 2 Code": "nace2_code",
    "ANZSIC Code": "anzsic_code",
    "ISIC Rev 4 Code": "isic4_code",
}

# Pieces are collected and concatenated once after the loop, keeping the
# original column order (count, then sector dummies, per code system).
code_feature_parts = []
for col, prefix in code_cols.items():
    if col not in df.columns:
        continue

    codes = df[col].astype("string").str.strip()

    # Missing codes must count as zero items. Passing NA through to the
    # counter would stringify it to "<NA>" and report one code.
    counts = codes.fillna("").apply(count_delimited_items).astype(int)
    code_feature_parts.append(counts.rename(f"{prefix}_count"))

    # Sector bucket = first two characters of the code text; blank and
    # missing codes fall into the NaN dummy column.
    # NOTE(review): if codes were parsed as numbers upstream, leading zeros
    # are lost and this prefix shifts - confirm codes are read as text.
    sector = codes.str[:2].replace({"": pd.NA})
    code_feature_parts.append(
        pd.get_dummies(sector, prefix=f"{prefix}_sector", dummy_na=True)
    )

if code_feature_parts:
    features = pd.concat([features, *code_feature_parts], axis=1)


## Categorical Encoding
Create low-cardinality one-hot features and boolean flags.


In [None]:
# Boolean-like source columns -> 0/1 flag plus a missing indicator.
for source_col, flag_name in (
    ("Is Headquarters", "is_headquarters"),
    ("Is Domestic Ultimate", "is_domestic_ultimate"),
):
    add_boolean_feature(df, features, source_col, flag_name)

candidate_categoricals = [
    "Region",
    "Entity Type",
    "Ownership Type",
    "Legal Status",
    "Franchise Status",
    "Manufacturing Status",
    "Registration Number Type",
]

# One-hot encode only the candidates that exist and have at most 20 distinct values.
categorical_cols = [
    c for c in candidate_categoricals
    if c in df.columns and df[c].nunique(dropna=True) <= 20
]

if not categorical_cols:
    df_dummies = pd.DataFrame(index=df.index)
else:
    # Strip whitespace so values differing only in padding share one dummy column.
    df_cats = pd.DataFrame({c: df[c].astype("string").str.strip() for c in categorical_cols})
    df_dummies = pd.get_dummies(df_cats, prefix=categorical_cols, prefix_sep="__", dummy_na=True)


## Build Feature Matrices
Combine engineered features and one-hot encodings into a single raw feature matrix, drop all-empty columns, and coerce every column to numeric.


In [None]:
# Join engineered features with the one-hot columns, discard columns that are
# entirely empty, and force every remaining column to a numeric dtype.
df_features_raw = (
    pd.concat([features, df_dummies], axis=1)
    .dropna(axis=1, how="all")
    .apply(pd.to_numeric, errors="coerce")
)


## Impute Remaining NaNs and Scale
Fill any remaining missing values and build a scaled matrix for PCA/clustering.


In [None]:
# Fill whatever NaNs are left, column by column, with the column median
# (or 0 when the column has no observed value at all).
has_nan = df_features_raw.isna().any()
nan_cols = has_nan[has_nan].index.tolist()
for col in nan_cols:
    column = df_features_raw[col]
    median_value = column.median()
    df_features_raw[col] = column.fillna(0 if pd.isna(median_value) else median_value)

assert not df_features_raw.isna().any().any(), "NaNs remain in raw features"

# Standardize every column for PCA / clustering.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_features_raw)
df_features_scaled = pd.DataFrame(
    scaled_values,
    columns=df_features_raw.columns,
    index=df_features_raw.index,
)

assert not df_features_scaled.isna().any().any(), "NaNs remain in scaled features"


## Row ID Map
Create a lookup table to map features back to source identifiers.


In [None]:
# Locate the DUNS column case-insensitively; None when it is absent.
duns_col = next(
    (c for c in df.columns if c.strip().lower() == "duns number"),
    None,
)
name_col = find_first_column(df, ["Company Name", "Company", "Company Sites"])

# Lookup table from feature-matrix row back to the source identifiers.
row_id_map = pd.DataFrame({"row_index": df.index})
for label, source_col in (("DUNS Number", duns_col), ("Company Name", name_col)):
    if source_col:
        row_id_map[label] = df[source_col]


## Feature Dictionary
Document feature meanings for downstream consumers.


In [None]:
# Human-readable meaning of each engineered feature, keyed by column name.
# One-hot columns (sector and categorical dummies) are not listed individually.
feature_descriptions = {
    "has_website": "1 if Website is present",
    "has_phone": "1 if Phone Number is present",
    "has_address": "1 if Address Line 1 is present",
    "has_city": "1 if City is present",
    "has_state": "1 if State is present",
    "has_state_abbrev": "1 if State Or Province Abbreviation is present",
    "has_postal_code": "1 if Postal Code is present",
    "has_country": "1 if Country is present",
    "has_region": "1 if Region is present",
    "has_company_name": "1 if Company name is present",
    "has_parent": "1 if Parent Company is present",
    "has_global_ultimate": "1 if Global Ultimate Company is present",
    "has_domestic_ultimate": "1 if Domestic Ultimate Company is present",
    "has_ticker": "1 if Ticker is present",
    "has_registration_number": "1 if Registration Number is present",
    "has_company_description": "1 if Company Description is present",
    "has_legal_status": "1 if Legal Status is present",
    "has_ownership_type": "1 if Ownership Type is present",
    "has_entity_type": "1 if Entity Type is present",
    "company_status_binary": "1 if Company Status is Active; 0 otherwise",
    "credibility_score": "Count of key transparency signals",
    "credibility_score_norm": "Credibility score normalized to [0,1]",
    "missing_ratio_credibility": "1 - normalized credibility score",
    "missing_ratio_contact": "Missing ratio for contact fields",
    "missing_ratio_ownership": "Missing ratio for ownership fields",
    "missing_ratio_financial": "Missing ratio for financial fields",
    "missing_ratio_it": "Missing ratio for IT fields",
    "missing_ratio_codes": "Missing ratio for code fields",
    "missing_ratio_overall": "Missing ratio across all key fields",
    "org_complexity_count": "Number of companies in same global ultimate group",
    "log_org_complexity_count": "Log of org complexity count",
    "employees_total": "Employees Total (filled)",
    "employees_single_site": "Employees Single Site (filled)",
    "revenue_usd": "Revenue in USD (filled)",
    "market_value_usd": "Market Value in USD (filled)",
    "corporate_family_members": "Corporate Family Members (filled)",
    "company_age": "Company age in years",
    "employee_concentration": "Employees single site / total employees",
    "revenue_per_employee": "Revenue per employee",
    "market_value_per_employee": "Market value per employee",
    "latitude": "Latitude (filled)",
    "longitude": "Longitude (filled)",
    "parent_foreign_flag": "1 if parent country differs from entity country",
    "global_ultimate_foreign_flag": "1 if global ultimate country differs from entity country",
    "multinational_flag": "1 if any parent/ultimate is foreign",
    "num_countries_reported": "Count of unique countries reported",
    "it_assets_total": "Total IT assets midpoint sum",
    "it_budget": "IT budget (filled)",
    "it_spend": "IT spend (filled)",
    "it_spend_rate": "IT spend / IT budget",
    "it_budget_gap": "IT budget minus IT spend",
    "it_assets_per_employee": "IT assets per employee",
    "it_spend_per_employee": "IT spend per employee",
    # Features created by the notebook that previously had no entry.
    "has_company_status": "1 if Company Status is a recognized Active/Inactive value",
    "company_age_missing": "1 if Year Found is missing, non-numeric, or implies an age outside 0-300 years",
    "log_company_age": "log1p of company age",
    "log_it_assets_total": "log1p of total IT assets",
    "log_abs_it_budget_gap": "log1p of the absolute IT budget gap",
    "is_headquarters": "1 if Is Headquarters is a recognized true value; 0 otherwise",
    "is_headquarters_missing": "1 if Is Headquarters is missing or unrecognized",
    "is_domestic_ultimate": "1 if Is Domestic Ultimate is a recognized true value; 0 otherwise",
    "is_domestic_ultimate_missing": "1 if Is Domestic Ultimate is missing or unrecognized",
    "pc_midpoint": "No. of PC range midpoint or numeric value (filled)",
    "desktops_midpoint": "No. of Desktops range midpoint or numeric value (filled)",
    "laptops_midpoint": "No. of Laptops range midpoint or numeric value (filled)",
    "routers_midpoint": "No. of Routers range midpoint or numeric value (filled)",
    "servers_midpoint": "No. of Servers range midpoint or numeric value (filled)",
    "storage_devices_midpoint": "No. of Storage Devices range midpoint or numeric value (filled)",
    "sic_code_count": "Number of SIC codes listed",
    "sic8_code_count": "Number of 8-digit SIC codes listed",
    "naics_code_count": "Number of NAICS codes listed",
    "nace2_code_count": "Number of NACE Rev 2 codes listed",
    "anzsic_code_count": "Number of ANZSIC codes listed",
    "isic4_code_count": "Number of ISIC Rev 4 codes listed",
}

# Companion columns written alongside every imputed numeric feature:
# "<name>_missing" always, and "log_<name>" when the feature is log-scaled.
# Maps feature name -> (source label, whether a log column exists).
_numeric_feature_labels = {
    "employees_total": ("Employees Total", True),
    "employees_single_site": ("Employees Single Site", True),
    "revenue_usd": ("Revenue (USD)", True),
    "market_value_usd": ("Market Value (USD)", True),
    "corporate_family_members": ("Corporate Family Members", True),
    "pc_midpoint": ("No. of PC", True),
    "desktops_midpoint": ("No. of Desktops", True),
    "laptops_midpoint": ("No. of Laptops", True),
    "routers_midpoint": ("No. of Routers", True),
    "servers_midpoint": ("No. of Servers", True),
    "storage_devices_midpoint": ("No. of Storage Devices", True),
    "it_budget": ("IT Budget", True),
    "it_spend": ("IT spend", True),
    "latitude": ("Latitude", False),
    "longitude": ("Longitude", False),
}
for _feature, (_label, _has_log) in _numeric_feature_labels.items():
    # setdefault never overwrites a hand-written description above.
    feature_descriptions.setdefault(
        f"{_feature}_missing",
        f"1 if {_label} was missing or non-numeric before imputation",
    )
    if _has_log:
        feature_descriptions.setdefault(
            f"log_{_feature}",
            f"log1p of {_label} (filled, negatives clipped to 0)",
        )


## Optional Column Pruning
Remove near-duplicate or low-signal columns prior to downstream modeling.


In [None]:
# Columns removed from the scaled matrix before modeling. Most entries are a
# raw value or total whose log-scaled or ratio counterpart stays in the matrix.
cols_to_drop = [
    "has_state_abbrev",
    "company_status_binary",
    "credibility_score",
    "missing_ratio_credibility",
    "org_complexity_count",
    "employees_total",
    # NOTE(review): here the log variant is dropped and the raw column kept,
    # the opposite of the other size metrics in this list - confirm intent.
    "log_employees_single_site",
    "revenue_usd",
    "market_value_usd",
    "corporate_family_members",
    "company_age",
    "pc_midpoint",
    "desktops_midpoint",
    "laptops_midpoint",
    "routers_midpoint",
    "servers_midpoint",
    "storage_devices_midpoint",
    "it_assets_total",
    "it_budget",
    "it_spend",
    "it_budget_gap",
]

# errors="ignore" skips names that are not present in the scaled matrix.
df_features_scaled_drop = df_features_scaled.drop(columns=cols_to_drop, errors="ignore")


## Export Artifacts (Optional)
Persist feature matrices and documentation for downstream modeling.


In [None]:
if EXPORT_FEATURES:
    # Make sure both destination folders exist before writing.
    for directory in (OUTPUT_DIR, DOCS_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    features_raw_path = OUTPUT_DIR / "features_for_clustering_raw.csv"
    features_scaled_path = OUTPUT_DIR / "features_for_clustering_scaled.csv"
    row_id_map_path = OUTPUT_DIR / "row_id_map.csv"
    feature_dict_path = DOCS_DIR / "feature_dictionary_member2.csv"

    # Feature dictionary as a two-column table, sorted by feature name.
    feature_dictionary = pd.DataFrame(
        sorted(feature_descriptions.items()),
        columns=["feature", "description"],
    )

    # All artifacts are written without the index; rows line up by position.
    exports = (
        (df_features_raw, features_raw_path),
        (df_features_scaled, features_scaled_path),
        (row_id_map, row_id_map_path),
        (feature_dictionary, feature_dict_path),
    )
    for frame, target_path in exports:
        frame.to_csv(target_path, index=False)


## Visualization
Quick optional overview plots for feature completeness.


In [None]:
if PLOT_FEATURE_OVERVIEW:
    import matplotlib.pyplot as plt

    # df_features_raw has already been imputed (it is asserted NaN-free before
    # scaling), so its own NaN rate is always zero. The "*_missing" indicator
    # columns record which source values were absent; their mean is the
    # missing rate of the corresponding feature.
    missing_flag_cols = [c for c in df_features_raw.columns if str(c).endswith("_missing")]
    missing_rates = (
        df_features_raw[missing_flag_cols].mean().sort_values(ascending=False).head(25)
    )
    if not missing_rates.empty:
        ax = missing_rates.plot(kind="bar", figsize=(12, 4))
        ax.set_title("Top 25 Feature Missing Rates")
        ax.set_ylabel("Missing Rate")
        plt.tight_layout()
        plt.show()
