In [None]:
# Feature Engineering (Signal Construction)

This notebook is responsible for converting raw company attributes into
meaningful numerical signals for clustering.

⚠️ Scope:
- No data cleaning or missing-value decisions are made here
- All logic assumes a future `clean_base.csv` produced by Member 1
- Raw data is used temporarily for feature design only


In [None]:
import pandas as pd
import numpy as np

# Show up to 200 columns whenever a frame is displayed; the engineered dataset is wide.
pd.options.display.max_columns = 200

In [None]:
# TEMPORARY STEP
# The raw export is read here purely to inspect its column layout and to
# prototype the feature transformations in the cells that follow.
# It is meant to be swapped for Member 1's clean_base.csv once that exists.

RAW_DATA_PATH = '../data/champions_group_data.csv'
df_raw = pd.read_csv(RAW_DATA_PATH)

# Show the frame's dimensions together with its column names
(df_raw.shape, df_raw.columns.tolist())

In [10]:
# Transparency-related signals
# Every flag is 1 when the source field is non-null and 0 otherwise.
transparency_sources = {
    # basic online presence and legitimacy
    "has_website": "Website",
    # a missing phone number may indicate low transparency or an early-stage entity
    "has_phone": "Phone Number",
    # proxy for operational traceability
    "has_address": "Address Line 1",
}

for flag_name, source_col in transparency_sources.items():
    df_raw[flag_name] = df_raw[source_col].notna().astype(int)

In [None]:
# Ownership / structural signals
# These flags describe organisational complexity: complex ownership structures
# often correlate with higher monitoring and compliance risk in real-world
# company analysis.
ownership_flag_names = ("has_parent", "has_global_ultimate", "has_domestic_ultimate")
ownership_source_cols = (
    "Parent Company",             # a parent company is recorded
    "Global Ultimate Company",    # a global ultimate owner is recorded (suggests a multinational structure)
    "Domestic Ultimate Company",  # a domestic ultimate owner is recorded
)

# 1 when the ownership field is non-null, 0 otherwise
for flag_name, source_col in zip(ownership_flag_names, ownership_source_cols):
    df_raw[flag_name] = df_raw[source_col].notna().astype(int)

In [19]:
# Organisational complexity signal
# Idea: companies that share the same "Global Ultimate Company" belong to the same corporate group.
# Organisational complexity is defined as the size of that group, i.e. the number of rows in the
# dataset that roll up to the same global ultimate.

# Note:
# A missing global ultimate must not be treated as one giant group, so rows without one
# get a count of 0. Blank or whitespace-only names are handled the same way: after
# stripping they would all collapse to "" and form exactly such an artificial group.
# Names are standardised (whitespace stripped, upper-cased) to reduce duplicates caused by formatting.

# 1 Build a standardised grouping key (prevents "ABC LTD" vs " ABC LTD " being treated differently)
normalised_key = (
    df_raw["Global Ultimate Company"]
    .astype("string")          # consistent string dtype; missing values stay missing
    .str.strip()               # remove leading/trailing spaces
    .str.upper()               # normalise casing
)
# Names that are empty after stripping carry no identity, so mark them as missing.
# fillna(False) keeps already-missing entries out of the mask (they are missing anyway).
df_raw["global_ultimate_key"] = normalised_key.mask(normalised_key.eq("").fillna(False))

# 2 Count how many rows share the same global ultimate key
# transform("size") returns a series aligned with the original rows (same length as df_raw)
group_sizes = df_raw.groupby("global_ultimate_key")["global_ultimate_key"].transform("size")

# 3 Assign the organisational complexity count
# Rows without a usable key get 0 so "unknown ultimate" never becomes a group of its own
df_raw["org_complexity_count"] = np.where(
    df_raw["global_ultimate_key"].notna(),
    group_sizes,
    0
)

# Sanity check: smallest group size (0 means at least one row has no usable global ultimate)
print(df_raw["org_complexity_count"].min())


1


In [12]:
# Scale-related signals
# Employee and revenue figures are heavily skewed; taking logs keeps the largest
# firms from dominating distance-based clustering.
# np.log1p computes ln(1 + x), so a raw value of zero maps to 0.
for log_col, raw_col in (
    ("log_employees_total", "Employees Total"),
    ("log_revenue_usd", "Revenue (USD)"),
):
    df_raw[log_col] = np.log1p(df_raw[raw_col])

In [41]:
# Operational footprint signals
# Some IT-related columns are stored as ranges (e.g. "1 to 10").
# These are converted into numeric midpoints to enable comparison.

def range_to_midpoint(value):
    """Convert a banded count such as "1 to 10" into a single number.

    Parameters
    ----------
    value : object
        A raw cell value. Supported forms:
        - a range string "<low> to <high>" (any casing of "to", optional
          thousands separators) -> midpoint of the two bounds
        - a plain number, either numeric or as a string -> that number
        - anything else (None, NaN, free text, malformed ranges) -> NaN

    Returns
    -------
    float
        The midpoint or value as a float, or NaN when nothing numeric can be
        extracted. The function never raises on unparsable text, so a single
        odd cell cannot abort a whole ``Series.apply``.
    """
    # Cells that are already numeric are passed through (NaN stays NaN).
    # bool is excluded because True/False are not counts.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    # Drop thousands separators ("1,000") which float() cannot parse, and
    # lower-case so "1 To 10" splits the same way as "1 to 10".
    bounds = value.replace(",", "").lower().split("to")
    if len(bounds) > 2:
        # More than one "to": not a simple low/high range.
        return np.nan
    try:
        # float() tolerates surrounding whitespace; words that merely contain
        # "to" (e.g. "photo") fail here and fall through to NaN.
        numbers = [float(part) for part in bounds]
    except ValueError:
        return np.nan
    # One number -> itself; two numbers -> their midpoint.
    return sum(numbers) / len(numbers)

# Turn every banded IT-asset column into a numeric midpoint column
midpoint_sources = {
    "servers_midpoint": "No. of Servers",
    "desktops_midpoint": "No. of Desktops",
    "routers_midpoint": "No. of Routers",
    "storage_devices_midpoint": "No. of Storage Devices",
    "pc_midpoint": "No. of PC",
    "laptops_midpoint": "No. of Laptops",
}
for midpoint_col, banded_col in midpoint_sources.items():
    df_raw[midpoint_col] = df_raw[banded_col].apply(range_to_midpoint)

# Total asset count across all six midpoint columns.
# Missing midpoints count as 0 in the sum, so a row with no IT data at all totals 0.
it_assets = [
    "desktops_midpoint",
    "laptops_midpoint",
    "pc_midpoint",
    "servers_midpoint",
    "routers_midpoint",
    "storage_devices_midpoint",
]
it_assets_filled = df_raw[it_assets].fillna(0)
df_raw["it_assets_total"] = it_assets_filled.sum(axis=1)

# Log-transform so firms with very large counts do not dominate
df_raw["log_it_assets_total"] = np.log1p(df_raw["it_assets_total"])


In [None]:
# -------------------------------
# IT Budget / Spend signals
# it_spend_rate = IT Spend / IT Budget (efficiency / execution)
# it_budget_gap = IT Budget - IT Spend (underspend/overspend)
# -------------------------------

def _coerce_amount(series):
    """Return `series` as numbers, with unparsable entries turned into NaN.

    pd.to_numeric alone does not understand thousands separators ("1,000"
    would be coerced to NaN), so commas and surrounding whitespace are removed
    from string cells first. Columns that are already numeric are passed to
    pd.to_numeric untouched.
    """
    if not pd.api.types.is_numeric_dtype(series):
        series = series.map(
            lambda cell: cell.replace(",", "").strip() if isinstance(cell, str) else cell
        )
    return pd.to_numeric(series, errors="coerce")


# Coerce to numeric safely (handles strings and comma separators; non-parsable -> NaN)
df_raw["it_budget"] = _coerce_amount(df_raw["IT Budget"])
df_raw["it_spend"] = _coerce_amount(df_raw["IT spend"])

# Log transforms for clustering stability
df_raw["log_it_budget"] = np.log1p(df_raw["it_budget"])
df_raw["log_it_spend"] = np.log1p(df_raw["it_spend"])

# Spend rate: how much of the budget is actually spent.
# The rate is only defined for a positive budget. Dividing by a zero budget
# would give inf, which the clip below would silently report as the cap (3),
# so rows with a zero or negative budget are left as NaN instead.
positive_budget = df_raw["it_budget"].where(df_raw["it_budget"] > 0)
df_raw["it_spend_rate"] = df_raw["it_spend"] / positive_budget

# Keep the spend rate bounded to [0, 3] so extreme overspend does not dominate
df_raw["it_spend_rate"] = df_raw["it_spend_rate"].clip(lower=0, upper=3)

# Budget gap: positive = under-spending, negative = overspending
df_raw["it_budget_gap"] = df_raw["it_budget"] - df_raw["it_spend"]
# The log is taken on the magnitude only; the sign of the gap is not kept in this column
df_raw["log_abs_it_budget_gap"] = np.log1p(df_raw["it_budget_gap"].abs())

In [42]:
# IT intensity per employee (size-normalised)

# Coerce head-count to numeric; entries that cannot be parsed become NaN
employees = pd.to_numeric(df_raw["Employees Total"], errors="coerce")

# The ratios are only meaningful for a positive head-count; every other row stays NaN
mask_emp = employees > 0

df_raw["it_assets_per_employee"] = (df_raw["it_assets_total"] / employees).where(mask_emp)
df_raw["it_spend_per_employee"] = (df_raw["it_spend"] / employees).where(mask_emp)

# Log versions of both ratios to tame their heavy tails
for ratio_col in ("it_assets_per_employee", "it_spend_per_employee"):
    df_raw[f"log_{ratio_col}"] = np.log1p(df_raw[ratio_col])

In [47]:
# -------------------------------
# IT composition ratios
# Server-centric vs endpoint-centric ratio.
# This helps separate infrastructure-heavy orgs (servers/storage/routers)
# from office/endpoint-heavy orgs (PC/laptops/desktops).
# -------------------------------

# Missing midpoints count as 0 in both totals
endpoint_total = df_raw[["desktops_midpoint", "laptops_midpoint", "pc_midpoint"]].fillna(0).sum(axis=1)
infra_total = df_raw[["servers_midpoint", "storage_devices_midpoint", "routers_midpoint"]].fillna(0).sum(axis=1)

# The ratio is undefined when there are no endpoints. Adding a tiny epsilon to
# the denominator would turn those rows into ratios around 1e9 (log1p ~ 22),
# an artifact that would dominate distance-based clustering, so they are left
# as NaN instead.
endpoint_positive = endpoint_total.where(endpoint_total > 0)

df_raw["infra_to_endpoint_ratio"] = infra_total / endpoint_positive
df_raw["log_infra_to_endpoint_ratio"] = np.log1p(df_raw["infra_to_endpoint_ratio"])

In [49]:
# IT reporting completeness
# Counts, per row, how many of the raw IT fields carry a value.

it_raw_cols = [
    "No. of Desktops",
    "No. of Laptops",
    "No. of Routers",
    "No. of Servers",
    "No. of Storage Devices",
    "No. of PC",
    "IT Budget",
    "IT spend",
]

# A field counts as reported when it is non-missing and not blank after trimming
it_as_text = df_raw[it_raw_cols].apply(lambda col: col.astype("string").str.strip())
is_reported = it_as_text.notna() & it_as_text.ne("")
df_raw["it_reporting_score"] = is_reported.sum(axis=1)

# Share of IT fields reported (0..1) and its complement
n_it_fields = len(it_raw_cols)
df_raw["it_reporting_score_norm"] = df_raw["it_reporting_score"] / n_it_fields
df_raw["it_missing_ratio"] = 1 - df_raw["it_reporting_score_norm"]

In [None]:
# Credible + IT-heavy interaction
# Sometimes clusters form like:
# high credibility + high IT footprint (mature corp)
# low credibility + low IT footprint (small/opaque)
# high credibility + low IT footprint (traditional industries)


# Interaction features (optional)
df_raw["cred_x_log_it_assets"] = df_raw["credibility_score_norm"] * df_raw["log_it_assets_total"]
df_raw["cred_x_log_it_spend"]  = df_raw["credibility_score_norm"] * df_raw["log_it_spend"]

df_raw.head(10)

In [None]:
# Credibility/missingness 
df_raw["has_ticker"] = df_raw["Ticker"].notna().astype(int)
df_raw["has_registration_number"] = df_raw["Registration Number"].notna().astype(int)
df_raw["has_company_description"] = df_raw["Company Description"].notna().astype(int)
status_col = "Company Status (Active/Inactive)"
df_raw["company_status_binary"] = (
    df_raw[status_col]
    .astype("string")
    .str.strip()
    .str.lower()
    .map({"active": 1, "inactive": 0})
)

credibility_flag_cols = [
    "has_website",
    "has_address",
    "has_phone",
    "has_ticker",
    "has_parent",
    "has_global_ultimate",
    "has_domestic_ultimate",
    "has_registration_number",
    "has_company_description",
    "company_status_binary"]

# credibility score = how many verifiability fields are present
df_raw["credibility_score"] = df_raw[credibility_flag_cols].sum(axis=1)

# normalized to 0..1
df_raw["credibility_score_norm"] = df_raw["credibility_score"] / len(credibility_flag_cols)

# missing_ratio = inverse of credibility
df_raw["missing_ratio"] = 1 - df_raw["credibility_score_norm"]
df_raw.head(10)




In [16]:
# Encoding categorical variables
# The low-cardinality categoricals below are one-hot encoded so that
# distance-based clustering treats each category as distinct.
# Every category level keeps its own indicator column (no level is dropped).
categorical_cols = ["Region", "Entity Type", "Ownership Type"]

df_encoded = pd.get_dummies(df_raw, columns=categorical_cols)