In [1]:
import os
import numpy as np
import pandas as pd

SEED = 42
rng = np.random.default_rng(SEED)

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..")) if os.path.basename(os.getcwd()) == "notebooks" else os.getcwd()
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)

dim_date = pd.read_csv(os.path.join(RAW_DIR, "dim_date.csv"), parse_dates=["date"])
dim_region = pd.read_csv(os.path.join(RAW_DIR, "dim_region.csv"))
dim_dealer = pd.read_csv(os.path.join(RAW_DIR, "dim_dealer.csv"))
dim_customer = pd.read_csv(os.path.join(RAW_DIR, "dim_customer.csv"))
dim_machine_model = pd.read_csv(os.path.join(RAW_DIR, "dim_machine_model.csv"))
dim_part = pd.read_csv(os.path.join(RAW_DIR, "dim_part.csv"))
dim_failure_code = pd.read_csv(os.path.join(RAW_DIR, "dim_failure_code.csv"))
dim_warehouse = pd.read_csv(os.path.join(RAW_DIR, "dim_warehouse.csv"))

print("Loaded dims ✅")


Loaded dims ✅


In [2]:
N_MACHINES = 1500

# To guarantee 365 days of telematics fits inside our dim_date range (ends 2025-12-31),
# we keep sale_date between 2023-01-01 and 2024-12-31.
sale_start = pd.Timestamp("2023-01-01")
sale_end   = pd.Timestamp("2024-12-31")

sale_dates = pd.to_datetime(rng.integers(sale_start.value//10**9, sale_end.value//10**9, size=N_MACHINES), unit="s").normalize()

machine_ids = [f"MC{str(i).zfill(5)}" for i in range(1, N_MACHINES + 1)]
sale_ids = [f"S{str(i).zfill(6)}" for i in range(1, N_MACHINES + 1)]

# Model distribution (more excavators & loaders than haulers)
model_ids = dim_machine_model["model_id"].tolist()
model_probs = np.array([0.26, 0.18, 0.20, 0.16, 0.12, 0.08])
model_probs = model_probs / model_probs.sum()
chosen_models = rng.choice(model_ids, size=N_MACHINES, p=model_probs)

# Choose dealer and customer
dealer_ids = dim_dealer["dealer_id"].tolist()
chosen_dealers = rng.choice(dealer_ids, size=N_MACHINES)

customer_ids = dim_customer["customer_id"].tolist()
chosen_customers = rng.choice(customer_ids, size=N_MACHINES)

# Warranty logic: mostly 12 months, some 24 (fleets / rentals often negotiate)
warranty_months = rng.choice([12, 24], size=N_MACHINES, p=[0.80, 0.20])

fact_machine_sales = pd.DataFrame({
    "sale_id": sale_ids,
    "machine_id": machine_ids,
    "model_id": chosen_models,
    "sale_date": sale_dates,
    "dealer_id": chosen_dealers,
    "customer_id": chosen_customers,
    "warranty_months": warranty_months
})

# Add sale_price based on model base price + noise
fact_machine_sales = fact_machine_sales.merge(
    dim_machine_model[["model_id", "base_price_inr"]],
    on="model_id",
    how="left"
)

price_noise = rng.normal(loc=0.0, scale=0.06, size=N_MACHINES)  # +/- ~6% variation
fact_machine_sales["sale_price_inr"] = (fact_machine_sales["base_price_inr"] * (1 + price_noise)).round(0).astype(int)
fact_machine_sales.drop(columns=["base_price_inr"], inplace=True)

# Export
fact_machine_sales.to_csv(os.path.join(RAW_DIR, "fact_machine_sales.csv"), index=False)

fact_machine_sales.head()


Unnamed: 0,sale_id,machine_id,model_id,sale_date,dealer_id,customer_id,warranty_months,sale_price_inr
0,S000001,MC00001,M02,2023-03-07,D040,C01601,12,18792402
1,S000002,MC00002,M01,2024-07-18,D031,C01486,12,13565924
2,S000003,MC00003,M03,2024-04-22,D001,C01169,12,15100866
3,S000004,MC00004,M04,2023-11-17,D016,C00613,12,21015031
4,S000005,MC00005,M01,2023-11-13,D012,C00702,12,13510574


In [3]:
assert len(fact_machine_sales) == 1500
assert fact_machine_sales["machine_id"].is_unique
print("fact_machine_sales ✅ rows:", len(fact_machine_sales))


fact_machine_sales ✅ rows: 1500


In [4]:
# Pull customer type and expected hours into the machine table
machine_attr = fact_machine_sales.merge(
    dim_customer[["customer_id", "customer_type", "region_id"]],
    on="customer_id",
    how="left"
).merge(
    dim_machine_model[["model_id", "expected_annual_hours", "category"]],
    on="model_id",
    how="left"
)

machine_attr.head()


Unnamed: 0,sale_id,machine_id,model_id,sale_date,dealer_id,customer_id,warranty_months,sale_price_inr,customer_type,region_id,expected_annual_hours,category
0,S000001,MC00001,M02,2023-03-07,D040,C01601,12,18792402,Fleet,R03,2000,Excavator
1,S000002,MC00002,M01,2024-07-18,D031,C01486,12,13565924,Contractor,R01,1800,Excavator
2,S000003,MC00003,M03,2024-04-22,D001,C01169,12,15100866,Rental,R03,1900,Loader
3,S000004,MC00004,M04,2023-11-17,D016,C00613,12,21015031,Contractor,R06,2100,Loader
4,S000005,MC00005,M01,2023-11-13,D012,C00702,12,13510574,Contractor,R02,1800,Excavator


In [5]:
DAYS_PER_MACHINE = 365

# Repeat each machine row 365 times
rep = machine_attr.loc[machine_attr.index.repeat(DAYS_PER_MACHINE)].reset_index(drop=True)

# Day index 0..364 for each machine
rep["day_index"] = np.tile(np.arange(DAYS_PER_MACHINE), len(machine_attr))

# Snapshot date = sale_date + day_index
rep["snapshot_date"] = pd.to_datetime(rep["sale_date"]) + pd.to_timedelta(rep["day_index"], unit="D")

# Ensure snapshot_date exists in our dim_date range
min_d, max_d = rep["snapshot_date"].min(), rep["snapshot_date"].max()
print("Telematics date range:", min_d.date(), "to", max_d.date())


Telematics date range: 2023-01-01 to 2025-12-29


In [6]:
cust_mult = {
    "Fleet": 1.15,
    "Rental": 1.05,
    "Contractor": 0.95,
    "Government": 0.80
}
rep["cust_multiplier"] = rep["customer_type"].map(cust_mult).fillna(1.0)

base_daily_hours = (rep["expected_annual_hours"] / 365.0) * rep["cust_multiplier"]

# Add daily variation (some days high, some low)
rep["engine_hours"] = np.clip(rng.normal(loc=base_daily_hours, scale=base_daily_hours * 0.35), 0, None)

# Optional: weekends slightly lower usage
rep["is_weekend"] = rep["snapshot_date"].dt.dayofweek.isin([5, 6]).astype(int)
rep.loc[rep["is_weekend"] == 1, "engine_hours"] *= rng.uniform(0.70, 0.95, size=(rep["is_weekend"] == 1).sum())

rep["engine_hours"] = rep["engine_hours"].round(2)


In [7]:
idle_base = {
    "Fleet": 25,
    "Rental": 32,
    "Contractor": 28,
    "Government": 22
}
idle_mu = rep["customer_type"].map(idle_base).fillna(27).astype(float)

rep["idle_pct"] = np.clip(rng.normal(idle_mu, 8.0, size=len(rep)), 5, 60).round(1)


In [8]:
# Create per-machine parameters (consistent across all days)
machine_params = machine_attr[["machine_id", "customer_type"]].copy()

# Some machines start slightly worse/better
machine_params["base_health"] = np.clip(rng.normal(96, 3.0, size=len(machine_params)), 85, 100)

# Customer-type influences decay (fleet/rental degrade faster due to usage intensity)
decay_by_type = {"Fleet": 0.060, "Rental": 0.055, "Contractor": 0.045, "Government": 0.035}
machine_params["daily_decay"] = machine_params["customer_type"].map(decay_by_type).fillna(0.045)

# Add per-machine variation
machine_params["daily_decay"] = np.clip(
    machine_params["daily_decay"] + rng.normal(0, 0.010, size=len(machine_params)),
    0.020, 0.090
)

# Join parameters into rep
rep = rep.merge(machine_params[["machine_id", "base_health", "daily_decay"]], on="machine_id", how="left")

# Age decay accumulates with day_index
age_decay = rep["day_index"] * rep["daily_decay"]  # over 365 days: ~7 to 33 points drop

# Usage stress (engine hours) and idle penalty
usage_stress = (rep["engine_hours"] / rep["engine_hours"].quantile(0.95)).clip(0, 1) * 10.0
idle_penalty = (rep["idle_pct"] - 20).clip(lower=0) * 0.12  # up to ~4.8 points

# Noise
noise = rng.normal(0, 3.0, size=len(rep))

rep["health_score"] = rep["base_health"] - age_decay - usage_stress - idle_penalty + noise
rep["health_score"] = np.clip(rep["health_score"], 5, 100).round(1)




In [9]:
# Lower health -> higher error lambda
error_lambda = np.clip((100 - rep["health_score"]) / 10.0, 0.05, 8.0)
rep["error_code_count"] = rng.poisson(lam=error_lambda).astype(int)

In [10]:
fuel_rate = {
    "Excavator": 18,   # liters/hour (rough style)
    "Loader": 16,
    "Hauler": 28
}
rep["fuel_rate_lph"] = rep["category"].map(fuel_rate).fillna(18).astype(float)

rep["fuel_consumption_liters"] = (rep["engine_hours"] * rep["fuel_rate_lph"] * rng.uniform(0.92, 1.08, size=len(rep))).round(2)


In [11]:
fact_telematics_daily = rep[[
    "snapshot_date",
    "machine_id",
    "engine_hours",
    "idle_pct",
    "fuel_consumption_liters",
    "error_code_count",
    "health_score"
]].copy()

# Export
fact_telematics_daily.to_csv(os.path.join(RAW_DIR, "fact_telematics_daily.csv"), index=False)

fact_telematics_daily.head(), len(fact_telematics_daily)


(  snapshot_date machine_id  engine_hours  idle_pct  fuel_consumption_liters  \
 0    2023-03-07    MC00001          5.12      22.3                    96.97   
 1    2023-03-08    MC00001          4.86      31.8                    91.24   
 2    2023-03-09    MC00001          7.05      40.7                   127.00   
 3    2023-03-10    MC00001          5.51      30.5                   103.53   
 4    2023-03-11    MC00001          4.88      20.6                    83.42   
 
    error_code_count  health_score  
 0                 1          90.5  
 1                 6          80.7  
 2                 1          82.2  
 3                 1          88.9  
 4                 1          87.2  ,
 547500)

In [12]:
assert len(fact_telematics_daily) == 1500 * 365
assert fact_telematics_daily["machine_id"].nunique() == 1500
print("fact_telematics_daily ✅ rows:", len(fact_telematics_daily))


fact_telematics_daily ✅ rows: 547500
