In [1]:
import os
import numpy as np
import pandas as pd

SEED = 42
rng = np.random.default_rng(SEED)

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..")) if os.path.basename(os.getcwd()) == "notebooks" else os.getcwd()
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)

fact_service_job = pd.read_csv(os.path.join(RAW_DIR, "fact_service_job.csv"), parse_dates=["service_date"])
fact_machine_sales = pd.read_csv(os.path.join(RAW_DIR, "fact_machine_sales.csv"), parse_dates=["sale_date"])

dim_dealer = pd.read_csv(os.path.join(RAW_DIR, "dim_dealer.csv"))
dim_part = pd.read_csv(os.path.join(RAW_DIR, "dim_part.csv"))
dim_region = pd.read_csv(os.path.join(RAW_DIR, "dim_region.csv"))

print("Loaded ✅", len(fact_service_job), "service jobs")


Loaded ✅ 11118 service jobs


In [2]:
service = fact_service_job.merge(
    dim_dealer[["dealer_id", "region_id"]],
    on="dealer_id",
    how="left"
)

service["region_id"].isna().mean()


np.float64(0.0)

In [3]:
# Part category list from your dim_part
part_cats = dim_part["part_category"].unique().tolist()
part_cats


['Wear Parts', 'Hydraulics', 'Filters', 'Electrical', 'Engine', 'Transmission']

In [4]:
failure_to_partcats = {
    # Breakdown related
    "Hydraulic":     ["Hydraulics", "Wear Parts", "Filters"],
    "Electrical":    ["Electrical", "Filters"],
    "Engine":        ["Engine", "Filters", "Wear Parts"],
    "Transmission":  ["Transmission", "Wear Parts"],
    "Undercarriage": ["Wear Parts"],
    "Wear":          ["Wear Parts", "Filters"]
}


In [5]:
dim_failure = pd.read_csv(os.path.join(RAW_DIR, "dim_failure_code.csv"))

service = service.merge(
    dim_failure[["failure_code", "failure_category", "severity"]],
    on="failure_code",
    how="left"
)

service[["job_type","failure_category","severity"]].head(10)


Unnamed: 0,job_type,failure_category,severity
0,Breakdown,Wear,Low
1,Breakdown,Wear,Low
2,Preventive,,
3,Inspection,,
4,Breakdown,Wear,Low
5,Breakdown,Wear,Low
6,Preventive,,
7,Inspection,,
8,Breakdown,Undercarriage,Medium
9,Breakdown,Engine,Medium


In [6]:
def num_parts_lines(row):
    if row["job_type"] == "Inspection":
        return int(rng.choice([0,1], p=[0.75, 0.25]))
    if row["job_type"] == "Preventive":
        return int(rng.integers(1, 4))  # 1-3
    # Breakdown
    sev = row["severity"]
    if sev == "High":
        return int(rng.integers(3, 9))  # 3-8
    if sev == "Medium":
        return int(rng.integers(2, 6))  # 2-5
    return int(rng.integers(1, 4))      # 1-3

service["n_lines"] = service.apply(num_parts_lines, axis=1)
service["n_lines"].describe()


count    11118.000000
mean         2.026354
std          1.173578
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          5.000000
Name: n_lines, dtype: float64

In [7]:
lines = service.loc[service.index.repeat(service["n_lines"])].reset_index(drop=True)

# Create a line number per service_job_id (1..n_lines)
lines["line_num"] = lines.groupby("service_job_id").cumcount() + 1

len(lines)


22529

In [8]:
# Build parts lookup by (model_id, part_category) for fast sampling
parts_by_model_cat = (
    dim_part.groupby(["model_id", "part_category"])["part_id"]
    .apply(list)
    .to_dict()
)

# Fallback: parts by category only
parts_by_cat = dim_part.groupby("part_category")["part_id"].apply(list).to_dict()

def choose_part_id(row):
    model = row["model_id"]
    if row["job_type"] == "Preventive":
        # Filters + Wear parts dominate
        cat = rng.choice(["Filters", "Wear Parts"], p=[0.60, 0.40])
    elif row["job_type"] == "Inspection":
        # If any part, usually filter or small wear
        cat = rng.choice(["Filters", "Wear Parts"], p=[0.70, 0.30])
    else:
        # Breakdown based on failure category
        fcat = row["failure_category"]
        choices = failure_to_partcats.get(fcat, ["Wear Parts", "Filters"])
        # Slight bias: first category gets more weight
        probs = np.array([0.55] + [0.45/(len(choices)-1)]*(len(choices)-1)) if len(choices) > 1 else np.array([1.0])
        cat = rng.choice(choices, p=probs)

    # Sample part id from model+category if available; else from category fallback
    pool = parts_by_model_cat.get((model, cat))
    if not pool:
        pool = parts_by_cat.get(cat, dim_part["part_id"].tolist())
    return rng.choice(pool)

lines["part_id"] = lines.apply(choose_part_id, axis=1)
lines[["service_job_id","job_type","failure_category","part_id"]].head()


Unnamed: 0,service_job_id,job_type,failure_category,part_id
0,SJ0000001,Breakdown,Wear,P0147
1,SJ0000002,Breakdown,Wear,P0044
2,SJ0000002,Breakdown,Wear,P0146
3,SJ0000002,Breakdown,Wear,P0155
4,SJ0000003,Preventive,,P0146


In [12]:
def choose_qty(row):
    # Filters usually 1
    # Wear parts can be multiple
    part_cat = dim_part.loc[dim_part["part_id"] == row["part_id"], "part_category"].iloc[0]
    if part_cat == "Filters":
        return int(rng.choice([1,2], p=[0.85,0.15]))
    if part_cat == "Wear Parts":
        return int(rng.choice([1,2,3,4], p=[0.55,0.25,0.15,0.05]))
    # For expensive categories, usually 1
    return 1

lines["quantity"] = lines.apply(choose_qty, axis=1)
lines["quantity"].describe()


count    22529.000000
mean         1.350748
std          0.686155
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: quantity, dtype: float64

In [13]:
lines = lines.merge(
    dim_part[["part_id", "unit_cost_inr", "unit_price_inr", "part_category", "criticality", "lead_time_days"]],
    on="part_id",
    how="left"
)

# Revenue/cost
lines["cost_inr"] = (lines["quantity"] * lines["unit_cost_inr"]).round(2)
lines["revenue_inr"] = (lines["quantity"] * lines["unit_price_inr"]).round(2)


In [14]:
def emergency_flag(row):
    if row["job_type"] != "Breakdown":
        return 0
    p = 0.03
    if row["criticality"] == "High":
        p += 0.08
    if row["severity"] == "High":
        p += 0.05
    if row["lead_time_days"] >= 12:
        p += 0.04
    return int(rng.random() < p)

lines["is_emergency_order"] = lines.apply(emergency_flag, axis=1)
lines["is_emergency_order"].mean()


np.float64(0.033334812907807716)

In [15]:
lines = lines.sort_values(["service_date","service_job_id","line_num"]).reset_index(drop=True)
lines["parts_line_id"] = ["PL" + str(i).zfill(8) for i in range(1, len(lines) + 1)]

fact_parts_sales = lines[[
    "parts_line_id",
    "service_job_id",
    "part_id",
    "quantity",
    "service_date",
    "dealer_id",
    "region_id",
    "revenue_inr",
    "cost_inr",
    "is_emergency_order"
]].rename(columns={"service_date": "sale_date"})

fact_parts_sales.to_csv(os.path.join(RAW_DIR, "fact_parts_sales.csv"), index=False)

print("fact_parts_sales rows:", len(fact_parts_sales))
fact_parts_sales.head()


fact_parts_sales rows: 22529


Unnamed: 0,parts_line_id,service_job_id,part_id,quantity,sale_date,dealer_id,region_id,revenue_inr,cost_inr,is_emergency_order
0,PL00000001,SJ0000001,P0147,2,2023-01-06,D039,R04,3018.2,2076.44,0
1,PL00000002,SJ0000002,P0044,1,2023-01-13,D020,R02,4049.03,2404.81,0
2,PL00000003,SJ0000002,P0146,1,2023-01-13,D020,R02,785.9,576.39,0
3,PL00000004,SJ0000002,P0155,1,2023-01-13,D020,R02,1396.53,866.67,0
4,PL00000005,SJ0000003,P0146,1,2023-01-21,D028,R03,785.9,576.39,0


In [16]:
assert fact_parts_sales["parts_line_id"].is_unique
assert fact_parts_sales["service_job_id"].nunique() <= fact_service_job["service_job_id"].nunique()
print("✅ fact_parts_sales created")


✅ fact_parts_sales created


In [17]:
print("Revenue total (INR):", fact_parts_sales["revenue_inr"].sum())
print("Cost total (INR):", fact_parts_sales["cost_inr"].sum())
print("Gross Margin %:",
      (fact_parts_sales["revenue_inr"].sum() - fact_parts_sales["cost_inr"].sum())
      / fact_parts_sales["revenue_inr"].sum())


Revenue total (INR): 195574404.83
Cost total (INR): 132099027.92
Gross Margin %: 0.3245587118885776
