In [None]:
import os
import numpy as np
import pandas as pd
from faker import Faker
from datetime import date

# -------------------------
# Config
# -------------------------
# A single seed feeds both random sources used in this notebook:
# NumPy's global generator and the Faker instance.
SEED = 42
np.random.seed(SEED)
fake = Faker("en_IN")
fake.seed_instance(SEED)

# If the kernel was started inside a folder named "notebooks", the project
# root is its parent; otherwise the current working directory is the root.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "notebooks":
    PROJECT_ROOT = os.path.abspath(os.path.join(_cwd, ".."))
else:
    PROJECT_ROOT = _cwd

# All generated CSVs are written below <project root>/data/raw.
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)

# Inclusive bounds used when building the date range for dim_date.
START_DATE = "2023-01-01"
END_DATE   = "2025-12-31"

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw dir: {RAW_DIR}")


Project root: j:\PROJECTS\volvo-aftersales-analytics
Raw dir: j:\PROJECTS\volvo-aftersales-analytics\data\raw


In [3]:
# Calendar dimension: one row per day from START_DATE to END_DATE (inclusive).
dim_date = pd.DataFrame({
    "date": pd.date_range(start=START_DATE, end=END_DATE, freq="D")
})
# Integer surrogate key in YYYYMMDD form, e.g. 20230101.
dim_date["date_key"] = dim_date["date"].dt.strftime("%Y%m%d").astype(int)
dim_date["year"] = dim_date["date"].dt.year
dim_date["month"] = dim_date["date"].dt.month
dim_date["month_name"] = dim_date["date"].dt.strftime("%b")
dim_date["quarter"] = dim_date["date"].dt.quarter
# ISO week number. Days at a year edge can carry the week number of the
# adjacent ISO year (2023-01-01 is ISO week 52), so do not group on
# (year, week) without accounting for that.
dim_date["week"] = dim_date["date"].dt.isocalendar().week.astype(int)
dim_date["day_name"] = dim_date["date"].dt.strftime("%a")
# Derive the weekend flag from the numeric weekday (Monday=0 ... Sunday=6)
# rather than by matching the text of day_name, so the flag cannot silently
# become all zeros if the day labels are ever rendered differently.
dim_date["is_weekend"] = (dim_date["date"].dt.dayofweek >= 5).astype(int)

dim_date.to_csv(os.path.join(RAW_DIR, "dim_date.csv"), index=False)
dim_date.head()


Unnamed: 0,date,date_key,year,month,month_name,quarter,week,day_name,is_weekend
0,2023-01-01,20230101,2023,1,Jan,1,52,Sun,1
1,2023-01-02,20230102,2023,1,Jan,1,1,Mon,0
2,2023-01-03,20230103,2023,1,Jan,1,1,Tue,0
3,2023-01-04,20230104,2023,1,Jan,1,1,Wed,0
4,2023-01-05,20230105,2023,1,Jan,1,1,Thu,0


In [4]:
# Region dimension. Each record is (region_id, region_name, demand_multiplier).
regions = [
    ("R01", "North",     1.00),
    ("R02", "South",     1.15),
    ("R03", "West",      1.10),
    ("R04", "East",      0.95),
    ("R05", "Central",   1.05),
    ("R06", "Northeast", 0.85),
]

region_columns = ["region_id", "region_name", "demand_multiplier"]
dim_region = pd.DataFrame.from_records(regions, columns=region_columns)

region_csv_path = os.path.join(RAW_DIR, "dim_region.csv")
dim_region.to_csv(region_csv_path, index=False)
dim_region


Unnamed: 0,region_id,region_name,demand_multiplier
0,R01,North,1.0
1,R02,South,1.15
2,R03,West,1.1
3,R04,East,0.95
4,R05,Central,1.05
5,R06,Northeast,0.85


In [5]:
# Dealer dimension: 40 dealers with ids D001..D040.
n_dealers = 40
dealer_ids = [f"D{i:03d}" for i in range(1, n_dealers + 1)]

# Region assignment (weighted). Weights are listed in the row order of
# dim_region and normalised so they sum to 1.
region_choices = list(dim_region["region_id"])
_raw_region_weights = np.array([0.18, 0.20, 0.18, 0.16, 0.18, 0.10])  # South/West/North/Central heavier
region_probs = _raw_region_weights / _raw_region_weights.sum()

# Dealer tier distribution
tiers = ["A", "B", "C"]
tier_probs = [0.25, 0.50, 0.25]

# The two NumPy draws happen in a fixed order (region first, then tier) so the
# seeded output stays reproducible.
dealer_names = [fake.company() + " Motors" for _ in range(n_dealers)]
dealer_regions = np.random.choice(region_choices, size=n_dealers, p=region_probs)
dealer_tiers = np.random.choice(tiers, size=n_dealers, p=tier_probs)

dim_dealer = pd.DataFrame({
    "dealer_id": dealer_ids,
    "dealer_name": dealer_names,
    "region_id": dealer_regions,
    "tier": dealer_tiers,
})

dealer_csv_path = os.path.join(RAW_DIR, "dim_dealer.csv")
dim_dealer.to_csv(dealer_csv_path, index=False)
dim_dealer.head()


Unnamed: 0,dealer_id,dealer_name,region_id,tier
0,D001,"Choudhury, Bakshi and Maharaj Motors",R02,A
1,D002,Kapoor and Sons Motors,R06,B
2,D003,"Chaudry, Chahal and Sami Motors",R05,A
3,D004,Balan Inc Motors,R04,C
4,D005,Kaul PLC Motors,R01,B


In [6]:
# Customer dimension: 2500 customers with ids C00001..C02500.
n_customers = 2500
customer_ids = [f"C{i:05d}" for i in range(1, n_customers + 1)]

# Sampling weights for customer type and company size band.
customer_types = ["Fleet", "Contractor", "Government", "Rental"]
customer_type_probs = [0.35, 0.40, 0.10, 0.15]

size_bands = ["Small", "Medium", "Large"]
size_probs = [0.55, 0.35, 0.10]

# NumPy draws are made in a fixed order (type, region, size band) so the
# seeded output stays reproducible. Regions reuse the dealer region weights.
customer_names = [fake.company() for _ in range(n_customers)]
sampled_types = np.random.choice(customer_types, size=n_customers, p=customer_type_probs)
sampled_regions = np.random.choice(region_choices, size=n_customers, p=region_probs)
sampled_sizes = np.random.choice(size_bands, size=n_customers, p=size_probs)

dim_customer = pd.DataFrame({
    "customer_id": customer_ids,
    "customer_name": customer_names,
    "customer_type": sampled_types,
    "region_id": sampled_regions,
    "size_band": sampled_sizes,
})

customer_csv_path = os.path.join(RAW_DIR, "dim_customer.csv")
dim_customer.to_csv(customer_csv_path, index=False)
dim_customer.head()


Unnamed: 0,customer_id,customer_name,customer_type,region_id,size_band
0,C00001,"Lall, Tella and Gokhale",Rental,R01,Small
1,C00002,"Cherian, Misra and Tripathi",Contractor,R05,Small
2,C00003,"Issac, Dua and Prakash",Fleet,R03,Medium
3,C00004,Zacharia PLC,Fleet,R03,Small
4,C00005,Naik Group,Fleet,R04,Small


In [7]:
# Machine model dimension. Each record is
# (model_id, model_name, category, base_price_inr, expected_annual_hours).
models = [
    ("M01", "EC220 Excavator", "Excavator", 13500000, 1800),
    ("M02", "EC300 Excavator", "Excavator", 18500000, 2000),
    ("M03", "L120 Wheel Loader", "Loader", 16000000, 1900),
    ("M04", "L150 Wheel Loader", "Loader", 21000000, 2100),
    ("M05", "A30 Articulated Hauler", "Hauler", 35000000, 2200),
    ("M06", "A40 Articulated Hauler", "Hauler", 42000000, 2400),
]

model_columns = [
    "model_id",
    "model_name",
    "category",
    "base_price_inr",
    "expected_annual_hours",
]
dim_machine_model = pd.DataFrame.from_records(models, columns=model_columns)

model_csv_path = os.path.join(RAW_DIR, "dim_machine_model.csv")
dim_machine_model.to_csv(model_csv_path, index=False)
dim_machine_model


Unnamed: 0,model_id,model_name,category,base_price_inr,expected_annual_hours
0,M01,EC220 Excavator,Excavator,13500000,1800
1,M02,EC300 Excavator,Excavator,18500000,2000
2,M03,L120 Wheel Loader,Loader,16000000,1900
3,M04,L150 Wheel Loader,Loader,21000000,2100
4,M05,A30 Articulated Hauler,Hauler,35000000,2200
5,M06,A40 Articulated Hauler,Hauler,42000000,2400


In [8]:
# Failure code dimension. Each record is
# (failure_code, failure_name, failure_category, severity, typical_downtime_days).
failure_rows = [
    ("F001", "Hydraulic Leak",          "Hydraulic",     "Medium", 3),
    ("F002", "Hydraulic Pump Failure",  "Hydraulic",     "High",   7),
    ("F003", "Electrical Sensor Fault", "Electrical",    "Low",    1),
    ("F004", "ECU / Controller Issue",  "Electrical",    "High",   6),
    ("F005", "Engine Overheating",      "Engine",        "High",   8),
    ("F006", "Fuel System Issue",       "Engine",        "Medium", 4),
    ("F007", "Transmission Slippage",   "Transmission",  "High",   9),
    ("F008", "Undercarriage Wear",      "Undercarriage", "Medium", 5),
    ("F009", "Brake Wear",              "Wear",          "Low",    2),
    ("F010", "Air Filter Clogging",     "Wear",          "Low",    1),
]

failure_columns = [
    "failure_code",
    "failure_name",
    "failure_category",
    "severity",
    "typical_downtime_days",
]
dim_failure_code = pd.DataFrame.from_records(failure_rows, columns=failure_columns)

failure_csv_path = os.path.join(RAW_DIR, "dim_failure_code.csv")
dim_failure_code.to_csv(failure_csv_path, index=False)
dim_failure_code


Unnamed: 0,failure_code,failure_name,failure_category,severity,typical_downtime_days
0,F001,Hydraulic Leak,Hydraulic,Medium,3
1,F002,Hydraulic Pump Failure,Hydraulic,High,7
2,F003,Electrical Sensor Fault,Electrical,Low,1
3,F004,ECU / Controller Issue,Electrical,High,6
4,F005,Engine Overheating,Engine,High,8
5,F006,Fuel System Issue,Engine,Medium,4
6,F007,Transmission Slippage,Transmission,High,9
7,F008,Undercarriage Wear,Undercarriage,Medium,5
8,F009,Brake Wear,Wear,Low,2
9,F010,Air Filter Clogging,Wear,Low,1


In [9]:
# Warehouse dimension. Each record is (warehouse_id, warehouse_city, region_id).
warehouses = [
    ("W01", "Bangalore", "R02"),
    ("W02", "Delhi NCR", "R01"),
    ("W03", "Mumbai",    "R03"),
    ("W04", "Kolkata",   "R04"),
]

warehouse_columns = ["warehouse_id", "warehouse_city", "region_id"]
dim_warehouse = pd.DataFrame.from_records(warehouses, columns=warehouse_columns)

warehouse_csv_path = os.path.join(RAW_DIR, "dim_warehouse.csv")
dim_warehouse.to_csv(warehouse_csv_path, index=False)
dim_warehouse


Unnamed: 0,warehouse_id,warehouse_city,region_id
0,W01,Bangalore,R02
1,W02,Delhi NCR,R01
2,W03,Mumbai,R03
3,W04,Kolkata,R04


In [10]:
# Part dimension setup: 200 parts with ids P0001..P0200.
n_parts = 200
part_ids = [f"P{i:04d}" for i in range(1, n_parts + 1)]

# Sampling weight per part category; the lists below keep this order.
_category_weights = {
    "Filters": 0.22,
    "Hydraulics": 0.20,
    "Electrical": 0.16,
    "Wear Parts": 0.22,
    "Engine": 0.12,
    "Transmission": 0.08,
}
part_categories = list(_category_weights.keys())
cat_probs = list(_category_weights.values())

# Sampling weight per criticality level.
_criticality_weights = {"Low": 0.55, "Medium": 0.30, "High": 0.15}
criticality_levels = list(_criticality_weights.keys())
crit_probs = list(_criticality_weights.values())

# Parts are attached to one of the existing machine models.
model_choices = list(dim_machine_model["model_id"])

def sample_lead_time(cat, crit):
    """Draw a noisy lead time, in whole days, for one part.

    The expected lead time is a per-category base plus a per-criticality
    bump. Gaussian noise (standard deviation 2) is added using NumPy's global
    random state, the result is limited to the range 1..30 and then truncated
    to an int.

    Parameters
    ----------
    cat : str
        Part category; must be one of the keys of the category table below.
    crit : str
        Criticality level: "Low", "Medium" or "High".

    Returns
    -------
    int
        Lead time in days, between 1 and 30 inclusive.

    Raises
    ------
    KeyError
        If ``cat`` or ``crit`` is not a known key.
    """
    # Base lead time by category (days)
    days_by_category = {
        "Filters": 3,
        "Wear Parts": 5,
        "Electrical": 10,
        "Hydraulics": 12,
        "Engine": 14,
        "Transmission": 16,
    }
    # Critical parts tend to take longer (and should be stocked)
    extra_days_by_criticality = {"Low": 0, "Medium": 2, "High": 4}

    expected_days = days_by_category[cat] + extra_days_by_criticality[crit]

    # Noise, then bound to [1, 30]; int() truncates the fractional part.
    noisy_days = np.random.normal(expected_days, 2)
    bounded_days = min(max(noisy_days, 1), 30)
    return int(bounded_days)

# Category and criticality for every part are drawn up front, in this order.
parts_cat = np.random.choice(part_categories, size=n_parts, p=cat_probs)
parts_crit = np.random.choice(criticality_levels, size=n_parts, p=crit_probs)

# Cost ranges by category (INR): the value is the centre of a normal
# distribution whose standard deviation is 25% of that centre.
mean_cost_by_category = {
    "Filters": 800,
    "Wear Parts": 2500,
    "Electrical": 12000,
    "Hydraulics": 18000,
    "Engine": 45000,
    "Transmission": 60000,
}

unit_cost, unit_price, lead_times = [], [], []

# Per part, the random draws are taken in a fixed order (cost, markup, lead
# time) so the seeded output is reproducible.
for category, criticality in zip(parts_cat, parts_crit):
    mean_cost = mean_cost_by_category[category]
    drawn_cost = np.random.normal(mean_cost, mean_cost * 0.25)
    cost = float(max(drawn_cost, 200))  # never below INR 200
    markup = np.random.uniform(1.25, 1.70)  # 25% to 70% markup (simplified)

    unit_cost.append(round(cost, 2))
    unit_price.append(round(cost * markup, 2))
    lead_times.append(sample_lead_time(category, criticality))

# Part names: two title-cased Faker words followed by a randomly chosen suffix.
name_suffixes = ['Kit', 'Assembly', 'Unit', 'Set', 'Module', 'Filter', 'Pump', 'Sensor']
part_names = [
    " ".join([fake.word().title(), fake.word().title(), np.random.choice(name_suffixes)])
    for _ in range(n_parts)
]

# Model ids are drawn after the names, uniformly over the available models.
part_model_ids = np.random.choice(model_choices, size=n_parts)

dim_part = pd.DataFrame({
    "part_id": part_ids,
    "part_name": part_names,
    "part_category": parts_cat,
    "model_id": part_model_ids,
    "unit_cost_inr": unit_cost,
    "unit_price_inr": unit_price,
    "lead_time_days": lead_times,
    "criticality": parts_crit,
})

part_csv_path = os.path.join(RAW_DIR, "dim_part.csv")
dim_part.to_csv(part_csv_path, index=False)
dim_part.head()


Unnamed: 0,part_id,part_name,part_category,model_id,unit_cost_inr,unit_price_inr,lead_time_days,criticality
0,P0001,Nisi Neque Set,Wear Parts,M03,2128.62,3320.07,3,Low
1,P0002,Beatae Consectetur Kit,Hydraulics,M04,16566.23,26537.45,9,Low
2,P0003,Assumenda Nisi Assembly,Filters,M06,751.71,1207.62,9,High
3,P0004,Delectus Pariatur Filter,Filters,M04,890.29,1204.38,4,Low
4,P0005,Error Occaecati Sensor,Electrical,M02,9212.59,11999.97,11,High


In [13]:
# Sanity checks over every generated dimension table.

# Each dimension paired with the column that must identify a row uniquely.
dimension_keys = {
    "dim_date": (dim_date, "date_key"),
    "dim_region": (dim_region, "region_id"),
    "dim_dealer": (dim_dealer, "dealer_id"),
    "dim_customer": (dim_customer, "customer_id"),
    "dim_machine_model": (dim_machine_model, "model_id"),
    "dim_failure_code": (dim_failure_code, "failure_code"),
    "dim_warehouse": (dim_warehouse, "warehouse_id"),
    "dim_part": (dim_part, "part_id"),
}

print("Rows:")
for table_name, (frame, key_column) in dimension_keys.items():
    print(f"{table_name}:", len(frame))

# Check uniqueness (and absence of missing values) of every primary key,
# not just the randomly generated tables.
for table_name, (frame, key_column) in dimension_keys.items():
    assert frame[key_column].notna().all(), f"{table_name}.{key_column} has missing values"
    assert frame[key_column].is_unique, f"{table_name}.{key_column} contains duplicates"

# Check referential integrity: every foreign key value must exist in the
# parent dimension, otherwise downstream joins would silently drop rows.
foreign_keys = [
    ("dim_dealer", dim_dealer, "region_id", dim_region, "region_id"),
    ("dim_customer", dim_customer, "region_id", dim_region, "region_id"),
    ("dim_warehouse", dim_warehouse, "region_id", dim_region, "region_id"),
    ("dim_part", dim_part, "model_id", dim_machine_model, "model_id"),
]
for child_name, child_frame, fk_column, parent_frame, pk_column in foreign_keys:
    orphan_values = set(child_frame[fk_column]) - set(parent_frame[pk_column])
    assert not orphan_values, (
        f"{child_name}.{fk_column} has values missing from the parent table: "
        f"{sorted(orphan_values)}"
    )

print("-------------checks passed------------------")


Rows:
dim_date: 1096
dim_region: 6
dim_dealer: 40
dim_customer: 2500
dim_machine_model: 6
dim_failure_code: 10
dim_warehouse: 4
dim_part: 200
-------------checks passed------------------
