In [1]:
import os
import numpy as np
import pandas as pd

# Reproducibility: one seeded generator is shared by the random draws in this notebook.
SEED = 42
rng = np.random.default_rng(SEED)

# When the kernel runs from a directory named "notebooks", the project root is
# its parent; otherwise the current working directory is treated as the root.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "notebooks":
    PROJECT_ROOT = os.path.abspath(os.path.join(_cwd, ".."))
else:
    PROJECT_ROOT = _cwd
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)


def _raw_path(filename):
    """Return the path of `filename` inside RAW_DIR."""
    return os.path.join(RAW_DIR, filename)


# Fact tables: the date column of each one is parsed to datetime on load.
fact_service = pd.read_csv(_raw_path("fact_service_job.csv"), parse_dates=["service_date"])
fact_parts = pd.read_csv(_raw_path("fact_parts_sales.csv"), parse_dates=["sale_date"])
fact_inventory = pd.read_csv(_raw_path("fact_inventory_daily.csv"), parse_dates=["snapshot_date"])
fact_machine_sales = pd.read_csv(_raw_path("fact_machine_sales.csv"), parse_dates=["sale_date"])

# Dimension tables: loaded as-is.
dim_customer = pd.read_csv(_raw_path("dim_customer.csv"))
dim_dealer = pd.read_csv(_raw_path("dim_dealer.csv"))
dim_part = pd.read_csv(_raw_path("dim_part.csv"))

# Row counts of the three fact tables used downstream, as a quick load check.
for _label, _table in (
    ("Service jobs:", fact_service),
    ("Parts lines:", fact_parts),
    ("Inventory rows:", fact_inventory),
):
    print(_label, len(_table))

Service jobs: 11118
Parts lines: 22529
Inventory rows: 292800


In [2]:
# Enrich each service job with the owning customer and that customer's
# attributes: machine_id -> customer_id (via machine sales), then
# customer_id -> customer_type / size_band / region_id (via dim_customer).
service = (
    fact_service
    .merge(
        fact_machine_sales[["machine_id", "customer_id"]],
        on="machine_id",
        how="left",
    )
    .merge(
        dim_customer[["customer_id", "customer_type", "size_band", "region_id"]],
        on="customer_id",
        how="left",
    )
)

# A left join never drops left rows, so any change in row count means a lookup
# key was duplicated and service jobs were silently fanned out.
assert len(service) == len(fact_service), (
    "join duplicated service jobs: machine_id / customer_id is not unique in a lookup table"
)
# Every job must resolve to a customer.
assert service["customer_id"].isna().mean() == 0.0
service.head()

Unnamed: 0,service_job_id,machine_id,service_date,dealer_id,model_id,job_type,failure_code,downtime_days,labor_hours,service_cost_labor_inr,is_warranty,resolution_status,customer_id,customer_type,size_band,region_id
0,SJ0000001,MC01292,2023-01-06,D039,M03,Breakdown,F009,2,3.4,4760,1,Repeat,C00876,Contractor,Medium,R01
1,SJ0000002,MC01173,2023-01-13,D020,M02,Breakdown,F010,2,3.6,5040,1,Fixed,C00286,Contractor,Large,R04
2,SJ0000003,MC00900,2023-01-21,D028,M02,Preventive,,1,2.0,2600,1,Fixed,C01730,Fleet,Small,R05
3,SJ0000004,MC00900,2023-01-21,D028,M02,Inspection,,0,2.5,3250,1,Fixed,C01730,Fleet,Small,R05
4,SJ0000005,MC00800,2023-01-22,D040,M01,Breakdown,F010,1,4.3,6450,1,Fixed,C00173,Rental,Medium,R02


In [3]:
# Lookup from region_id to the warehouse_id assigned to it.
region_to_warehouse = {
    "R01": "W02", "R02": "W01",
    "R03": "W03", "R04": "W04",
    "R05": "W03", "R06": "W04",
}

# Attach the dealer's region to every parts line. An empty left suffix keeps
# any region_id already present on fact_parts under its original name.
dealer_regions = dim_dealer[["dealer_id", "region_id"]]
parts = pd.merge(
    fact_parts,
    dealer_regions,
    on="dealer_id",
    how="left",
    suffixes=("", "_dealer"),
)

# The suffixed column only exists when both inputs carried region_id; in that
# case the dealer's region is used purely to fill gaps in the line's own value.
if "region_id_dealer" in parts.columns:
    parts["region_id"] = parts["region_id"].fillna(parts["region_id_dealer"])
    parts = parts.drop(columns=["region_id_dealer"])

# Resolve the warehouse per line and require that every line got one.
parts["warehouse_id"] = parts["region_id"].map(region_to_warehouse)
assert parts["warehouse_id"].isna().mean() == 0.0

parts.head()

Unnamed: 0,parts_line_id,service_job_id,part_id,quantity,sale_date,dealer_id,region_id,revenue_inr,cost_inr,is_emergency_order,warehouse_id
0,PL00000001,SJ0000001,P0147,2,2023-01-06,D039,R04,3018.2,2076.44,0,W04
1,PL00000002,SJ0000002,P0044,1,2023-01-13,D020,R02,4049.03,2404.81,0,W01
2,PL00000003,SJ0000002,P0146,1,2023-01-13,D020,R02,785.9,576.39,0,W01
3,PL00000004,SJ0000002,P0155,1,2023-01-13,D020,R02,1396.53,866.67,0,W01
4,PL00000005,SJ0000003,P0146,1,2023-01-21,D028,R03,785.9,576.39,0,W03


In [4]:
# Inventory snapshot columns needed to look up the stockout status of a part
# at a warehouse on the day of the sale.
inv_key = fact_inventory[["snapshot_date", "warehouse_id", "part_id", "stockout_flag"]].copy()

# Re-running this cell used to merge a second stockout_flag into `parts`,
# producing stockout_flag_x / stockout_flag_y and a KeyError below. Dropping
# any flag left by a previous run first makes the cell idempotent.
# NOTE(review): assumes one inventory row per (snapshot_date, warehouse_id,
# part_id); duplicate snapshots would duplicate parts lines here - confirm.
parts = parts.drop(columns=["stockout_flag"], errors="ignore").merge(
    inv_key,
    left_on=["sale_date", "warehouse_id", "part_id"],
    right_on=["snapshot_date", "warehouse_id", "part_id"],
    how="left",
)

# Lines with no matching snapshot are treated as "not stocked out".
parts["stockout_flag"] = parts["stockout_flag"].fillna(0).astype(int)
# snapshot_date duplicates sale_date for matched rows; it was only a join key.
parts = parts.drop(columns=["snapshot_date"])

parts[["service_job_id","part_id","warehouse_id","sale_date","stockout_flag"]].head()

Unnamed: 0,service_job_id,part_id,warehouse_id,sale_date,stockout_flag
0,SJ0000001,P0147,W04,2023-01-06,0
1,SJ0000002,P0044,W01,2023-01-13,0
2,SJ0000002,P0146,W01,2023-01-13,0
3,SJ0000002,P0155,W01,2023-01-13,0
4,SJ0000003,P0146,W03,2023-01-21,0


In [5]:
# Roll part-line flags up to the job: a job is flagged when ANY of its parts
# lines carries the flag (max over 0/1 values).
job_stockout = (
    parts.groupby("service_job_id")["stockout_flag"]
    .max()
    .reset_index()
    .rename(columns={"stockout_flag": "is_stockout_impacted"})
)

job_emergency = (
    parts.groupby("service_job_id")["is_emergency_order"]
    .max()
    .reset_index()
    .rename(columns={"is_emergency_order": "is_emergency"})
)

# Re-running this cell used to merge the flags a second time, yielding
# *_x / *_y columns and a KeyError on the fillna lines. Dropping flags left by
# a previous run first makes the cell idempotent.
service = (
    service.drop(columns=["is_stockout_impacted", "is_emergency"], errors="ignore")
    .merge(job_stockout, on="service_job_id", how="left")
    .merge(job_emergency, on="service_job_id", how="left")
)

# Jobs with no parts lines have no flag after the left join; treat them as 0.
service["is_stockout_impacted"] = service["is_stockout_impacted"].fillna(0).astype(int)
service["is_emergency"] = service["is_emergency"].fillna(0).astype(int)

service[["service_job_id","job_type","downtime_days","resolution_status","is_stockout_impacted","is_emergency"]].head()

Unnamed: 0,service_job_id,job_type,downtime_days,resolution_status,is_stockout_impacted,is_emergency
0,SJ0000001,Breakdown,2,Repeat,0,0
1,SJ0000002,Breakdown,2,Fixed,0,0
2,SJ0000003,Preventive,1,Fixed,0,0
3,SJ0000004,Inspection,0,Fixed,0,0
4,SJ0000005,Breakdown,1,Fixed,0,0


In [6]:
def pain_score(row, noise_sd=0.35, generator=None):
    """Compute a synthetic "pain" score for one service job.

    The score is a sum of additive contributions read from `row`, plus one
    draw of Gaussian noise.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide "downtime_days", "job_type", "resolution_status",
        "is_stockout_impacted", "is_emergency" and "is_warranty".
    noise_sd : float, default 0.35
        Standard deviation of the zero-mean noise term. Use 0.0 for a
        noise-free score.
    generator : object with a ``normal(loc, scale)`` method, optional
        Source of the noise draw. When omitted, the notebook-level ``rng``
        is used, so the default call behaves exactly as before.

    Returns
    -------
    float
        The pain score; it can be negative (warranty discount and noise).
    """
    # Injecting the generator removes the hidden dependency on global state
    # and makes the function testable in isolation.
    noise_source = rng if generator is None else generator

    score = 0.0

    # Downtime: big driver, capped at 15 days so it contributes at most 3 points.
    score += min(row["downtime_days"], 15) * 0.20

    # Breakdown is more painful than other job types.
    if row["job_type"] == "Breakdown":
        score += 1.0

    # Repeat issue.
    if row["resolution_status"] == "Repeat":
        score += 1.2

    # Stockout impact.
    if row["is_stockout_impacted"] == 1:
        score += 1.1

    # Emergency orders indicate disruption.
    if row["is_emergency"] == 1:
        score += 0.6

    # Warranty customers are slightly less angry about cost.
    if row["is_warranty"] == 1:
        score -= 0.2

    # Noise (people variability): exactly one draw per call.
    score += float(noise_source.normal(0, noise_sd))

    return score

# Score every job row by row (axis=1 hands each row to pain_score), store the
# result on the frame, then show summary statistics as a sanity check.
pain_by_job = service.apply(pain_score, axis=1)
service["pain_score"] = pain_by_job
service["pain_score"].describe()

count    11118.000000
mean         0.861771
std          0.940791
min         -1.318350
25%          0.046011
50%          0.899908
75%          1.501092
max          4.548761
Name: pain_score, dtype: float64

In [7]:
# Satisfaction is obtained by reversing pain: more pain gives a lower score.
# The offsets and slope are tuning constants chosen so the distributions look realistic.

# CSAT on a 1-5 scale (higher is better): round, then clamp into range.
csat_cont = 5.2 - service["pain_score"]
csat = csat_cont.round().clip(lower=1, upper=5).astype(int)
service["csat_score"] = csat

# NPS on a 0-10 scale: same recipe with a steeper slope.
nps_cont = 10.5 - (service["pain_score"] * 1.6)
nps = nps_cont.round().clip(lower=0, upper=10).astype(int)
service["nps_score"] = nps

def nps_group(x):
    """Map an NPS score to its group: >= 9 Promoter, >= 7 Passive, else Detractor."""
    # Thresholds are checked from highest to lowest; the first one met wins.
    for floor, label in ((9, "Promoter"), (7, "Passive")):
        if x >= floor:
            return label
    return "Detractor"

# Label every job with its NPS group, then preview pain next to the derived scores.
nps_labels = service["nps_score"].map(nps_group)
service["nps_group"] = nps_labels

score_preview_cols = ["pain_score", "csat_score", "nps_score", "nps_group"]
service[score_preview_cols].head()

Unnamed: 0,pain_score,csat_score,nps_score,nps_group
0,2.506651,3,6,Detractor
1,0.836006,4,9,Promoter
2,0.262658,5,10,Promoter
3,0.129198,5,10,Promoter
4,0.317138,5,10,Promoter


In [8]:
def complaint_reason(row):
    """Pick a single headline complaint reason for one service job.

    Returns "Satisfied" when CSAT is at least 4 and NPS is at least 8.
    Otherwise returns the first matching reason in priority order, or
    "General dissatisfaction" when none of them matches.
    """
    if row["csat_score"] >= 4 and row["nps_score"] >= 8:
        return "Satisfied"

    # (condition, label) pairs listed from highest to lowest priority.
    ranked_checks = (
        (row["downtime_days"] >= 5, "High downtime"),
        (row["resolution_status"] == "Repeat", "Repeat issue"),
        (row["is_stockout_impacted"] == 1, "Parts unavailable"),
        (row["is_emergency"] == 1, "Emergency order"),
        (row["job_type"] == "Breakdown", "Unexpected breakdown"),
    )
    return next(
        (label for matched, label in ranked_checks if matched),
        "General dissatisfaction",
    )

# Assign one headline reason per job, then list the most frequent reasons.
reason_by_job = service.apply(complaint_reason, axis=1)
service["complaint_reason"] = reason_by_job
reason_counts = service["complaint_reason"].value_counts()
reason_counts.head(10)

complaint_reason
Satisfied               9000
Repeat issue             584
High downtime            560
Unexpected breakdown     479
Emergency order          314
Parts unavailable        181
Name: count, dtype: int64

In [9]:
# Survey response model: unhappy customers respond more often. Each CSAT point
# below 3 adds 0.08 to the base rate (and each point above subtracts 0.08),
# with the probability clamped to [0.15, 0.70].
base_resp = 0.35
csat_gap = 3 - service["csat_score"]
resp_prob = (base_resp + csat_gap * 0.08).clip(lower=0.15, upper=0.70)

# One uniform draw per job decides whether that job produced a response.
uniform_draws = rng.random(len(service))
service["responded"] = (uniform_draws < resp_prob).astype(int)

# Keep responders only, ordered by date then job id so feedback ids are sequential in time.
responded_mask = service["responded"] == 1
feedback = (
    service[responded_mask]
    .copy()
    .sort_values(["service_date","service_job_id"])
    .reset_index(drop=True)
)
feedback["feedback_id"] = ["FB" + str(seq).zfill(7) for seq in range(1, len(feedback) + 1)]

# Column order of the exported fact table.
feedback_columns = [
    "feedback_id",
    "service_job_id",
    "customer_id",
    "service_date",
    "csat_score",
    "nps_score",
    "nps_group",
    "complaint_reason",
    "is_stockout_impacted",
    "is_emergency",
    "downtime_days",
    "job_type",
    "resolution_status",
]
fact_customer_feedback = feedback[feedback_columns].copy()

# Persist alongside the other raw tables (no index column).
feedback_csv_path = os.path.join(RAW_DIR, "fact_customer_feedback.csv")
fact_customer_feedback.to_csv(feedback_csv_path, index=False)

print("✅ Feedback rows:", len(fact_customer_feedback))
fact_customer_feedback.head()

✅ Feedback rows: 2808


Unnamed: 0,feedback_id,service_job_id,customer_id,service_date,csat_score,nps_score,nps_group,complaint_reason,is_stockout_impacted,is_emergency,downtime_days,job_type,resolution_status
0,FB0000001,SJ0000001,C00876,2023-01-06,3,6,Detractor,Repeat issue,0,0,2,Breakdown,Repeat
1,FB0000002,SJ0000008,C01373,2023-01-23,5,10,Promoter,Satisfied,0,0,0,Inspection,Fixed
2,FB0000003,SJ0000010,C01277,2023-02-03,2,6,Detractor,High downtime,0,0,5,Breakdown,Repeat
3,FB0000004,SJ0000020,C02478,2023-02-09,5,10,Promoter,Satisfied,0,0,0,Inspection,Fixed
4,FB0000005,SJ0000024,C00612,2023-02-12,2,5,Detractor,Repeat issue,0,1,3,Breakdown,Repeat


In [10]:
# Score distributions among respondents.
csat_distribution = fact_customer_feedback["csat_score"].value_counts().sort_index()
print(csat_distribution)
nps_mix = fact_customer_feedback["nps_group"].value_counts()
print(nps_mix)

# Relationship check: stockout-impacted jobs should have lower CSAT on average.
impacted = fact_customer_feedback["is_stockout_impacted"]
csat_scores = fact_customer_feedback["csat_score"]
print("Avg CSAT (stockout=0):", csat_scores[impacted == 0].mean())
print("Avg CSAT (stockout=1):", csat_scores[impacted == 1].mean())

csat_score
1      14
2     121
3     636
4    1123
5     914
Name: count, dtype: int64
nps_group
Promoter     1551
Passive      1044
Detractor     213
Name: count, dtype: int64
Avg CSAT (stockout=0): 4.046244913059564
Avg CSAT (stockout=1): 2.7523809523809524


In [12]:
# Sanity checks: average CSAT should fall for breakdowns, repeat issues and
# longer downtime.
print("Avg CSAT by job_type")
print(fact_customer_feedback.groupby("job_type")["csat_score"].mean())

print("\nAvg CSAT by repeat vs not")
print(fact_customer_feedback.groupby("resolution_status")["csat_score"].mean())

print("\nAvg CSAT by downtime bucket")
# Right-closed bins: (-1, 1], (1, 3], (3, 7], (7, inf). The last edge is
# open-ended so the "8+" label holds for any downtime; with the former cap of
# 30, longer downtimes became NaN and silently dropped out of the averages.
fact_customer_feedback["downtime_bucket"] = pd.cut(
    fact_customer_feedback["downtime_days"],
    bins=[-1, 1, 3, 7, float("inf")],
    labels=["0-1", "2-3", "4-7", "8+"],
)
# observed=False keeps every bucket in the result (the long-standing default)
# and avoids the FutureWarning newer pandas emits for categorical groupers.
print(
    fact_customer_feedback
    .groupby("downtime_bucket", observed=False)["csat_score"]
    .mean()
)

Avg CSAT by job_type
job_type
Breakdown     3.575039
Inspection    4.974790
Preventive    4.928685
Name: csat_score, dtype: float64

Avg CSAT by repeat vs not
resolution_status
Fixed     4.134299
Repeat    2.625984
Name: csat_score, dtype: float64

Avg CSAT by downtime bucket
downtime_bucket
0-1    4.426179
2-3    3.603518
4-7    3.138393
8+     2.444444
Name: csat_score, dtype: float64
