# 03 Silver data profiling
- Load silver dataset from storage
- Validate required training columns
- Keep historical incidents only (Closed/Completed)
- Remove duplicate and incomplete records
- Profile timestamp quality, label balance, and text completeness
- Export a compact profiling summary for downstream steps

## 1) Config + load silver input


In [89]:
# Imports
from pathlib import Path
import io
import json
from datetime import datetime, timezone

import pandas as pd
from minio import Minio

In [90]:
# Read .env for credentials
def load_env_file(path: Path) -> dict:
    """Parse a simple ``KEY=VALUE`` env file into a dict of strings.

    Blank lines, lines starting with ``#``, lines without ``=`` and lines
    whose key is empty are skipped. Keys and values are whitespace-trimmed,
    and the line is split on the first ``=`` only, so values may contain
    ``=`` themselves. A value wrapped in one matching pair of single or
    double quotes has that pair removed, so ``KEY="secret"`` yields
    ``secret`` rather than ``"secret"``.

    Args:
        path: Location of the env file.

    Returns:
        Mapping of variable name to value; later duplicates overwrite
        earlier ones.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing env file: {path.resolve()}")

    env = {}
    # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        key, sep, value = line.partition("=")
        key = key.strip()
        if not sep or not key:
            continue

        value = value.strip()
        # Drop one pair of matching surrounding quotes; unbalanced or
        # mismatched quotes are left untouched.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        env[key] = value
    return env

# Credentials file, addressed relative to the notebook's working directory
ENV_FILE = Path("..") / "docker" / ".env"

# Storage endpoint plus the `secure` flag handed to the client
MINIO_SECURE = False
MINIO_ENDPOINT = "localhost:9000"

# Silver input object
SILVER_BUCKET = "incident-pipeline-test"
SILVER_PREFIX = "silver/incidents"
SILVER_OBJECT = "/".join([SILVER_PREFIX, "incidents.parquet"])

# Profiling summary output object
PROFILE_BUCKET = "data-profile-test"
PROFILE_PREFIX = "silver/incidents"
PROFILE_SUMMARY_OBJECT = "/".join([PROFILE_PREFIX, "silver_dq_summary.json"])

# Build the storage client from the env-file credentials, then list the
# buckets as a quick connectivity check
env = load_env_file(ENV_FILE)
minio_settings = {
    "access_key": env["MINIO_ROOT_USER"],
    "secret_key": env["MINIO_ROOT_PASSWORD"],
    "secure": MINIO_SECURE,
}
client = Minio(MINIO_ENDPOINT, **minio_settings)

bucket_names = [bucket.name for bucket in client.list_buckets()]
print("Connected buckets:", bucket_names)

Connected buckets: ['incident-pipeline', 'incident-pipeline-test', 'mlflow-artifacts']


In [91]:
# Download the silver parquet object, then parse it into a DataFrame
resp = client.get_object(SILVER_BUCKET, SILVER_OBJECT)
try:
    parquet_bytes = resp.read()
finally:
    # Close the response and release its connection whether or not the read succeeded
    resp.close()
    resp.release_conn()
silver_df = pd.read_parquet(io.BytesIO(parquet_bytes))

# Show the row count, the column names and a three-row preview
print("Rows:", len(silver_df))
print("Columns:", list(silver_df.columns))
silver_df.head(3)

Rows: 10000
Columns: ['sys_id', 'number', 'task_effective_number', 'sys_class_name', 'state', 'incident_state', 'priority', 'impact', 'urgency', 'severity', 'approval', 'escalation', 'notify', 'opened_at', 'resolved_at', 'closed_at', 'activity_due', 'due_date', 'sys_created_on', 'sys_updated_on', 'sys_created_by', 'sys_updated_by', 'opened_by', 'caller_id', 'resolved_by', 'closed_by', 'assignment_group', 'assigned_to', 'business_service', 'cmdb_ci', 'category', 'subcategory', 'contact_type', 'short_description', 'description', 'comments', 'work_notes', 'comments_and_work_notes', 'work_notes_list', 'parent_incident', 'problem_id', 'rfc', 'correlation_id', 'correlation_display', 'active', 'knowledge', 'made_sla', 'close_code', 'close_notes', 'calendar_duration', 'business_duration', 'calendar_stc', 'business_stc', 'sys_mod_count', 'reassignment_count', 'reopen_count', 'sla_due', 'sys_domain', 'sys_domain_path', 'watch_list', 'group_list', 'additional_assignee_list', 'u_system', 'u_system

Unnamed: 0,sys_id,number,task_effective_number,sys_class_name,state,incident_state,priority,impact,urgency,severity,...,u_system,u_system_criticality,u_initial_assignment_group,u_suggested_assignment_group,u_suggested_category,u_suggested_subcategory,u_outage_day,u_outage_system,bronze_run_id,ingested_at_utc
0,f15a43b287b3f46cb189b5c147a9f458,INC1200546,INC1200546,Incident,Closed,Closed,3 - Moderate,3 - Low,2 - High,2 - High,...,Core Network,High,Network Ops,Network Ops,Network,DNS,False,,20260228T130029Z,2026-02-28 13:02:01.566995+00:00
1,6f0c35505028c4a933934f9219a923e9,INC1205435,INC1205435,Incident,Closed,Closed,3 - Moderate,2 - Medium,2 - High,1 - Critical,...,VPN Service,High,Network Ops,Network Ops,Network,VPN,False,,20260228T130029Z,2026-02-28 13:02:01.566995+00:00
2,8fcc0f613c48a29a0ad6e49bae3e8b32,INC1203952,INC1203952,Incident,Closed,Closed,3 - Moderate,3 - Low,3 - Low,2 - High,...,Print Services,Low,End User Compute,End User Compute,Hardware,Printing,False,,20260228T130029Z,2026-02-28 13:02:01.566995+00:00


## 2) Training data filters + quality cleanup


In [92]:
# Columns the training steps depend on; stop here if the silver data lacks any
REQUIRED_COLS = [
    "sys_id",
    "sys_updated_on",
    "short_description",
    "description",
    "active",
    "state",
    "assignment_group",
]
available_cols = set(silver_df.columns)
MISSING_COLS = [name for name in REQUIRED_COLS if name not in available_cols]
if not MISSING_COLS:
    print("All required columns found")
else:
    raise ValueError(f"Missing required columns: {MISSING_COLS}")

All required columns found


In [93]:
# Compare on trimmed, lower-cased strings so casing and padding do not matter
state_norm = silver_df["state"].astype(str).str.strip().str.lower()
active_norm = silver_df["active"].astype(str).str.strip().str.lower()

# Tally state and active values before filtering
open_count = state_norm.eq("open").sum()
closed_count = state_norm.eq("closed").sum()
active_true_count = active_norm.isin(["true", "1", "yes", "y"]).sum()
active_false_count = active_norm.isin(["false", "0", "no", "n"]).sum()

for caption, tally in (
    ("Open records", open_count),
    ("Closed records", closed_count),
    ("Active=True records", active_true_count),
    ("Active=False records", active_false_count),
):
    print(f"{caption}: {tally}")

# Keep only incidents whose state is closed or completed
keep_mask = state_norm.isin(["closed", "completed"])
silver_filtered_df = silver_df[keep_mask].copy()

print("Filtered final record count:", len(silver_filtered_df))

Open records: 0
Closed records: 10000
Active=True records: 0
Active=False records: 10000
Filtered final record count: 10000


In [94]:
# Drop repeated sys_id rows, keeping the first occurrence of each
duplicate_mask = silver_filtered_df["sys_id"].duplicated(keep="first")
duplicate_count = duplicate_mask.sum()
silver_filtered_df = silver_filtered_df[~duplicate_mask].copy()

print("Duplicate sys_id rows removed:", duplicate_count)
print("Record count after de-duplication:", len(silver_filtered_df))

Duplicate sys_id rows removed: 0
Record count after de-duplication: 10000


In [95]:
# Drop rows where any required training column is null or blank
required_view = silver_filtered_df[REQUIRED_COLS].copy()

# A value counts as blank when its string form is empty after trimming whitespace
blank_flags = pd.DataFrame(
    {name: required_view[name].astype(str).str.strip().eq("") for name in REQUIRED_COLS},
    index=required_view.index,
)
null_or_blank = required_view.isna() | blank_flags

rows_with_missing_required = null_or_blank.any(axis=1)
missing_required_count = rows_with_missing_required.sum()
silver_filtered_df = silver_filtered_df[~rows_with_missing_required].copy()

print("Rows removed for missing/blank required fields:", missing_required_count)
print("Record count after required col cleanup:", len(silver_filtered_df))

Rows removed for missing/blank required fields: 0
Record count after required col cleanup: 10000


## 3) Profiling checks + output summary


In [96]:
# Rows left for model training once every filter above has been applied
print("Final training record count:", len(silver_filtered_df))

Final training record count: 10000


In [97]:
# Parse sys_updated_on into UTC datetimes; values that cannot be parsed become NaT
silver_filtered_df["_updated_dt"] = pd.to_datetime(
    silver_filtered_df["sys_updated_on"], utc=True, errors="coerce"
)
updated_dt = silver_filtered_df["_updated_dt"]

# Report how many values failed to parse and the time window covered after all filters
failures = updated_dt.isna().sum()
print("sys_updated_on parse failures (training set):", failures)
print("sys_updated_on min (training set):", updated_dt.min())
print("sys_updated_on max (training set):", updated_dt.max())

sys_updated_on parse failures (training set): 0
sys_updated_on min (training set): 2025-02-01 09:13:18+00:00
sys_updated_on max (training set): 2026-01-31 23:58:32+00:00


In [98]:
# Distribution of the target label (assignment_group), used to spot obvious
# class imbalance before modeling
clean_labels = silver_filtered_df["assignment_group"].astype(str).str.strip()
silver_filtered_df["assignment_group_clean"] = clean_labels

label_counts = silver_filtered_df["assignment_group_clean"].value_counts()
total_records = len(silver_filtered_df)
label_percentages = (label_counts / total_records * 100).round(2)

# One line per class: raw count and share of all training rows
print("Assignment group distribution:")
for group, count in label_counts.items():
    print(f"  {group}: {count} / {label_percentages.loc[group]}%")

Assignment group distribution:
  App Support - M365: 1762 / 17.62%
  Network Ops: 1339 / 13.39%
  App Support - Power BI: 1115 / 11.15%
  App Support - Power Platform: 1025 / 10.25%
  End User Compute: 976 / 9.76%
  Identity and User Access: 875 / 8.75%
  App Support - Microsoft Fabric: 833 / 8.33%
  Security Operations: 615 / 6.15%
  App Support - ERP: 407 / 4.07%
  App Support - Finance: 380 / 3.8%
  App Support - HRIS: 345 / 3.45%
  Integration & Middleware: 328 / 3.28%


In [99]:
# Classes with fewer rows than this threshold are reported as rare
MIN_CLASS_COUNT = 20

counts = silver_filtered_df["assignment_group_clean"].value_counts(dropna=True)
is_rare = counts.lt(MIN_CLASS_COUNT)
rare = counts[is_rare]

print(f"Classes with < {MIN_CLASS_COUNT} examples:", rare.shape[0])
rare.to_frame("count")

Classes with < 20 examples: 0


Unnamed: 0_level_0,count
assignment_group_clean,Unnamed: 1_level_1


In [100]:
# Join short_description and description (newline-separated) and measure
# how often the combined ticket text is empty, plus its length distribution
short_part = silver_filtered_df["short_description"].fillna("").astype(str).str.strip()
long_part = silver_filtered_df["description"].fillna("").astype(str).str.strip()
text = (short_part + "\n" + long_part).str.strip()

lengths = text.str.len()
empty_rate = (text == "").mean()

print("Empty combined text rate:", f"{empty_rate:.2%}")
lengths.describe(percentiles=[0.5, 0.75, 0.9, 0.95])

Empty combined text rate: 0.00%


count    10000.000000
mean       411.844200
std         95.289842
min        116.000000
50%        431.000000
75%        452.000000
90%        483.000000
95%        503.050000
max        570.000000
dtype: float64

In [None]:
# Save profiling summary to dedicated profiling bucket in object storage,
# creating the bucket first when it does not exist yet
if not client.bucket_exists(PROFILE_BUCKET):
    client.make_bucket(PROFILE_BUCKET)
    print(f"Created bucket: {PROFILE_BUCKET}")
else:
    print(f"Using existing bucket: {PROFILE_BUCKET}")

# Values are converted to plain Python types so json.dumps can encode them.
summary = {
    "run_utc": datetime.now(timezone.utc).isoformat(),
    "silver_source": {"bucket": SILVER_BUCKET, "object": SILVER_OBJECT},
    "rows_raw": int(len(silver_df)),
    "rows_training": int(len(silver_filtered_df)),
    "fields": {
        "text": ["short_description", "description"],
        "label": "assignment_group",
        "label_clean": "assignment_group_clean",
    },
    "training_filters": {
        "state_keep": ["closed", "completed"],
        "dedupe": {"key": "sys_id", "keep": "first"},
        "required_cols": REQUIRED_COLS,
    },
    "label_stats": {
        "unique_classes": int(silver_filtered_df["assignment_group_clean"].nunique()),
        # Report the threshold that was actually used to compute `rare`,
        # rather than a separately hard-coded copy that could drift.
        "min_class_count_reporting": int(MIN_CLASS_COUNT),
        # Explicit str/int conversion keeps the mapping JSON-encodable
        # regardless of how pandas boxes the index labels and counts.
        "rare_classes": {str(label): int(n) for label, n in rare.items()},
    },
    "time_window_training": {
        "min": str(silver_filtered_df["_updated_dt"].min()),
        "max": str(silver_filtered_df["_updated_dt"].max()),
        "parse_failures": int(silver_filtered_df["_updated_dt"].isna().sum()),
    },
}

# put_object is given a stream plus its exact byte length
summary_bytes = json.dumps(summary, indent=2).encode("utf-8")
client.put_object(
    bucket_name=PROFILE_BUCKET,
    object_name=PROFILE_SUMMARY_OBJECT,
    data=io.BytesIO(summary_bytes),
    length=len(summary_bytes),
    content_type="application/json",
)
print(f"Wrote summary to s3://{PROFILE_BUCKET}/{PROFILE_SUMMARY_OBJECT}")

Created bucket: data-profile-test
Wrote summary to s3://data-profile-test/silver/incidents/silver_dq_summary.json
