In [1]:
import os

import numpy as np
import pandas as pd

# Input and output locations, relative to the notebook's working directory.
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"

# Read the issue table and its change-history table from the raw directory.
issues_csv = f"{RAW_DIR}/issues.csv"
history_csv = f"{RAW_DIR}/issues_change_history.csv"

issues = pd.read_csv(issues_csv)
history = pd.read_csv(history_csv)

In [2]:
dt_cols = ["issue_created", "issue_resolution_date", "last_change_date"]

# Convert whichever of the timestamp columns exist in `issues` to UTC datetimes.
# Values that cannot be parsed become NaT (errors="coerce").
present_dt_cols = [name for name in dt_cols if name in issues.columns]
for name in present_dt_cols:
    issues[name] = pd.to_datetime(issues[name], errors="coerce", utc=True)

In [3]:
def normalize_priority(p):
    """Collapse a raw priority label into one of five canonical buckets.

    Matching ignores letter case and surrounding whitespace.

    Parameters
    ----------
    p : scalar
        Raw priority value; it is converted with ``str`` before matching.
        Missing values (as detected by ``pd.isna``) are accepted.

    Returns
    -------
    str
        One of ``"Critical"``, ``"High"``, ``"Medium"``, ``"Low"`` or
        ``"Unknown"``. Missing values and unrecognised labels both yield
        ``"Unknown"``.
    """
    if pd.isna(p):
        return "Unknown"

    # Strip padding so labels such as " High " are still recognised.
    label = str(p).strip().lower()

    # "critical" is included so the canonical label maps to itself; without it
    # an already-normalized value would be downgraded to "Unknown".
    if label in ("blocker", "highest", "critical"):
        return "Critical"
    if label == "high":
        return "High"
    if label == "medium":
        return "Medium"
    if label in ("low", "lowest"):
        return "Low"

    # NOTE(review): any other label (e.g. tracker-specific names) ends up here;
    # check the raw label set if the "Unknown" bucket looks unexpectedly large.
    return "Unknown"

# Derive the normalized priority bucket for every issue.
raw_priority = issues["issue_priority"]
issues["priority_norm"] = raw_priority.map(normalize_priority)

# Show how the issues are distributed across the buckets.
issues["priority_norm"].value_counts(dropna=False)

priority_norm
Unknown     33965
Medium      24788
High         4554
Critical     2740
Low           644
Name: count, dtype: int64

In [4]:
# Elapsed time between creation and resolution, expressed in hours.
time_to_resolve = issues["issue_resolution_date"] - issues["issue_created"]
issues["resolution_hours"] = time_to_resolve.dt.total_seconds() / 3600

In [5]:
# Limit extreme resolution times to the 99th percentile of the observed values.
resolution_hours = issues["resolution_hours"]
cap = resolution_hours.quantile(q=0.99)
issues["resolution_hours_capped"] = resolution_hours.clip(upper=cap)

cap

np.float64(20564.896027777777)

In [6]:
# SLA target per normalized priority bucket, compared against resolution hours.
SLA_MAP = dict(Critical=24, High=48, Medium=72, Low=120, Unknown=72)

issues["sla_hours"] = issues["priority_norm"].map(SLA_MAP)

# A row is flagged 1 only when it has a resolution time and that time exceeds
# its target; rows without a resolution time are flagged 0.
has_resolution = issues["resolution_hours"].notna()
over_target = issues["resolution_hours"] > issues["sla_hours"]
issues["sla_breach"] = np.where(has_resolution & over_target, 1, 0)

In [7]:
# Count, per issue, the change-history rows whose `field` mentions "assignee"
# (case-insensitive) and attach that count to `issues` as `reassignment_count`.
# NOTE(review): every matching history row is counted, so an initial assignment
# would count as a "reassignment" if the history logs it -- confirm.
assignee_changes = history[history["field"].str.lower().str.contains("assignee", na=False)]

reassign_counts = (
    assignee_changes
    .groupby("issueid")
    .size()
    .reset_index(name="reassignment_count")
)

issues = (
    issues
    # Drop a count left behind by an earlier run of this cell. Without this, a
    # re-run makes the merge emit `reassignment_count_x` / `_y` and the fillna
    # below raises KeyError.
    .drop(columns=["reassignment_count"], errors="ignore")
    .merge(
        # Join on a shared key name so no redundant `issueid` column is added
        # to `issues`.
        reassign_counts.rename(columns={"issueid": "id"}),
        how="left",
        on="id",
    )
)

# Issues with no matching history rows get a count of 0.
issues["reassignment_count"] = issues["reassignment_count"].fillna(0).astype(int)

In [8]:
# Columns exported to the analytics dataset, in output order.
final_cols = [
    "id",
    "issue_created", "issue_resolution_date",
    "issue_type", "issue_status",
    "priority_norm",
    "resolution_hours_capped",
    "sla_breach",
    "reassignment_count",
]

# Take an independent copy so later edits to `final_df` do not touch `issues`.
final_df = issues.loc[:, final_cols].copy()

final_df.head()

Unnamed: 0,id,issue_created,issue_resolution_date,issue_type,issue_status,priority_norm,resolution_hours_capped,sla_breach,reassignment_count
0,11887.0,2016-01-06 08:23:43+00:00,2016-01-06 08:56:55+00:00,Ticket,done,Medium,0.553333,0,0
1,11890.0,2016-01-11 10:06:19+00:00,2016-01-12 12:30:23+00:00,Ticket,done,Medium,26.401111,0,0
2,11904.0,2016-01-21 07:28:20+00:00,2016-01-26 08:21:47+00:00,Ticket,done,Medium,120.890833,1,1
3,11907.0,2016-01-26 07:44:54+00:00,2016-01-26 07:45:48+00:00,Vacation,done,Medium,0.015,0,0
4,11912.0,2016-02-01 13:45:47+00:00,2016-02-07 06:21:42+00:00,Story,done,Medium,136.598611,1,1


In [10]:
# Make sure the output directory exists: `to_csv` does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Write the analytics table without the DataFrame index.
final_df.to_csv(f"{PROCESSED_DIR}/support_tickets_analytics.csv", index=False)