# Imports

In [1]:
from pathlib import Path
import os

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains both a
# `data` and a `tools` entry.
cwd = Path.cwd()
repo_root = next(
    (p for p in [cwd, *cwd.parents] if (p / "data").exists() and (p / "tools").exists()),
    None,
)
if repo_root is None:
    # Without a default, next() would raise a bare StopIteration that says
    # nothing about what was being searched for.
    raise FileNotFoundError(
        f"Could not find a directory containing both 'data' and 'tools' at or above {cwd}"
    )

print("CWD:", cwd)
print("Repo root:", repo_root)

# Raw input file for this notebook; existence is reported, not enforced.
data_path = repo_root / "data" / "raw" / "unh_hackathon_prompt_2_data.json"
print("Data path exists:", data_path.exists(), data_path)

# Switch the process working directory to the repo root
# (presumably so `tools` imports and repo-relative paths in later cells resolve).
os.chdir(repo_root)
print("Now CWD:", Path.cwd())


CWD: /Users/michaelbanks/Desktop/Hackathon/UNH-Hackathon-Code/notebooks
Repo root: /Users/michaelbanks/Desktop/Hackathon/UNH-Hackathon-Code
Data path exists: True /Users/michaelbanks/Desktop/Hackathon/UNH-Hackathon-Code/data/raw/unh_hackathon_prompt_2_data.json
Now CWD: /Users/michaelbanks/Desktop/Hackathon/UNH-Hackathon-Code


# Load

In [2]:
from tools.io import load_prompt2_json

# Reuse the `data_path` resolved in the setup cell rather than repeating the
# path literal, so loading does not depend on the current working directory.
# Passed as a string because the loader was originally called with one.
df = load_prompt2_json(str(data_path))

In [3]:
import pandas as pd
# Quick orientation: frame dimensions, the leading column names, and two sample rows.
n_preview_cols = 20
print("shape:", df.shape)
print("first 20 cols:", df.columns[:n_preview_cols].tolist())
display(df.iloc[:2])


shape: (3000, 29)
first 20 cols: ['Threat Type', 'enemy_unit_count', 'Enemy.Capability.Index', 'ThreatEscalationHours', 'friendlyUnitCount', 'LCS_COUNT', 'Aircraft Count', 'cyber_defense_teams', 'Patriot.Batteries', 'ISR_AssetCount', 'satellite coverage score', 'JointForceIntegration', 'EW_Capability', 'Supply Chain Resilience', 'PriorEngagements', 'force_readiness_score', 'Intel Confidence', 'ResponseTime_hrs', 'logistics_delay_hours', 'CMD_COORD_SCORE']


Unnamed: 0,Threat Type,enemy_unit_count,Enemy.Capability.Index,ThreatEscalationHours,friendlyUnitCount,LCS_COUNT,Aircraft Count,cyber_defense_teams,Patriot.Batteries,ISR_AssetCount,...,CMD_COORD_SCORE,roe_complexity_score,Operational Budget (MUSD),BudgetUtilization_pct,Weather_Severity,Theater Distance KM,Season,response_success,Financial_Loss_MUSD,actual_days_to_stabilization
0,Missile,25,8,34,21,0,0,0,4,12,...,7,3,98,84,3,5164.8,Winter,1,388,10.1
1,Missile,3,2,8,45,0,0,0,4,4,...,9,5,189,97,1,7149.7,Fall,1,244,2.0


# How bad is it?

In [4]:
# Percentage of missing values per column, highest first.
null_share = df.isna().mean()
missing = null_share.mul(100).sort_values(ascending=False)
display(missing.head(20))

# Number of distinct values in each object/string column, highest first.
# `obj_cols` is reused by the following cell.
obj_cols = df.select_dtypes(include=["object", "string"]).columns
text_cardinality = df[obj_cols].nunique()
display(text_cardinality.sort_values(ascending=False).head(30))

Threat Type                  0.0
force_readiness_score        0.0
Financial_Loss_MUSD          0.0
response_success             0.0
Season                       0.0
Theater Distance KM          0.0
Weather_Severity             0.0
BudgetUtilization_pct        0.0
Operational Budget (MUSD)    0.0
roe_complexity_score         0.0
CMD_COORD_SCORE              0.0
logistics_delay_hours        0.0
ResponseTime_hrs             0.0
Intel Confidence             0.0
PriorEngagements             0.0
enemy_unit_count             0.0
Supply Chain Resilience      0.0
EW_Capability                0.0
JointForceIntegration        0.0
satellite coverage score     0.0
dtype: float64

Intel Confidence    72
Threat Type         15
Season              14
EW_Capability       12
response_success     4
dtype: int64

In [5]:
import pandas as pd
def _digitish_share(values):
    """Return the fraction of the first 50 non-null entries that look numeric.

    Each entry is cast to ``str``, stripped of ``.``, ``,`` and ``-``, and
    tested with ``str.isnumeric``. Returns ``None`` when the column has no
    non-null entries to inspect.
    """
    head = values.dropna().astype(str).head(50)
    if head.empty:
        return None
    return head.str.replace(r"[.,-]", "", regex=True).str.isnumeric().mean()


# Heuristic: an object column whose sampled values are mostly digits is
# probably a numeric column stored as strings.
shares = ((name, _digitish_share(df[name])) for name in obj_cols)
suspects = [(name, share) for name, share in shares if share is not None and share > 0.6]

suspect_table = pd.DataFrame(suspects, columns=["col", "pct_digitish"])
display(suspect_table.sort_values("pct_digitish", ascending=False).head(30))


Unnamed: 0,col,pct_digitish
0,EW_Capability,1.0
1,Intel Confidence,1.0
2,response_success,0.98


In [6]:
# For each target column that is present, show its 20 most frequent values
# (NaN counted as its own value).
target_cols = ["response_success", "Financial_Loss_MUSD", "actual_days_to_stabilization"]
for target in target_cols:
    if target not in df.columns:
        continue
    print("\n==", target, "==")
    counts = df[target].value_counts(dropna=False)
    display(counts.head(20))



== response_success ==


response_success
1      2525
0       415
Yes      54
No        6
Name: count, dtype: int64


== Financial_Loss_MUSD ==


Financial_Loss_MUSD
327    20
330    19
308    19
244    18
285    18
255    18
323    18
296    18
262    18
306    17
354    17
258    17
260    17
289    17
328    17
368    17
254    16
243    16
333    16
277    16
Name: count, dtype: int64


== actual_days_to_stabilization ==


actual_days_to_stabilization
2.0     306
8.6      32
9.8      27
8.4      27
5.1      26
8.8      26
10.9     25
6.3      25
7.4      24
12.2     24
8.3      24
10.4     24
10.8     24
6.5      23
7.9      23
6.2      23
6.8      23
7.8      23
7.5      23
11.0     22
Name: count, dtype: int64

# No missing values, at least. Dtypes are all over the place, and it's not clear what's going on in the categorical features.

# Clean

In [7]:

from tools.clean import clean_prompt2


# Run the project cleaning routine; it returns the cleaned frame and a report dict.
clean_df, report = clean_prompt2(df)

print("Clean shape:", clean_df.shape)

# Print the response_success value counts recorded in the report, before and after cleaning.
for heading, report_key in (
    ("\nresponse_success value counts BEFORE:", "response_success_value_counts_before"),
    ("\nresponse_success value counts AFTER:", "response_success_value_counts_after"),
):
    print(heading)
    print(report.get(report_key))

# Per-column missingness taken from the report, highest first.
missingness = pd.Series(report["missingness_pct"])
missing_clean = missingness.sort_values(ascending=False)
display(missing_clean.head(20))

# First three cleaned rows, then the column/dtype summary.
display(clean_df.iloc[:3])
clean_df.info()


Clean shape: (3000, 30)

response_success value counts BEFORE:
{'1': 2525, '0': 415, 'Yes': 54, 'No': 6}

response_success value counts AFTER:
{1: 2579, 0: 421}


EW_Capability               1.00
intel_confidence            0.83
enemy_capability_index      0.00
threat_escalation_hours     0.00
friendly_unit_count         0.00
lcs_count                   0.00
aircraft_count              0.00
cyber_defense_teams         0.00
patriot_batteries           0.00
isr_asset_count             0.00
satellite_coverage_score    0.00
joint_force_integration     0.00
supply_chain_resilience     0.00
prior_engagements           0.00
enemy_unit_count            0.00
readiness_level             0.00
response_time_hrs           0.00
logistics_delay_hours       0.00
cmd_coord_score             0.00
roe_complexity              0.00
dtype: float64

Unnamed: 0,threat_type,enemy_unit_count,enemy_capability_index,threat_escalation_hours,friendly_unit_count,lcs_count,aircraft_count,cyber_defense_teams,patriot_batteries,isr_asset_count,...,roe_complexity,budget_musd,budget_utilization_pct,weather_severity,distance_to_theater_km,Season,response_success,Financial_Loss_MUSD,actual_days_to_stabilization,bad_success
0,missile,25,8,34,21,0,0,0,4,12,...,3,98,84,3,5164.8,winter,1,388,10.1,0
1,missile,3,2,8,45,0,0,0,4,4,...,5,189,97,1,7149.7,fall,1,244,2.0,0
2,air,7,3,48,45,0,5,0,0,14,...,1,92,88,0,6706.5,winter,1,280,2.0,0


<class 'pandas.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   threat_type                   3000 non-null   object 
 1   enemy_unit_count              3000 non-null   int64  
 2   enemy_capability_index        3000 non-null   int64  
 3   threat_escalation_hours       3000 non-null   int64  
 4   friendly_unit_count           3000 non-null   int64  
 5   lcs_count                     3000 non-null   int64  
 6   aircraft_count                3000 non-null   int64  
 7   cyber_defense_teams           3000 non-null   int64  
 8   patriot_batteries             3000 non-null   int64  
 9   isr_asset_count               3000 non-null   int64  
 10  satellite_coverage_score      3000 non-null   float64
 11  joint_force_integration       3000 non-null   int64  
 12  EW_Capability                 2970 non-null   object 
 13  supply_chain_r

# That's better

In [8]:
# Persist the cleaned frame as parquet under <repo_root>/data/processed,
# creating the directory first if it does not exist yet.
out_path = repo_root.joinpath("data", "processed", "prompt2_clean.parquet")
processed_dir = out_path.parent
processed_dir.mkdir(parents=True, exist_ok=True)
clean_df.to_parquet(out_path, index=False)
print("Saved:", out_path)


Saved: /Users/michaelbanks/Desktop/Hackathon/UNH-Hackathon-Code/data/processed/prompt2_clean.parquet
