# Exposure Gap Diagnostics

In [4]:
import json
from pathlib import Path
import pandas as pd

# Input files read by this notebook; paths are relative to the working directory.
GAPS_PATH = Path('../Results Datasets/exposures/exposure_gaps.json')
ENRICHED_PATH = Path('../Results Datasets/sjmm_ai_exposure.jsonl')

# Decode explicitly as UTF-8 so the load does not depend on the platform's
# default locale encoding (JSON interchange files are UTF-8).
with GAPS_PATH.open(encoding='utf-8') as fh:
    gaps = json.load(fh)

# One row per key of dataset_gaps.occupation_missing_exposure (labelled
# isco_2008), carrying its 'count' and the comma-joined SOC codes listed
# under 'soc_codes_without_exposure' (empty string when none are listed).
occupation_missing = pd.DataFrame(
    [
        {
            'isco_2008': isco,
            'ads_missing': info['count'],
            'soc_codes_without_exposure': ', '.join(info.get('soc_codes_without_exposure', []))
        }
        for isco, info in gaps['dataset_gaps']['occupation_missing_exposure'].items()
    ],
    # Naming the columns keeps the frame sortable when the gap report has no
    # entries; without it an empty frame has no 'ads_missing' column and
    # sort_values raises KeyError.
    columns=['isco_2008', 'ads_missing', 'soc_codes_without_exposure'],
).sort_values('ads_missing', ascending=False)

# One row per key of dataset_gaps.industry_missing_exposure (labelled noga),
# carrying its 'count' and the recorded 'reason' (empty string when absent).
industry_missing = pd.DataFrame(
    [
        {
            'noga': noga,
            'ads_missing': info['count'],
            'reason': info.get('reason', '')
        }
        for noga, info in gaps['dataset_gaps']['industry_missing_exposure'].items()
    ],
    # Same empty-input guard as for occupation_missing.
    columns=['noga', 'ads_missing', 'reason'],
).sort_values('ads_missing', ascending=False)

# Preview the entries with the largest counts.
occupation_missing.head()

Unnamed: 0,isco_2008,ads_missing,soc_codes_without_exposure
40,7200,1481,
21,3000,1085,
32,4320,646,
15,2500,544,
11,2140,516,


In [5]:
# Preview the first five rows of the industry gap table.
industry_missing.head(n=5)

Unnamed: 0,noga,ads_missing,reason
3,44,8714,NOGA not present in exposure lookup (likely cr...
1,34,2743,NOGA not present in exposure lookup (likely cr...
2,40,1066,NOGA not present in exposure lookup (likely cr...
0,4,961,NOGA not present in exposure lookup (likely cr...
4,48,915,NOGA not present in exposure lookup (likely cr...


In [6]:
# Gaps recorded for the lookup tables themselves, as opposed to the
# per-dataset gaps under 'dataset_gaps'.
lookup_gaps = gaps['lookup_gaps']

# Each (key, value) pair of the crosswalk-gap mapping becomes one row:
# the key is shown as 'naics', the value as 'reason'.
pd.DataFrame(
    data=[*lookup_gaps['industry_naics_crosswalk_gaps'].items()],
    columns=['naics', 'reason'],
).head(5)

Unnamed: 0,naics,reason
0,9991,no ISIC match
1,9992,no ISIC match
2,9993,no ISIC match


In [7]:
# Load the enriched dataset (JSON Lines: one record per line).
enriched = pd.read_json(ENRICHED_PATH, lines=True)

# Fraction of rows with a missing value in each exposure column
# (mean of the boolean NA mask, taken column-wise).
enriched.loc[:, ['occupation_exposure', 'industry_exposure']].isna().mean(axis=0)

occupation_exposure    0.079545
industry_exposure      0.131079
dtype: float64