# Phase 1: Data Processing

This notebook shows all the data processing steps and checks the data at each step

Steps:
1. load the raw data
2. clean up resumes  
3. clean up jobs
4. map occupations together
5. split into train/test sets
6. create name variants for each resume
7. make job-resume pairs

## 1. Import Libraries

import all the stuff we need

In [7]:
import pandas as pd
import json
from pathlib import Path
from collections import Counter
import difflib

# Add src to path
import sys
sys.path.insert(0, str(Path.cwd().parent))

## 2. Set Up File Paths

define where all the data files are

In [8]:
# Raw data
RAW_RESUMES = Path("../data/raw/resume-dataset.csv")
RAW_JOBS = Path("../data/raw/jobs.csv")

# paths to processed data files
RESUMES_CLEAN = Path("../data/processed/resumes_clean.jsonl")
JOBS_CLEAN = Path("../data/processed/jobs_clean.jsonl")
OCCUPATION_MAP = Path("../data/processed/occupation_map.json")
SPLIT_IDS = Path("../data/processed/split_ids.json")
RESUMES_AUGMENTED = Path("../data/processed/resumes_augmented.jsonl")
PAIRS = Path("../data/processed/job_resume_pairs_phase1.csv")

print("Paths configured")

Paths configured


## 3. Load Raw Data

load the original datasets before any cleaning

In [47]:
# read the raw csv files
df_resumes_raw = pd.read_csv(RAW_RESUMES)
df_jobs_raw = pd.read_csv(RAW_JOBS)

print("RAW DATA STATS")
print("~" * 50)
print(f"\n Resumes (resume-dataset.csv):")
print(f"   Total records: {len(df_resumes_raw):,}")
print(f"   Columns: {list(df_resumes_raw.columns)}")
print(f"   Unique categories: {df_resumes_raw['Category'].nunique()}")
print(f"   File size: {RAW_RESUMES.stat().st_size / (1024*1024):.1f} MB")

print(f"\n Jobs (jobs.csv):")
print(f"   Total records: {len(df_jobs_raw):,}")
print(f"   Columns: {list(df_jobs_raw.columns)}")
print(f"   Unique major_job categories: {df_jobs_raw['major_job'].nunique()}")
print(f"   File size: {RAW_JOBS.stat().st_size / (1024*1024):.1f} MB")

print("\nResume Categories:")
print(df_resumes_raw['Category'].value_counts().sort_index())

print("\n Job Major Categories:")
print(df_jobs_raw['major_job'].value_counts().sort_index())

RAW DATA STATS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Resumes (resume-dataset.csv):
   Total records: 2,484
   Columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
   Unique categories: 24
   File size: 53.7 MB

 Jobs (jobs.csv):
   Total records: 4,412
   Columns: ['Unnamed: 0', 'ISCO', 'major_job', 'job', 'position', 'location', 'description']
   Unique major_job categories: 16
   File size: 5.3 MB

Resume Categories:
Category
ACCOUNTANT                118
ADVOCATE                  118
AGRICULTURE                63
APPAREL                    97
ARTS                      103
AUTOMOBILE                 36
AVIATION                  117
BANKING                   115
BPO                        22
BUSINESS-DEVELOPMENT      120
CHEF                      118
CONSTRUCTION              112
CONSULTANT                115
DESIGNER                  107
DIGITAL-MEDIA              96
ENGINEERING               118
FINANCE                   118
FITNESS                   117
HEALTHCAR

## 4. Resume Cleaning

check the cleaned resume data (HTML removed, short ones filtered out)

In [48]:
# load the cleaned resumes from jsonl file
resumes_clean = []
with open(RESUMES_CLEAN, 'r', encoding='utf-8') as f:
    for line in f:
        resumes_clean.append(json.loads(line))

df_resumes_clean = pd.DataFrame(resumes_clean)

print("RESUME CLEANING RESULTS")
print("~" * 50)
print(f"Before cleaning: {len(df_resumes_raw):,} resumes")
print(f"After cleaning:  {len(df_resumes_clean):,} resumes")
print(f"Dropped:         {len(df_resumes_raw) - len(df_resumes_clean)} resumes")
print(f"Retention rate:  {100 * len(df_resumes_clean) / len(df_resumes_raw):.2f}%")

print("\nText Length Statistics:")
text_lengths = df_resumes_clean['text'].str.len()
print(f"   Min:    {text_lengths.min():,} characters")
print(f"   Median: {text_lengths.median():,.0f} characters")
print(f"   Max:    {text_lengths.max():,} characters")

print("\ncategory distribution after cleaning:")
print(df_resumes_clean['occupation'].value_counts().sort_index())

RESUME CLEANING RESULTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Before cleaning: 2,484 resumes
After cleaning:  2,483 resumes
Dropped:         1 resumes
Retention rate:  99.96%

Text Length Statistics:
   Min:    688 characters
   Median: 5,547 characters
   Max:    38,412 characters

category distribution after cleaning:
occupation
accountant                118
advocate                  118
agriculture                63
apparel                    97
arts                      103
automobile                 36
aviation                  117
banking                   115
bpo                        22
business_development      119
chef                      118
construction              112
consultant                115
designer                  107
digital_media              96
engineering               118
finance                   118
fitness                   117
healthcare                115
hr                        110
information_technology    120
public_relations        

## 5. Job Cleaning

check the cleaned job descriptions (parsed and sampled 300 per occupation)

In [50]:
# load the cleaned jobs
jobs_clean = []
with open(JOBS_CLEAN, 'r', encoding='utf-8') as f:
    for line in f:
        jobs_clean.append(json.loads(line))

df_jobs_clean = pd.DataFrame(jobs_clean)

print("JOB CLEANING RESULTS")
print("~" * 50)
print(f"before cleaning: {len(df_jobs_raw):,} jobs")
print(f"after cleaning:  {len(df_jobs_clean):,} jobs")
print(f"retention rate:  {100 * len(df_jobs_clean) / len(df_jobs_raw):.2f}%")

print("\nDescription Length Statistics:")
desc_lengths = df_jobs_clean['text'].str.len()
print(f"   Min:    {desc_lengths.min():,} characters")
print(f"   Median: {desc_lengths.median():,.0f} characters")
print(f"   Max:    {desc_lengths.max():,} characters")

print("\nOccupation distribution:")
print(df_jobs_clean['occupation'].value_counts().sort_index())

JOB CLEANING RESULTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
before cleaning: 4,412 jobs
after cleaning:  3,307 jobs
retention rate:  74.95%

Description Length Statistics:
   Min:    51 characters
   Median: 953 characters
   Max:    6,536 characters

Occupation distribution:
occupation
administrative_and_commercial_managers                     300
business_and_administration_associate_professionals        251
business_and_administration_professionals                  286
cleaners_and_helpers                                       202
customer_services_clerks                                   237
drivers_and_mobile_plant_operators                         199
health_associate_professionals                              73
health_professionals                                       300
information_and_communications_technicians                  94
information_and_communications_technology_professionals    300
market_oriented_skilled_agricultural_workers               119
protect

## 6. Occupation Mapping

check how resume categories map to job categories

In [52]:
# load the occupation mapping file
with open(OCCUPATION_MAP, 'r', encoding='utf-8') as f:
    occupation_data = json.load(f)

# get the actual mapping dictionary
occupation_map = occupation_data['mapping']

print("OCCUPATION MAPPING")
print("~" * 50)
print(f"Resume categories mapped: {len(occupation_map)}")
print(f"Job occupations available: {df_jobs_clean['occupation'].nunique()}")

print("\nfirst 5 mappings:")
for i, (resume_cat, job_cats) in enumerate(list(occupation_map.items())[:5]):
    print(f"\n{i+1}. {resume_cat}:")
    for job_cat in job_cats:
        print(f"   -> {job_cat}")

# check if all resume categories are mapped
resume_categories = set(df_resumes_clean['occupation'].unique())
mapped_categories = set(occupation_map.keys())
unmapped = resume_categories - mapped_categories

print(f"\ncoverage: {len(mapped_categories)}/{len(resume_categories)} resume categories have mappings")
if unmapped:
    print(f"WARNING: Unmapped categories: {unmapped}")
else:
    print("SUCCESS: all resume categories have job mappings")

OCCUPATION MAPPING
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Resume categories mapped: 24
Job occupations available: 16

first 5 mappings:

1. information_technology:
   -> information_and_communications_technology_professionals
   -> information_and_communications_technicians

2. digital_media:
   -> information_and_communications_technology_professionals

3. engineering:
   -> science_and_engineering_professionals
   -> science_and_engineering_associate_professionals

4. aviation:
   -> science_and_engineering_professionals

5. automobile:
   -> science_and_engineering_professionals
   -> drivers_and_mobile_plant_operators

coverage: 24/24 resume categories have mappings
SUCCESS: all resume categories have job mappings


### Verify Mapping Application

Make sure the occupations were actually mapped in the data

In [None]:
# check if resumes have the mapped occupation column
print("checking resumes_clean data")
if 'mapped_occupation' in df_resumes_clean.columns:
    print(f"found 'mapped_occupation' column")
    print(f"non-null values: {df_resumes_clean['mapped_occupation'].notna().sum()}/{len(df_resumes_clean)}")
    print(f"null rate: {df_resumes_clean['mapped_occupation'].isna().mean()*100:.2f}%")
else:
    print("no 'mapped_occupation' column in resumes_clean")
    print("this is normal - mapping happens in build_pairs.py")

print("\nthe occupation mapping will be applied when we build pairs")
print("we'll verify it works correctly in the pairs section below")

checking resumes_clean data
no 'mapped_occupation' column in resumes_clean
this is normal - mapping happens in build_pairs.py

checking pairs data
has 'occupation_match' column: True
match rate: 48.4%
mismatch rate: 51.6%

example matches (first 5):
   Resume: business_development           -> Job: administrative_and_commercial_managers
   Resume: business_development           -> Job: administrative_and_commercial_managers
   Resume: business_development           -> Job: administrative_and_commercial_managers
   Resume: business_development           -> Job: administrative_and_commercial_managers
   Resume: business_development           -> Job: administrative_and_commercial_managers

Example mismatches (first 5):
   Resume: healthcare                     -> Job: administrative_and_commercial_managers
   Resume: healthcare                     -> Job: administrative_and_commercial_managers
   Resume: healthcare                     -> Job: administrative_and_commercial_managers
   Resu

## 7. Train/Test Split

check the split ratios and make sure theres no leakage

In [54]:
# load the split IDs
with open(SPLIT_IDS, 'r', encoding='utf-8') as f:
    split_ids = json.load(f)

train_ids = set(split_ids['train'])
val_ids = set(split_ids.get('val', []))
test_ids = set(split_ids['test'])

total = len(train_ids) + len(val_ids) + len(test_ids)

print("TRAIN/TEST SPLIT")
print("~" * 50)
print(f"total base resumes: {total:,}")
print(f"\ntrain: {len(train_ids):,} ({100 * len(train_ids) / total:.1f}%)")
if val_ids:
    print(f"val:   {len(val_ids):,} ({100 * len(val_ids) / total:.1f}%)")
print(f"test:  {len(test_ids):,} ({100 * len(test_ids) / total:.1f}%)")

# make sure theres no overlap between splits
overlap_train_test = train_ids & test_ids
overlap_train_val = train_ids & val_ids
overlap_val_test = val_ids & test_ids

print("\nsplit integrity check:")
print(f"   train intersect test: {len(overlap_train_test)} (should be 0)")
if val_ids:
    print(f"   Train INTERSECT Val:  {len(overlap_train_val)} (should be 0)")
    print(f"   Val INTERSECT Test:   {len(overlap_val_test)} (should be 0)")

if not overlap_train_test and not overlap_train_val and not overlap_val_test:
    print("\ngood, no leakage detected")
else:
    print("\nwarning: overlap detected between splits")

TRAIN/TEST SPLIT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
total base resumes: 2,483

train: 1,727 (69.6%)
val:   361 (14.5%)
test:  395 (15.9%)

split integrity check:
   train intersect test: 0 (should be 0)
   Train INTERSECT Val:  0 (should be 0)
   Val INTERSECT Test:   0 (should be 0)

good, no leakage detected


## 8. Name Augmentation

check that we created 4 variants for each resume (one for each demographic group)

In [37]:
# load augmented resumes
resumes_augmented = []
with open(RESUMES_AUGMENTED, 'r', encoding='utf-8') as f:
    for line in f:
        resumes_augmented.append(json.loads(line))

df_augmented = pd.DataFrame(resumes_augmented)

print("NAME AUGMENTATION RESULTS")
print("~" * 50)
print(f"base resumes:      {len(df_resumes_clean):,}")
print(f"augmented resumes: {len(df_augmented):,}")
print(f"expected variants: {len(df_resumes_clean) * 4:,}")
print(f"multiplier:        {len(df_augmented) / len(df_resumes_clean):.1f}x")

print("\ndemographic group counts:")
group_counts = df_augmented['demographic_group'].value_counts().sort_index()
for group, cnt in group_counts.items():
    pct = 100 * cnt / len(df_augmented)
    print(f"   {group:20s}: {cnt:,} ({pct:.1f}%)")

# check if balanced
expected_per_group = len(df_resumes_clean)
all_balanced = all(cnt == expected_per_group for cnt in group_counts.values)

if all_balanced:
    print("\ngood, perfect balance - each group has exactly 25%")
else:
    print("\nwarning: imbalance in demographic groups")

NAME AUGMENTATION RESULTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Base resumes:      2,483
Augmented resumes: 9,932
Expected variants: 9,932
Multiplier:        4.0x

Demographic Group Balance:
   black_female        : 2,483 (25.0%)
   black_male          : 2,483 (25.0%)
   white_female        : 2,483 (25.0%)
   white_male          : 2,483 (25.0%)

SUCCESS: Perfect balance - each group has exactly 25% of variants


## 9. Check that Only Names Changed

verify that the 4 variants of each resume are identical except for the name

In [42]:
# pick a random resume to check
sample_base_id = df_augmented['base_resume_id'].iloc[100]
variants = df_augmented[df_augmented['base_resume_id'] == sample_base_id].sort_values('demographic_group')

print("COUNTERFACTUAL CHECK")
print("~" * 50)
print(f"base resume ID: {sample_base_id}")
print(f"number of variants: {len(variants)}\n")

# get the base text (skip first 2 lines which have the name)
texts = []
for _, row in variants.iterrows():
    lines = row['text'].split('\n', 2)
    if len(lines) >= 3:
        base_text = lines[2]
    else:
        base_text = row['text']
    texts.append(base_text)
    print(f"{row['demographic_group']}:")
    print(f"   name: {row['name_first']} {row['name_last']}")
    print(f"   first 150 chars: {row['text'][:150]}\n")

# check if all base texts are the same
all_identical = all(text == texts[0] for text in texts)

print("\nbase text comparison:")
if all_identical:
    print("verified: all 4 variants have identical base text")
    print("only the name at the top is different")
else:
    print("warning: base texts are not identical")
    print("\nshowing differences:")
    for i in range(1, len(texts)):
        if texts[i] != texts[0]:
            diff = difflib.unified_diff(
                texts[0][:500].splitlines(keepends=True),
                texts[i][:500].splitlines(keepends=True),
                lineterm=''
            )
            print(f"\ndifference between variant 0 and {i}:")
            print(''.join(diff))

COUNTERFACTUAL PROPERTY VERIFICATION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Base Resume ID: 46258701
Number of variants: 4

black_female:
   Name: Aisha Williams
   First 150 chars of text: Aisha Williams

HR COORDINATOR Professional Summary Highly efficient Hr Coordinator well established in administrative environments that are fast-pace...

black_male:
   Name: Jamal Williams
   First 150 chars of text: Jamal Williams

HR COORDINATOR Professional Summary Highly efficient Hr Coordinator well established in administrative environments that are fast-pace...

white_female:
   Name: Meredith Williams
   First 150 chars of text: Meredith Williams

HR COORDINATOR Professional Summary Highly efficient Hr Coordinator well established in administrative environments that are fast-p...

white_male:
   Name: Matthew Williams
   First 150 chars of text: Matthew Williams

HR COORDINATOR Professional Summary Highly efficient Hr Coordinator well established in administrative environments 

## 10. Job-Resume Pairs

Create (job, resume) pairs for evaluation

In [55]:
# load the pairs
df_pairs = pd.read_csv(PAIRS)

print("PAIR GENERATION RESULTS")
print("~" * 50)
print(f"total pairs: {len(df_pairs):,}")
print(f"unique jobs: {df_pairs['job_id'].nunique():,}")
print(f"unique resume variants: {df_pairs['resume_variant_id'].nunique():,}")
print(f"unique base resumes: {df_pairs['base_resume_id'].nunique():,}")

print("\npair stats:")
print(f"   avg candidates per job: {len(df_pairs) / df_pairs['job_id'].nunique():.1f}")
print(f"   avg jobs per base resume: {len(df_pairs) / df_pairs['base_resume_id'].nunique():.1f}")

PAIR GENERATION RESULTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
total pairs: 9,920
unique jobs: 320
unique resume variants: 5,968
unique base resumes: 1,492

pair stats:
   avg candidates per job: 31.0
   avg jobs per base resume: 6.6


### Verify Occupation Matching in Pairs

check that the occupation matching worked correctly

In [None]:
print("OCCUPATION MATCHING VERIFICATION")
print("~" * 50)

# check if the pairs have occupation_match column
print(f"has 'occupation_match' column: {'occupation_match' in df_pairs.columns}")

if 'occupation_match' in df_pairs.columns:
    match_rate = df_pairs['occupation_match'].mean()
    print(f"match rate: {match_rate*100:.1f}%")
    print(f"mismatch rate: {(1-match_rate)*100:.1f}%")
    
    # show some example pairings
    print("\nexample matches (first 5):")
    sample_matches = df_pairs[df_pairs['occupation_match'] == True].head(5)
    for idx, row in sample_matches.iterrows():
        print(f"   Resume: {row['resume_occupation']:30s} -> Job: {row['job_occupation']}")
        
    print("\nexample mismatches (first 5):")
    sample_mismatches = df_pairs[df_pairs['occupation_match'] == False].head(5)
    for idx, row in sample_mismatches.iterrows():
        print(f"   Resume: {row['resume_occupation']:30s} -> Job: {row['job_occupation']}")
        
    # verify matches are correct using our mapping
    print("\nverifying matches are correct (first 10):")
    errors = 0
    for idx, row in df_pairs[df_pairs['occupation_match'] == True].head(10).iterrows():
        resume_occ = row['resume_occupation']
        job_occ = row['job_occupation']
        expected_jobs = occupation_map.get(resume_occ, [])
        is_correct = job_occ in expected_jobs
        status = "✓" if is_correct else "✗"
        if not is_correct:
            errors += 1
        print(f"   {status} {resume_occ} -> {job_occ}")
    
    if errors == 0:
        print("\ngood, all sampled matches are correct")
    else:
        print(f"\nwarning: {errors} errors found in sampled matches")

## 11. Check Pair Balance

make sure the pairs are balanced across demographics and splits

In [56]:
print("PAIR BALANCE ANALYSIS")
print("~" * 50)

# 1. Match/Mismatch ratio
print("\n1. Occupation Match Balance:")
match_counts = df_pairs['occupation_match'].value_counts()
for is_match, cnt in match_counts.items():
    label = "Match" if is_match else "Mismatch"
    pct = 100 * cnt / len(df_pairs)
    print(f"   {label:12s}: {cnt:,} ({pct:.1f}%)")

# 2. Demographic balance
print("\n2. Demographic Group Balance:")
demo_counts = df_pairs['demographic_group'].value_counts().sort_index()
for group, cnt in demo_counts.items():
    pct = 100 * cnt / len(df_pairs)
    print(f"   {group:20s}: {cnt:,} ({pct:.1f}%)")

# see if balanced within 1%
demo_pcts = [100 * cnt / len(df_pairs) for cnt in demo_counts.values]
demo_balanced = max(demo_pcts) - min(demo_pcts) < 1.0

if demo_balanced:
    print("   good, balanced within 1%")
else:
    print(f"   warning: imbalance detected ({max(demo_pcts) - min(demo_pcts):.2f}% range)")

# 3. Split balance
print("\n3. Train/Test Split Balance:")
split_counts = df_pairs['split'].value_counts().sort_index()
for split, cnt in split_counts.items():
    pct = 100 * cnt / len(df_pairs)
    print(f"   {split.capitalize():6s}: {cnt:,} ({pct:.1f}%)")

# 4. Quartet completeness
print("\n4. Counterfactual Quartet Completeness:")
quartets = df_pairs.groupby(['job_id', 'base_resume_id']).size()
incomplete_quartets = quartets[quartets != 4]

print(f"   Total (job, base_resume) pairs: {len(quartets):,}")
print(f"   Complete quartets (4 variants): {len(quartets[quartets == 4]):,}")
print(f"   Incomplete quartets: {len(incomplete_quartets):,}")

if len(incomplete_quartets) == 0:
    print("   good, all quartets are complete")
else:
    print(f"   warning: {len(incomplete_quartets)} incomplete quartets found")
    print("\n   incomplete quartet sizes:")
    print(incomplete_quartets.value_counts().sort_index())

PAIR BALANCE ANALYSIS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

1. Occupation Match Balance:
   Mismatch    : 5,120 (51.6%)
   Match       : 4,800 (48.4%)

2. Demographic Group Balance:
   black_female        : 2,480 (25.0%)
   black_male          : 2,480 (25.0%)
   white_female        : 2,480 (25.0%)
   white_male          : 2,480 (25.0%)
   good, balanced within 1%

3. Train/Test Split Balance:
   Test  : 1,656 (16.7%)
   Train : 6,772 (68.3%)
   Val   : 1,492 (15.0%)

4. Counterfactual Quartet Completeness:
   Total (job, base_resume) pairs: 2,480
   Complete quartets (4 variants): 2,480
   Incomplete quartets: 0
   good, all quartets are complete


## 12. Summary Table

overview of all the processing steps

In [58]:
print("PHASE 1 PIPELINE SUMMARY")
print("~" * 50)

summary = pd.DataFrame([
    {"step": "1. raw resumes", "count": f"{len(df_resumes_raw):,}", "details": f"{df_resumes_raw['Category'].nunique()} categories"},
    {"step": "2. raw jobs", "count": f"{len(df_jobs_raw):,}", "details": f"{df_jobs_raw['major_job'].nunique()} major categories"},
    {"step": "3. cleaned resumes", "count": f"{len(df_resumes_clean):,}", "details": f"{100 * len(df_resumes_clean) / len(df_resumes_raw):.1f}% retention"},
    {"step": "4. cleaned jobs", "count": f"{len(df_jobs_clean):,}", "details": f"300 per occupation, {df_jobs_clean['occupation'].nunique()} occupations"},
    {"step": "5. occupation mappings", "count": f"{len(occupation_map)}", "details": "100% coverage"},
    {"step": "6. train split", "count": f"{len(train_ids):,}", "details": f"{100 * len(train_ids) / total:.1f}% of base resumes"},
    {"step": "7. val split", "count": f"{len(val_ids):,}", "details": f"{100 * len(val_ids) / total:.1f}% of base resumes"},
    {"step": "8. test split", "count": f"{len(test_ids):,}", "details": f"{100 * len(test_ids) / total:.1f}% of base resumes"},
    {"step": "9. augmented variants", "count": f"{len(df_augmented):,}", "details": "4 per base resume"},
    {"step": "10. final pairs", "count": f"{len(df_pairs):,}", "details": f"{df_pairs['job_id'].nunique()} jobs x ~32 candidates"},
])

print(summary.to_string(index=False))

print("\n" + "~" * 50)
print("Phase 1 Complete")
print("~" * 50)

PHASE 1 PIPELINE SUMMARY
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                  step count                            details
        1. raw resumes 2,484                      24 categories
           2. raw jobs 4,412                16 major categories
    3. cleaned resumes 2,483                   100.0% retention
       4. cleaned jobs 3,307 300 per occupation, 16 occupations
5. occupation mappings    24                      100% coverage
        6. train split 1,727              69.6% of base resumes
          7. val split   361              14.5% of base resumes
         8. test split   395              15.9% of base resumes
 9. augmented variants 9,932                  4 per base resume
       10. final pairs 9,920          320 jobs x ~32 candidates

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Phase 1 Complete
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
