In [16]:
import pandas as pd
from pathlib import Path


def find_repo_root(start_path: Path) -> Path:
    """
    Return the closest directory at or above start_path that contains ".git".

    The start path is resolved to an absolute path first, so a relative
    start_path is searched all the way up to the filesystem root instead of
    stopping at ".".

    A ".git" entry of any kind counts: it is a directory in an ordinary
    clone, but a plain file in linked worktrees and submodules.

    Args:
        start_path: Directory from which the upward search begins.

    Returns:
        The absolute path of the first directory (start_path itself, then
        each parent in turn) that has a ".git" entry.

    Raises:
        FileNotFoundError: If no directory on the way up has a ".git" entry.
    """
    origin = Path(start_path).resolve()
    for candidate in (origin, *origin.parents):
        # exists() rather than is_dir(): worktrees/submodules use a .git *file*.
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("Could not locate a parent directory with a .git folder.")

# Anchor all paths at the repository root, located by walking up from the
# current working directory.
cwd = Path.cwd().resolve()
project_root = find_repo_root(cwd)

# <repo>/data/processed, and the CSV that this cell writes into it.
processed_dir = project_root.joinpath("data", "processed")
output_file = processed_dir.joinpath("expanded_resumes.csv")

# Create the directory (and any missing parents); a no-op if it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

# Name lists, one per race/gender category.
white_male_names = ["John Smith", "Michael Johnson", "James Brown"]
white_female_names = ["Emily Smith", "Olivia Johnson", "Ava Brown"]
black_male_names = ["Darnell Smith", "Malik Johnson", "Tyrone Brown"]
black_female_names = ["Latisha Smith", "Aaliyah Johnson", "Imani Brown"]

# (group label, names) pairs; the order here fixes the order of all_names.
_labelled_name_lists = (
    ("White-Male", white_male_names),
    ("White-Female", white_female_names),
    ("Black-Male", black_male_names),
    ("Black-Female", black_female_names),
)

# Flat list of every name, group by group.
all_names = [
    person
    for _label, people in _labelled_name_lists
    for person in people
]

# Reverse lookup: name -> group label.
name_to_group = {
    person: label
    for label, people in _labelled_name_lists
    for person in people
}

# Find every "resumes_*.csv" file inside data/processed.
# glob() yields paths in arbitrary filesystem order; sorting fixes the record
# order fed into the seeded shuffle below, so the saved CSV is reproducible
# across machines and runs rather than only for a given directory listing.
resume_file_paths = sorted(processed_dir.glob("resumes_*.csv"))

if not resume_file_paths:
    raise FileNotFoundError(f"No files matching “resumes_*.csv” found in {processed_dir}")

# One dict per (base resume, name) pair; turned into a DataFrame at the end.
augmented_records = []

for resume_file in resume_file_paths:
    # e.g. "resumes_15-0000.csv" -> "15-0000". Slice off only the leading
    # prefix so a later occurrence of "resumes_" in the stem is left intact.
    soc_family_code = resume_file.stem[len("resumes_"):]
    df_base = pd.read_csv(resume_file)

    # Walk the two needed columns in lockstep instead of building a Series
    # per row with iterrows(); each input file must provide "ID" and "text".
    for original_id, resume_text in zip(df_base["ID"], df_base["text"]):
        # Emit one copy of the resume per name, with the name on its own
        # first line above the original text.
        for name in all_names:
            augmented_records.append({
                "orig_resume_id": original_id,
                "soc_family": soc_family_code,
                "name_variant": name,
                "race_gender_group": name_to_group[name],
                "augmented_resume_text": f"{name}\n{resume_text}",
            })

# Shuffle (fixed seed) and save
augmented_df = pd.DataFrame(augmented_records)
augmented_df = augmented_df.sample(frac=1, random_state=42).reset_index(drop=True)
augmented_df.to_csv(output_file, index=False)

print(f"Augmented resumes saved to: {output_file}")


Augmented resumes saved to: /Users/alexchon/CS281Project/data/processed/expanded_resumes.csv
