This notebook cleans the raw Union Army data from NBER into a format
for mismo that minimizes the boilerplate that users will need.

In [1]:
import ibis
from ibis import _

# Interactive mode makes a bare table expression render a preview when it is
# the last statement of a cell.
ibis.options.interactive = True

# Raw CEN extract.
cen_path_in = "../../../../.scratch/union_army/CEN.csv"
cen = ibis.read_csv(cen_path_in)

# Raw MSR extract.
msr_path_in = "../../../../.scratch/union_army/MSR.csv"
msr = ibis.read_csv(msr_path_in)

cen

In [2]:
# Project CEN down to the id, the integer-cast label, and the text fields
# below, each with surrounding whitespace stripped.
cen_text_columns = (
    "first_name",
    "middle_name",
    "last_name",
    "birth_year",
    "birth_month",
    "birth_place",
    "gender",
)
cen = cen.select(
    "record_id",
    label_true=_.label_true.cast("int64"),
    **{column: _[column].strip() for column in cen_text_columns},
)
cen

In [3]:
def fix_month(m):
    """Replace three-letter month abbreviations with two-digit month numbers.

    Only the twelve unambiguous abbreviations ("JAN" ... "DEC") are rewritten.
    Every other value passes through untouched -- placeholders such as "UNK",
    "UN", "99" and "999" are deliberately NOT mapped -- except that a literal
    "--" is turned into null by the trailing ``nullif``.

    Args:
        m: column-like expression providing ``substitute`` and ``nullif``.

    Returns:
        The result of ``m.substitute(<month mapping>).nullif("--")``.
    """
    abbreviations = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    )
    # "JAN" -> "01", ..., "DEC" -> "12"
    month_numbers = {
        abbr: f"{position:02d}"
        for position, abbr in enumerate(abbreviations, start=1)
    }
    substituted = m.substitute(month_numbers)
    return substituted.nullif("--")


# Overwrite birth_month with its normalised form, then list the most frequent
# values that remain.
fixed_birth_month = fix_month(cen.birth_month)
cen = cen.mutate(birth_month=fixed_birth_month)
(
    cen.birth_month
    .topk(40)
    .execute()
)

Unnamed: 0,birth_month,Count(birth_month)
0,03,5139
1,05,4803
2,10,4646
3,08,4640
4,04,4620
5,02,4566
6,09,4499
7,01,4453
8,06,4359
9,12,4250


In [4]:
# Show the rows whose birth_year is not exactly four characters long.
# Uses Table.filter rather than boolean indexing via cen[...], which ibis has
# deprecated.
cen.filter(_.birth_year.length() != 4)

In [5]:
# Project MSR down to the columns below. Name and birth_date fields are
# whitespace-stripped; the enlistment columns are renamed to enlist_*.
# NOTE(review): birth_place is stripped for CEN but passed through as-is
# here -- confirm whether that difference is intentional.
msr_stripped_columns = ("first_name", "middle_name", "last_name", "birth_date")
msr = msr.select(
    "record_id",
    "label_true",
    **{column: _[column].strip() for column in msr_stripped_columns},
    birth_place=_.birth_place,
    enlist_date=_.enlistment_date,
    enlist_age=_.enlistment_age,
)
msr

In [6]:
# Frequency table of the (up to) 100 most common enlistment ages.
(
    msr.enlist_age
    .topk(100)
    .execute()
)

Unnamed: 0,enlist_age,Count(enlist_age)
0,18.0,5668
1,19.0,3251
2,21.0,3063
3,20.0,2655
4,22.0,2405
...,...,...
57,67.0,1
58,72.0,1
59,64.0,1
60,61.0,1


In [7]:
# Some dates in MSR are in MMDDYYYY format


def is_not_YYYYMMDD(date):
    """Flag dates whose first two characters are neither "18" nor "--".

    Args:
        date: string column-like expression supporting slicing and ``isin``.

    Returns:
        The negation of "the two-character prefix is one of 18 / --".
    """
    prefix = date[:2]
    starts_like_yyyymmdd = prefix.isin(["18", "--"])
    return ~starts_like_yyyymmdd


def fix_MMDDYYYY(date):
    """Reorder MMDDYYYY dates into YYYYMMDD.

    Values flagged by ``is_not_YYYYMMDD`` are rebuilt as characters 4-8,
    then 0-2, then 2-4 of the original string; values for which the flag
    is false are returned as they are.

    Args:
        date: string column-like expression supporting slicing and ``+``.

    Returns:
        ``flag.ifelse(reordered, date)``.
    """
    month, day, year = date[:2], date[2:4], date[4:8]
    reordered = year + month + day
    needs_fix = is_not_YYYYMMDD(date)
    return needs_fix.ifelse(reordered, date)


# Count how many enlist dates are flagged as not-YYYYMMDD, show the offending
# rows, then rewrite the column. Rows are selected with Table.filter because
# boolean indexing via msr[...] is deprecated in ibis.
print(is_not_YYYYMMDD(msr.enlist_date).value_counts())
display(msr.filter(is_not_YYYYMMDD(_.enlist_date)))
msr = msr.mutate(enlist_date=fix_MMDDYYYY(_.enlist_date))

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mNot(InValues(Substring(enlist_date, 0, 2)))[0m[1m [0m┃[1m [0m[1mNot(InValues(Substring(enlist_date, 0, 2)))_count[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ [2mboolean[0m                                     │ [2mint64[0m                                             │
├─────────────────────────────────────────────┼───────────────────────────────────────────────────┤
│ True                                        │                                                [1;36m25[0m │
│ False                                       │                                             [1;36m39279[0m │
│ [2mNULL[0m                                        │                                                [1;36m36[0m │
└─────────────────────────────────────────────┴────────────────────────────────────────────────

In [8]:
# Verify integrity


def is_unique(vals):
    """Return True when no non-null value of ``vals`` occurs more than once.

    Nulls are ignored. A column with no non-null values (including an empty
    column) is treated as unique.

    Args:
        vals: column expression supporting ``name`` and ``value_counts``.

    Returns:
        A plain ``bool``.
    """
    counts = vals.name("val").value_counts()
    # Table.filter is used instead of counts[...] boolean indexing, which
    # ibis has deprecated.
    nonnull = counts.filter(counts.val.notnull())
    repeated = nonnull.filter(nonnull.val_count > 1)
    # Counting offenders (rather than reducing with .all()) gives a definite
    # answer even when there are zero rows to reduce over.
    return bool(repeated.count().execute() == 0)


# Ids and ground-truth labels must not repeat within either table.
for column in (msr.record_id, cen.record_id, msr.label_true, cen.label_true):
    assert is_unique(column)

# Every MSR row carries a label; CEN is expected to contain unlabelled rows.
msr_missing_labels = msr.label_true.isnull().sum().execute()
assert msr_missing_labels == 0
cen_missing_labels = cen.label_true.isnull().sum().execute()
assert cen_missing_labels > 0

# Gender is sometimes missing, and the only non-null values are M, F and a
# stray "(" (presumably a data-entry artifact).
cen_missing_gender = cen.gender.isnull().sum().execute()
assert cen_missing_gender > 0
gender_values_ok = cen.gender.isin(["M", "F", "("]).all().execute()
assert gender_values_ok

# Dubious enlist ages: these bounds pin the implausible extremes present in
# the data rather than validate them.
youngest_enlist_age = msr.enlist_age.min().execute()
assert youngest_enlist_age == 2
oldest_enlist_age = msr.enlist_age.max().execute()
assert oldest_enlist_age >= 70

In [9]:
# Export both cleaned tables as CSV files relative to the current working
# directory (MSR first, then CEN).
msr_path_out, cen_path_out = "./MSR.csv", "./CEN.csv"
for table, destination in ((msr, msr_path_out), (cen, cen_path_out)):
    table.to_csv(destination)