In [1]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd 
import seaborn as sns 
from typing import Any
import numpy as np 

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# pd.set_option('display.width', 200)

from util_consts import HISTTYPES_PATH
from util_classes import SeerRecord
from util_enums import Grade, Behavior, RegionalNodes, Source
from util_maps import (
    TSTAGE_AJCC,
    TSTAGE_AJCC7th_AJCC_MAP,
    TSTAGE_SEER_AJCC_MAP,
    TSTAGE_EOD_AJCC_MAP,
    NSTAGE_AJCC,
    NSTAGE_SEER_AJCC_MAP,
    NSTAGE_EOD_AJCC_MAP,
    GSTAGE_AJCC,
    GSTAGE_SEER_AJCC_MAP,
    GSTAGE_EOD_AJCC_MAP,
    GRADE_SEER_STD_MAP,
    GRADE_NAACCR_STD_MAP,
    CANCERTYPE_CANCERGROUP_MAP,
    PRIMARYCODE_PRIMARYSITE_MAP,
    PRIMARYCODE_PRIMARYSITE_RANGES,
)
# Sort the imported range table in place: _binary_search_primary (defined in the
# next cell) bisects this list and therefore needs it ordered.
PRIMARYCODE_PRIMARYSITE_RANGES.sort()

# Input / output locations.
# NOTE(review): these are absolute, machine-specific paths — consider deriving them
# from a configurable data directory so the notebook can run elsewhere.
# INFILE_DICT is the dictionary file parsed further down for its '[Variables]' section;
# INFILE_DATA / OUTFILE are presumably the raw and cleaned tables (their use is not
# visible in this part of the notebook — confirm).
INFILE_DATA = '/home/grace/work/SEER/data/SEER_2010_2020_RICH.tsv'
INFILE_DICT = '/home/grace/work/SEER/data/SEER_2010_2020_RICH.dic'
OUTFILE = '/home/grace/work/SEER/data/SEER_2010_2020_RICH.clean.tsv'


In [2]:
# multi field formatters 

# Primary-site codes that format_primary_type resolves to 'Hodgkin - Extranodal' only
# (i.e. 'NHL - Extranodal' is not offered as an alternative). They correspond to the
# exclusion list quoted in format_primary_type:
# C024, C098-C099, C111, C142, C379, C420-C422, C424, C770-C779.
NHL_BLACKLIST_CODES = {
    24, 98, 99, 111, 142, 379, 420, 421, 422, 424, 
    770, 771, 772, 773, 774, 775, 776, 777, 778, 779
}
# Cancer-type labels that _sites_match accepts as matching the primary site 'Lymphatics'.
LYMPHATICS_SITES = {
    'Myeloma',
    'Kaposi Sarcoma',
    'Mesothelioma',
    'Acute Lymphocytic Leukemia',
    'Acute Monocytic Leukemia',
    'Acute Myeloid Leukemia',
    'Aleukemic, Subleukemic and NOS',
    'Chronic Lymphocytic Leukemia',
    'Chronic Myeloid Leukemia',
    'Other Acute Leukemia',
    'Other Myeloid/Monocytic Leukemia',
    'Other Lymphocytic Leukemia',
    'Hodgkin - Extranodal',
    'Hodgkin - Nodal',
    'NHL - Extranodal',
    'NHL - Nodal',
}
# NOTE(review): not referenced anywhere in the visible part of this notebook —
# confirm whether it is used further down or can be removed.
MESOTHELIOMA_SITES = {
    'Pleura',
    'Peritoneum, Omentum and Mesentery',
}

def format_primary_type(row: pd.Series) -> str:
    """
    Resolve the primary-site label of one record, expressed in CANCER_TYPE terms.

    The numeric 'Primary Site' code is looked up in PRIMARYCODE_PRIMARYSITE_MAP first,
    then in the sorted PRIMARYCODE_PRIMARYSITE_RANGES. Codes found in neither are
    treated as extranodal lymphoma sites:
        Hodgkin - Extranodal: all other sites
        NHL - Extranodal: all sites except C024, C098-C099, C111, C142, C379,
                          C420-C422, C424, C770-C779
    so such a code yields either 'Hodgkin - Extranodal' or the multi-valued label
    'Hodgkin - Extranodal | NHL - Extranodal'.

    Returns
        The record's cancer type ('Site recode ICD-O-3/WHO 2008') when it agrees with
        the resolved primary site (directly, via any '|' alternative, or because the
        cancer type is 'Miscellaneous'); otherwise the resolved primary-site label.

    Raises
        AssertionError if no label could be resolved, or if a label that disagrees
        with the cancer type is not a key of CANCERTYPE_CANCERGROUP_MAP.
    """
    site_code = int(row['Primary Site'])
    cancer_type = row['Site recode ICD-O-3/WHO 2008']

    # exact-code table takes precedence over the range table
    primary = PRIMARYCODE_PRIMARYSITE_MAP.get(site_code)
    if primary is None:
        primary = _binary_search_primary(site_code, PRIMARYCODE_PRIMARYSITE_RANGES)
    if primary is None:
        # unmapped code: fall back to the extranodal lymphoma labels
        if site_code in NHL_BLACKLIST_CODES:
            primary = 'Hodgkin - Extranodal'
        else:
            primary = 'Hodgkin - Extranodal | NHL - Extranodal'
    assert primary

    # --- primary site agrees with the cancer type -> keep the cancer type ---
    if _sites_match(cancer_type, primary):
        return cancer_type
    alternatives = [part.strip() for part in primary.split('|')] if '|' in primary else []
    if any(_sites_match(cancer_type, alt) for alt in alternatives):
        return cancer_type
    # 'Miscellaneous' (multiple sites of cancer) is always kept as-is
    if cancer_type == 'Miscellaneous':
        return cancer_type

    # --- disagreement -> report the primary site, which must be a known term ---
    if alternatives:
        assert all(alt in CANCERTYPE_CANCERGROUP_MAP for alt in alternatives)
    else:
        assert primary in CANCERTYPE_CANCERGROUP_MAP
    return primary

def _binary_search_primary(code: int, ranges: list) -> str|None:
    mid = len(ranges) // 2
    if len(ranges) == 0:
        return None
    elif ranges[mid][0] <= code and code <= ranges[mid][1]:
        return ranges[mid][2]
    elif mid == 0:
        return None
    elif ranges[mid][0] < code:
        return _binary_search_primary(code, ranges[mid:])
    else:
        return _binary_search_primary(code, ranges[:mid])
    
def _sites_match(site1: str, site2: str) -> bool:
    """
    special cases: Lymphatics, Miscellaneous
    """
    if site1 == site2:
        return True
    if site2 == 'Lymphatics' and site1 in LYMPHATICS_SITES:
        return True
    return False 

# Regexes applied with re.match by the format_*stage_ajcc helpers; group 1 is the
# collapsed stage they return.
# T: a leading 'Tis' or 'T1'..'T4' (any sub-stage suffix is dropped).
TSTAGE_PATTERN = r'^(T(is|[1234]))'
# N: a leading 'N0'..'N3'.
NSTAGE_PATTERN = r'^(N[0123])'
# Stage group: longer numerals are listed first so 'IV'/'III'/'II' are not cut short to 'I'.
GSTAGE_PATTERN = r'^(IV|III|II|I|0)'

# Raw values the _get_*stage_ajcc helpers skip (treat as "no stage recorded") in the
# 2016-2017 SEER columns and the 2018+ EOD columns respectively.
NA_STAGE_SEER = ['99']
NA_STAGE_EOD = ['88', '99', 'DMS code 90 (invalid inputs)']

# 8     Derived AJCC T, 6th ed (2004-2015)
# 9	    Derived AJCC N, 6th ed (2004-2015)
# 10	Derived AJCC T, 7th ed (2010-2015)
# 11	Derived AJCC N, 7th ed (2010-2015)
# 12	Derived SEER Combined T (2016-2017)
# 13	Derived SEER Combined N (2016-2017)
# 14	Derived EOD 2018 T (2018+)
# 15	Derived EOD 2018 N (2018+)
# 16	Derived AJCC Stage Group, 6th ed (2004-2015)
# 17	Derived AJCC Stage Group, 7th ed (2010-2015)
# 18	Derived SEER Cmb Stg Grp (2016-2017)
# 19	Derived EOD 2018 Stage Group (2018+)

def format_tstage_ajcc(row: pd.Series) -> Any:
    """
    Collapse the record's T stage to one of {Tis, T1, T2, T3, T4}.

    The raw stage comes from _get_tstage_ajcc. 'Ta' and 'Tispu' are folded into
    'Tis'; sub-stages are truncated (T1a -> T1). Anything that does not start
    with Tis/T1-T4 (e.g. T0, TX) or is not a string yields NaN.
    """
    aliases = {
        'Tispu': 'Tis',
        'Ta': 'Tis',
    }
    raw = _get_tstage_ajcc(row)
    if not isinstance(raw, str):
        return np.nan
    hit = re.match(TSTAGE_PATTERN, aliases.get(raw, raw))
    return hit.group(1) if hit else np.nan

def format_nstage_ajcc(row: pd.Series) -> Any:
    """
    Collapse the record's N stage to one of {N0, N1, N2, N3}.

    N0 means no detectable lymph nodes (i.e. no spread). The raw stage comes from
    _get_nstage_ajcc; 'N4' is folded into 'N3' and sub-stages are truncated.
    Values that do not start with N0-N3 (e.g. NX) or are not strings yield NaN.
    """
    aliases = {'N4': 'N3'}
    raw = _get_nstage_ajcc(row)
    if not isinstance(raw, str):
        return np.nan
    hit = re.match(NSTAGE_PATTERN, aliases.get(raw, raw))
    return hit.group(1) if hit else np.nan

def format_gstage_ajcc(row: pd.Series) -> Any:
    """
    Collapse the record's stage group to one of {0, I, II, III, IV}.

    The raw group comes from _get_gstage_ajcc. Non-string values and 'OCCULT'
    yield NaN; every other string must start with a stage numeral.

    Raises
        AssertionError if a string other than 'OCCULT' does not match GSTAGE_PATTERN.
    """
    raw = _get_gstage_ajcc(row)
    if not isinstance(raw, str) or raw == 'OCCULT':
        return np.nan
    hit = re.match(GSTAGE_PATTERN, raw)
    assert hit
    return hit.group(1)

def _get_gstage_ajcc(row: pd.Series) -> Any:
    ajcc_6th = row['Derived AJCC Stage Group, 6th ed (2004-2015)']
    ajcc_7th = row['Derived AJCC Stage Group, 7th ed (2010-2015)']
    seer = row['Derived SEER Cmb Stg Grp (2016-2017)']
    eod = row['Derived EOD 2018 Stage Group (2018+)']

    # 2004-2015 prioritise 6th
    if isinstance(ajcc_6th, str):
        return ajcc_6th

    # 2010-2015 7th
    if isinstance(ajcc_7th, str):
        return ajcc_7th
    
    # 2016-2017
    if isinstance(seer, str) and seer not in NA_STAGE_SEER:
        seer = re.sub(r'^1', 'I', seer)
        seer = re.sub(r'^2', 'II', seer)
        seer = re.sub(r'^3', 'III', seer)
        seer = re.sub(r'^4', 'IV', seer)
        return seer if seer in GSTAGE_AJCC else GSTAGE_SEER_AJCC_MAP[seer] 
    
    # 2018+
    if isinstance(eod, str) and eod not in NA_STAGE_EOD:
        eod = re.sub(r'^1', 'I', eod)
        eod = re.sub(r'^2', 'II', eod)
        eod = re.sub(r'^3', 'III', eod)
        eod = re.sub(r'^4', 'IV', eod)
        return eod if eod in GSTAGE_AJCC else GSTAGE_EOD_AJCC_MAP[eod]
    
    return None

def _get_nstage_ajcc(row: pd.Series) -> Any:
    ajcc_6th = row['Derived AJCC N, 6th ed (2004-2015)']
    ajcc_7th = row['Derived AJCC N, 7th ed (2010-2015)']
    seer = row['Derived SEER Combined N (2016-2017)']
    eod = row['Derived EOD 2018 N (2018+)']
    uninformative = ['NX']

    # 2010-2015 uninformative 6th, informative 7th
    if isinstance(ajcc_6th, str) and isinstance(ajcc_7th, str):
        if ajcc_6th in uninformative and ajcc_7th not in uninformative:
            return ajcc_7th

    # 2004-2015
    if isinstance(ajcc_6th, str):
        return ajcc_6th

    # 2010-2015
    if isinstance(ajcc_7th, str):
        return ajcc_7th
    
    # 2016-2017
    if isinstance(seer, str) and seer not in NA_STAGE_SEER:
        seer = re.sub(r'^(c|p)', '', seer)
        if seer == 'X':
            return 'NX'
        nstage = 'N' + seer.lower()
        return nstage if nstage in NSTAGE_AJCC else NSTAGE_SEER_AJCC_MAP[seer] 
    
    # 2018+
    if isinstance(eod, str) and eod not in NA_STAGE_EOD:
        return eod if eod in NSTAGE_AJCC else NSTAGE_EOD_AJCC_MAP[eod] 
    
    return None

def _get_tstage_ajcc(row: pd.Series) -> Any:
    ajcc_6th = row['Derived AJCC T, 6th ed (2004-2015)']
    ajcc_7th = row['Derived AJCC T, 7th ed (2010-2015)']
    seer = row['Derived SEER Combined T (2016-2017)']
    eod = row['Derived EOD 2018 T (2018+)']
    uninformative = ['T0', 'TX']
    tstage = None

    # 2010-2015 prioritise 6th else convert 7th -> 6th
    if isinstance(ajcc_6th, str) and isinstance(ajcc_7th, str):
        if ajcc_6th in uninformative and ajcc_7th not in uninformative:
            return ajcc_7th if ajcc_7th in TSTAGE_AJCC else TSTAGE_AJCC7th_AJCC_MAP[ajcc_7th]
    # 2004-2015
    if isinstance(ajcc_6th, str):
        return ajcc_6th
    # 2010-2015
    if isinstance(ajcc_7th, str):
        return ajcc_7th if ajcc_7th in TSTAGE_AJCC else TSTAGE_AJCC7th_AJCC_MAP[ajcc_7th]
    # 2016-2017
    if isinstance(seer, str) and seer not in NA_STAGE_SEER:
        seer = re.sub(r'^(c|p)', '', seer)
        if seer == 'X':
            return 'TX'
        tstage = 'T' + seer.lower()
        return tstage if tstage in TSTAGE_AJCC else TSTAGE_SEER_AJCC_MAP[seer]
    # 2018+
    if isinstance(eod, str) and eod not in NA_STAGE_EOD:
        return eod if eod in TSTAGE_AJCC else TSTAGE_EOD_AJCC_MAP[eod] 

    return None 

def format_grade(row: pd.Series) -> str|float:
    """
    Grade is a measure of the aggressiveness of the tumor. 
    
    'Blank(s)'
    'Unknown'

    General Mapping ---
    NAACCR  SEER                                                                STD
    1,A     Well differentiated; Grade I                                        G1
    2,B     Moderately differentiated; Grade II                                 G2
    3,C     Poorly differentiated; Grade III                                    G3
    4,D     Undifferentiated; anaplastic; Grade IV                              G4
    9,9     Unknown                                                             None
    5       T-cell                                                              T_cell
    6       B-cell; pre-B; B-precursor                                          B_cell    
    7       Null cell; non T-non B                                              Null_cell
    8       NK cell; natural killer cell (1995+)                                NK_cell                
    L       low grade               Well differentiated; Grade I                G2
    M       intermediate grade      Moderately differentiated; Grade II         G3
    H       high grade              Undifferentiated; anaplastic; Grade IV      G4
    E       â€œGleason score 7"       Moderately differentiated; Grade II         G2
    S       sarcomatous overgrowth  Undifferentiated; anaplastic; Grade IV      G4
    
    """
    site = row['Site recode ICD-O-3/WHO 2008']
    grade_seer = row['Grade Recode (thru 2017)']
    grade_naaccr_clin = row['Grade Clinical (2018+)']
    grade_naaccr_path = row['Grade Pathological (2018+)']

    # NAACCR (path -> clin priority)
    if isinstance(grade_naaccr_path, str):
        return _cast_naaccrGrade_to_std(grade_naaccr_path, site)
    if isinstance(grade_naaccr_clin, str):
        return _cast_naaccrGrade_to_std(grade_naaccr_clin, site)
    # seer
    if isinstance(grade_seer, str):
        return GRADE_SEER_STD_MAP[grade_seer]
    return np.nan

def _cast_naaccrGrade_to_std(raw: str, site: str) -> str|float:
    # No G4 Group (no action)
    # Merged G3/G4 Group (no action)
    match raw:
        case 'S':
            return 'G4'
        case '8'|'9':
            return np.nan
        case 'M':
            if site == 'Adrenal Gland':
                return 'G4'
            elif site == 'Breast':
                return 'G3'
            else:
                raise RuntimeError
        case 'B':
            return 'G2'
        case 'E':
            assert site == 'Prostate'
            return 'G2'
        case _:
            return GRADE_NAACCR_STD_MAP[raw]

def format_gleason(row: pd.Series) -> float:
    """
    Extract the numeric Gleason score of one record.

    The pathological recode is preferred over the clinical one. A value counts
    only if it is a non-empty string that is not one of the "not available"
    labels; the score is the last space-separated token, converted to float.
    Returns NaN when neither column is usable.
    """
    unavailable = (
        'No needle core biopsy/TURP performed',
        'Not documented; Not assessed or unknown if assessed',
        'No prostatectomy done',
        'Not applicable: Information not collected for this case',
    )

    def _usable(label: Any) -> Any:
        # None for non-strings and for the "not available" labels
        if not isinstance(label, str) or label in unavailable:
            return None
        return label

    clinical = _usable(row['Gleason Score Clinical Recode (2010+)'])
    pathological = _usable(row['Gleason Score Pathological Recode (2010+)'])

    for label in (pathological, clinical):
        if label:
            return float(label.strip().split(' ')[-1])
    return np.nan



In [3]:
# single field formatters
def format_cancer_group(cancer_type: str) -> str:
    """Return the cancer group of `cancer_type` (KeyError if it is not in CANCERTYPE_CANCERGROUP_MAP)."""
    group = CANCERTYPE_CANCERGROUP_MAP[cancer_type]
    return group

def format_primary_group(primary_type: str) -> str:
    """
    Return the cancer group(s) of a primary-site label.

    A label that is itself a key of CANCERTYPE_CANCERGROUP_MAP maps to its group.
    Otherwise the label must be a '|'-separated list of such keys (as produced by
    format_primary_type); the distinct groups of its members are joined with ' | '.

    The groups are emitted in sorted order so the result is deterministic: joining
    a set of strings directly depends on the per-process string hash seed and can
    differ between kernel sessions.

    Raises
        AssertionError if the label is neither a known key nor '|'-separated;
        KeyError if a member of the list is not a known key.
    """
    if primary_type in CANCERTYPE_CANCERGROUP_MAP:
        return CANCERTYPE_CANCERGROUP_MAP[primary_type]

    assert '|' in primary_type
    groups = {
        CANCERTYPE_CANCERGROUP_MAP[site.strip()]
        for site in primary_type.split('|')
    }
    return ' | '.join(sorted(groups))

def format_psa(text: str) -> float:
    """
    Convert a PSA value to float (ng/ml).

    The two censored labels map to their bounds (0.1 and 98.0); any other value
    is expected to be a number in between and is passed to float().
    """
    censored = {
        '0.1 or less nanograms/milliliter (ng/ml)': 0.1,   # min val
        '98.0 ng/ml or greater': 98.0,                     # max val
    }
    return float(censored.get(text, text))
    
def format_regional_nodes_category(num_positive: int) -> str:
    """
    Bucket the "regional nodes positive" code into a coarse category.

    Special codes, for reference:

    examined
    95  No regional nodes were removed, but aspiration of regional nodes was performed
    96  Regional lymph node removal was documented as a sampling, and the number of nodes is unknown/not stated
    97  Regional lymph node removal was documented as a dissection, and the number of nodes is unknown/not stated
    98  Regional lymph nodes were surgically removed, but the number of lymph nodes is unknown/not stated and not documented as a sampling or dissection; nodes were examined, but the number is unknown
    99  It is unknown whether nodes were examined; not stated in patient record

    positive
    95  Positive aspiration or core biopsy of lymph node(s)
    97  Positive nodes - number unspecified
    98  No nodes examined
    99  Unknown if nodes are positive; not applicable
    Not documented in patient record

    Returns
        'NEG' for 0, 'POS_ASPIRATION' for 95, 'NA' for 98 or 99,
        'POS_NODES' for every other value.
    """
    if num_positive in (98, 99):
        return 'NA'
    if num_positive == 95:
        return 'POS_ASPIRATION'
    if num_positive == 0:
        return 'NEG'
    return 'POS_NODES'

def format_str_to_bool(text: str) -> bool:
    """Map 'yes' / 'no' (any letter case) to True / False; any other text raises KeyError."""
    answer = text.lower()
    if answer == 'yes':
        return True
    if answer == 'no':
        return False
    raise KeyError(answer)

def format_distant_ln_met(text: str) -> bool:
    """
    Convert the distant-lymph-node metastasis label to a boolean.

    Matching ignores letter case; an unrecognised label raises KeyError.
    """
    label = text.lower()
    if label == 'yes; distant lymph node metastases':
        return True
    if label == 'none; no lymph node metastases':
        return False
    raise KeyError(label)

def format_other_met(text: str) -> bool:
    """
    Convert the "other site" metastasis label to a boolean.

    Matching ignores letter case; an unrecognised label raises KeyError.
    """
    present = (
        'yes; distant mets in known site(s) other than bone, brain, liver, lung, dist ln',
        'generalized metastases such as carinomatosis',
    )
    label = text.lower()
    if label in present:
        return True
    if label == 'none; no other metastases':
        return False
    raise KeyError(label)

def format_hist_group(text: str) -> str:
    """
    Normalise a histology-group label.

    Only the text after the last ':' is kept (whitespace trimmed); a few closely
    related groups are then merged into one name. Other labels pass through.
    """
    merged = {
        'epithelial neoplasms, NOS': 'epithelial neoplasms',
        'complex epithelial neoplasms': 'epithelial neoplasms',
        'soft tissue tumors and sarcomas, NOS': 'soft tissue tumors and sarcomas',
    }
    label = text.rsplit(':', 1)[-1].strip()
    return merged[label] if label in merged else label

# Build HLUT, the histology-code -> description lookup used by format_hist_type.
# The file at HISTTYPES_PATH is read as a headerless, tab-separated table; its two
# columns are named 'code' and 'description' here.
hframe = pd.read_csv(HISTTYPES_PATH, sep='\t', header=None)
hframe.columns = ['code', 'description']
# Keys are strings because format_hist_type stringifies its input before the lookup.
hframe['code'] = hframe['code'].astype(str)
# A description listing several '|'-separated alternatives is shortened to the first
# one, with a " [+ others]" marker appended.
hframe['description'] = hframe['description'].apply(lambda x: f"{x.split('|')[0]} [+ others]" if '|' in x else x)
HLUT = hframe.set_index('code')['description'].to_dict()

def format_hist_type(val: Any) -> str|float:
    if isinstance(val, int):
        val = str(val)
        if val not in HLUT:
            print(val)
            raise NotImplementedError
        return HLUT.get(val, val)
    return np.nan

def format_afp_post_orchiectomy(val: str) -> float:
    """
    Convert the post-orchiectomy AFP label to a number (ng/ml).

    Fixed labels map to 0, 10000 or NaN. A range label of the form
    '<low> - <high> ng/ml' maps to the floored integer midpoint of the range.

    Raises
        AssertionError if the label is neither a fixed label nor a range.
    """
    fixed = {
        '0 nanograms/milliliter (ng/ml)': 0.0,
        'Greater than or equal to 10, 000 ng/ml': 10000.0,
        'Not applicable; Information not collected for this case': np.nan,
        'Unknown or no information; Test not done': np.nan,
    }
    if val in fixed:
        return fixed[val]
    bounds = re.match(r'(\d+) - (\d+) ng/ml', val)
    assert bounds is not None
    lower, upper = (int(digits) for digits in bounds.groups())
    # floor((lower + upper) / 2) equals lower + (upper - lower) // 2 for integers
    return float((lower + upper) // 2)

def format_afp_pretreat_category(val: str) -> str|float:
    lut = {
        'Negative/normal; within normal limits': 'normal',
        'Positive/elevated': 'elevated',
        'Not applicable: Information not collected for this case': np.nan,
        'Borderline; undetermined if positive or negative': np.nan,
        'Not documented; Not assessed or unknown if assessed': np.nan,
    }
    return lut[val]

def format_b_symptoms(val: str) -> bool|float:
    lut = {
        'Any B symptom(s)-Night sweats, fever, weight loss, NOS; Phys classified as B': True,
        'No B symptoms (asymptomatic); Classified as A by physician when asymptomatic': False,
        'Not documented in medical record; B symptoms not assessed or unknown if assessed': np.nan,
    }
    return lut[val]
    
def format_breslow_thick(val: str) -> float:
    """
    Convert the Breslow thickness label to a number (millimetres).

    The capped label maps to 9.8, the "no tumour" and "0.0 < x <= 0.1" labels map
    to 0.0, and the unknown / in situ label maps to NaN. Any other value is
    expected to be numeric text and is passed to float().
    """
    # NOTE(review): 'Greater than 0.0 and less than or equal to 0.1' collapses to 0.0,
    # the same value as 'No mass/tumor found' — confirm that this is intended.
    coded = {
        '9.8 millimeters or larger': 9.8,
        'Greater than 0.0 and less than or equal to 0.1': 0.0,
        'No mass/tumor found': 0.0,
        'Unknown/Indeterminate/Not assessed; In situ; Microinvasion/microscopic foci only': np.nan,
    }
    return coded[val] if val in coded else float(val)

def format_ovarian_CA125(val: str) -> str|float:
    lut = {
        'Negative/normal; within normal limits': 'normal',
        'Not documented; CA-125 not assessed or unknown if assessed': np.nan,
        'Positive/elevated': 'elevated',
        'Stated as borderline; undetermined whether positive or negative': 'elevated',
    }
    return lut[val]

def format_CEA(val: str) -> str|float:
    lut = {
        'Borderline': 'normal',
        'CEA negative/normal; within normal limits': 'normal',
        'CEA positive/elevated': 'elevated',
        'Not documented; Interpretation not assessed or unknown if assessed': np.nan,
    }
    return lut[val]

def format_chr19q_loh(val: str) -> bool|float:
    lut = {
        'Chromosome 19q deletion/LOH not identified/not present': False,
        'Chromosome 19q deletion/LOH present': True,
        'Benign or borderline tumor': np.nan,
        'Not applicable: Information not collected for this case': np.nan,
        'Not documented; Cannot be determined; Not assessed or unknown if assessed': np.nan,
    }
    return lut[val]

def format_chr1p_loh(val: str) -> bool|float:
    lut = {
        'Chromosome 1p deletion/LOH not identified/not present': False,
        'Chromosome 1p deletion/LOH identified/present': True,
        'Chromosome 1p deletion/LOH present': True,
        'Benign or borderline tumor': np.nan,
        'Not applicable: Information not collected for this case': np.nan,
        'Not documented; Cannot be determined; Not assessed or unknown if assessed': np.nan,
    }
    return lut[val]

def format_fibrosis_score(val: str) -> str|float:
    lut = {
        'Ishak 0-4; No to moderate fibrosis; METAVIR F0-F3; Batt-Ludwig 0-3': 'Ishak 0-4;',
        'Ishak 5-6; Advanced/severe fibrosis; METAVIR F4; Batt-Ludwig 4; Cirrhosis': 'Ishak 5-6',
        'Not applicable: Information not collected for this case': np.nan,
        'Unknown; MR statement w/o hist conf; Uncategorized': np.nan,
    }
    return lut[val]

def format_capsule_invasion(val: str) -> bool|float:
    lut = {
        'Combination of perinephric fat/tissue and Renal Sinus/Gerotas fascia': True, 
        'Invasion beyond capsule not identified': False,
        'Invasion beyond capsule, NOS': True,
        'Not documented; Not assessed/unknown if assessed; No resection of primary site': np.nan,
        'Perinephric (beyond renal capsule) fat or tissue': True,
        'Renal Sinus; Gerotas fascia': True,
    }
    return lut[val]

def format_adrenal_involvement(val: str) -> bool|float:
    lut = {
        'Adrenal gland involvement by direct involvement (contiguous involvement)': True,
        'Adrenal gland involvement by separate nodule (noncontiguous involvement)': True,
        'Ipsilat. adrenal gland involvement, unk if direct involvement or separate nodule': True,
        'Ipsilateral adrenal gland involvement not present/not identified': False,
        'Not documented/assessed; No resection of primary and/or ipsilat. adrenal gland': np.nan,
        'Combination of code 1-2': True,
    }
    return lut[val]

def format_hGC_post_orchiectomy_elevation(val: str) -> str|float:
    lut = {
        '5,000-50,000 mIU/mL': 'medium',
        'Above normal and less than 5,000 milli-International Units/milliliter (mIU/mL)': 'low',
        'Greater than 50,000 mIU/mL': 'high',
        'Not documented; Not performed; Not assessed or unknown if assessed': np.nan,
        'Post-orchiectomy human chorionic gonadotropin (hCG) stated to be elevated': 'low',
        'Within normal limits': 'normal',
    }
    return lut[val]

def format_LDH_post_orchiectomy_elevation(val: str) -> str|float:
    lut = {
        'Greater than 10 x N (Greater than 10 times the upper limit of normal for LDH)': 'high',
        '1.5 to 10 x N (Between 1.5 and 10 times the upper limit of normal for LDH)': 'high',
        'Less than 1.5 x N (Less than 1.5 times the upper limit of normal for LDH)': 'low',
        'Not documented; No orchiectomy performed; Not assessed or unknown if assessed': np.nan,
        'Post-Orchiectomy lactate dehydrogenase (LDH) range stated to be elevated': 'low',
        'Within normal limits': 'normal',
    }
    return lut[val]

def format_LDH_pretreatment(val: str) -> str|float:
    lut = {
        'Above normal LDH level; High': 'elevated',
        'Normal LDH level; Low, below normal': 'normal',
        'Not documented Not assessed or unknown if assessed': np.nan,
    }
    return lut[val]

def format_major_vein_involvement(val: str) -> bool|float:
    lut = {
        'Both renal vein and IVC involvement; Major vein invasion, NOS': True,
        'Inferior vena cava (IVC)': True,
        'Renal vein or its segmental branches': True,
        'Major vein involvement not present/not identified': False,
        'Not documented; Not assessed or unknown if assessed; No resection of primary': np.nan,
    }
    return lut[val]

def format_mitotic_rate_melanoma(val: str) -> float:
    """
    Convert the melanoma mitotic-rate label to a number (mitoses per square mm).

    Descriptive labels map to 0, 1, 11 or NaN as listed below; any other value is
    expected to be an integer count and is converted with float(int(val)).
    """
    coded = {
        '0 mitoses per square mm; Mitoses absent; No mitoses present': 0.0,
        'Stated as "less than 1 mitosis/square mm"; Stated as "nonmitogenic"': 0.0,
        'Stated as "at least 1 mitosis/square mm"; Stated as "mitogenic"': 1.0,
        '11 or more mitoses/square mm': 11.0,
        'Mitotic rate described with denominator other than square millimeter (mm)': np.nan,
        'Not documented; Not assessed or unknown if assessed': np.nan,
    }
    return coded[val] if val in coded else float(int(val))

def format_perineural_invasion(val: str) -> bool|float:
    lut = {
        'Not documented/assessed; No mention in path report; Pathologist cant determine': np.nan,
        'Perineural invasion identified/present': True,
        'Perineural invasion not identified/not present': False,
    }
    return lut[val]
    
def format_peripheral_blood_involvement(val: str) -> str|float:
    lut = {
        'B0; No sig. involvement; 0-5%  lymphocytes are atypical/Sezary; Clone unknown': 'no',
        'B0a; No sig. involvement; 0-5%  lymphocytes are atypical/Sezary; Clone negative': 'no',
        'B1; Low burden; >5% lymphocytes are atypical/Sezary, but not B2; Clone unknown': 'low',
        'B2; High burden; 1000+ Sezary cells per microliter (uL); Clone positive': 'high',
        'Not documented; Not assessed or unknown if assessed': np.nan,
        'B1a; Low burden; >5% lymphocytes are atypical/Sezary, but not B2; Clone negative': 'low',
        'B1b; Low burden; >5% lymphocytes are atypical/Sezary, but not B2; Clone positive': 'low',
        'B0b; No sig. involvement; 0-5%  lymphocytes are atypical/Sezary; Clone positive': 'no',
    }
    return lut[val]

def format_peritoneal_cytology(val: str) -> str|float:
    lut = {
        'Not documented; Not assessed or unknown if assessed': np.nan,
        'Peritoneal cytology/washing atypical and/or suspicious': 'suspicious',
        'Peritoneal cytology/washing malignant (positive for malignancy)': 'malignant',
        'Peritoneal cytology/washing negative for malignancy': 'negative',
    }
    return lut[val]

def format_pleural_effusion(val: str) -> bool|float:
    lut = {
        'Not documented; Not assessed or unknown if assessed': np.nan,
        'Pleural effusion not identified/not present': False,
        'Pleural effusion present, malignant (positive)': True,
        'Pleural effusion present, non-malignant (negative)': True,
        'Pleural effusion, NOS': True,
    }
    return lut[val]

def format_pleural_invasion(val: str) -> str|float:
    lut = {
        'Not documented; No resection of primary; Not assessed or unknown if assessed': np.nan,
        'PL0; No evidence; Tumor does not completely traverse the elastic layer of pleura': 'PL0',
        'PL1 or PL2; Invasion of visceral pleura present, NOS': 'PL1/PL2',
        'PL3; Tumor invades into or through the parietal pleura OR chest wall': 'PL3',
        'Tumor extends to pleura, NOS; not stated if visceral or parietal': 'PL1/PL2',
    }
    return lut[val]

def format_ulceration(val: str) -> bool|float:
    lut = {
        'Not documented/assessed; No mention in path report; Pathologist cant determine': np.nan,
        'Ulceration not identified/not present': False,
        'Ulceration present': True,
    }
    return lut[val]
    
def format_tumor_deposits(val: str) -> float:
    """
    Converts a tumor deposits recode value into a numeric deposit count.

    - 'No tumor deposits' -> 0.0 (a confirmed absence, not a missing value)
    - '81 or more Tumor Deposits' -> 81.0 (top-coded: the true count may be higher)
    - 'not documented' / 'number unknown' labels -> NaN
    - anything else is parsed as an integer count and returned as a float

    Raises ValueError if a value outside the table is not an integer literal.
    """
    lut = {
        '81 or more Tumor Deposits': 81.0,
        # An explicit "none found" is a real measurement of zero; mapping it to
        # NaN would merge it with the not-assessed / unknown-number cases below.
        'No tumor deposits': 0.0,
        'Not documented/assessed; Indeterminate; No mention in path report; No resection': np.nan,
        'Tumor Deposits identified, number unknown': np.nan,
    }
    if val in lut:
        return lut[val]
    # remaining values are expected to be plain counts
    return float(int(val))

# def format_systemic_therapy_presurgery(val: str) -> bool|float:
#     lut = {
#         'Intraop systemic rx & oth systemic rx before/after surg': True,
#         'Intraoperative systemic therapy': False,
#         'No systemic therapy and/or surgical procedures': False,
#         'Sequence unknown': np.nan,
#         'Surgery both before and after systemic therapy': False,
#         'Systemic therapy after surgery': False,
#         'Systemic therapy before surgery': True,
#         'Systemic therapy both before and after surgery': True,
#     }
#     return lut[val]

# def format_systemic_therapy_postsurgery(val: Any) -> bool|float:
#     lut = {
#         'Intraop systemic rx & oth systemic rx before/after surg': True,
#         'Intraoperative systemic therapy': True,
#         'No systemic therapy and/or surgical procedures': False,
#         'Sequence unknown': np.nan,
#         'Surgery both before and after systemic therapy': True,
#         'Systemic therapy after surgery': True,
#         'Systemic therapy before surgery': False,
#         'Systemic therapy both before and after surgery': True,
#     }
#     if not isinstance(val, str):
#         return np.nan
#     return lut[val]
    

In [4]:
# Column names live in the .dic file: drop blank lines, skip everything up to
# and including the '[Variables]' header, then take every second remaining
# line and keep the text after its last '='.
# NOTE(review): presumably the lines in between describe each variable - confirm.
with open(INFILE_DICT, 'r') as dict_file:
    dict_lines = [raw.strip() for raw in dict_file]
dict_lines = [entry for entry in dict_lines if entry != '']
variables_start = dict_lines.index('[Variables]') + 1
fields = [entry.split('=')[-1] for entry in dict_lines[variables_start::2]]

# Labels that should be read as missing values in every column.
na_values = [
    '.',
    'Unknown',
    'Blank(s)',
    'Not documented; not assessed; unknown',
    'Test ordered, results not in chart',
    'Recode not available',
    'Not applicable',
    'UNK Stage',
    'NA',
]

# The data file has no header row; names are attached from the dictionary.
table = pd.read_csv(
    INFILE_DATA,
    sep='\t',
    header=None,
    na_values=na_values,
)
table.columns = fields

# quick look at the raw PSA values
table['PSA Lab Value Recode (2010+)'].value_counts()

  table = pd.read_csv(INFILE_DATA, sep='\t', header=None, na_values=na_values)


PSA Lab Value Recode (2010+)
98.0 ng/ml or greater    26085
5.0                       8202
5.2                       7272
5.1                       7184
4.5                       7038
                         ...  
89.1                         9
81.9                         9
76.9                         8
85.8                         7
75.3                         6
Name: count, Length: 980, dtype: int64

In [None]:

# formatting
# Each entry is (source column | None, formatter, new column).
# - source column given: the formatter receives each non-missing value of it.
# - source column None:  the formatter receives the whole row (multi field).
# Order matters: 'primary_group' is derived from 'primary_type', which an
# earlier entry creates.
# Every entry must stay enabled: 'any_met' below and the column selection in
# the next cell both need the full set of derived columns, and raise KeyError
# if any of them is missing.
formatters = [
    ('Site recode ICD-O-3/WHO 2008', format_cancer_group, 'cancer_group'),
    (None, format_primary_type, 'primary_type'),
    ('primary_type', format_primary_group, 'primary_group'),
    ('Histologic Type ICD-O-3', format_hist_type, 'hist_type'),
    ('Histology recode - broad groupings', format_hist_group, 'hist_group'),
    (None, format_tstage_ajcc, 'TSTAGE_STD'),
    (None, format_nstage_ajcc, 'NSTAGE_STD'),
    (None, format_gstage_ajcc, 'GSTAGE_STD'),
    (None, format_grade, 'GRADE_STD'),
    ('SEER Combined Mets at DX-brain (2010+)', format_str_to_bool, 'brain_met'),
    ('SEER Combined Mets at DX-bone (2010+)', format_str_to_bool, 'bone_met'),
    ('SEER Combined Mets at DX-liver (2010+)', format_str_to_bool, 'liver_met'),
    ('SEER Combined Mets at DX-lung (2010+)', format_str_to_bool, 'lung_met'),
    ('Mets at DX-Distant LN (2016+)', format_distant_ln_met, 'distant_ln_met'),
    ('Mets at DX-Other (2016+)', format_other_met, 'other_met'),
    ('Regional nodes positive (1988+)', format_regional_nodes_category, 'regional_nodes'),
    ('PSA Lab Value Recode (2010+)', format_psa, 'PSA'),
    ('AFP Post-Orchiectomy Lab Value Recode (2010+)', format_afp_post_orchiectomy, 'AFP_post_orchiectomy'),
    ('AFP Pretreatment Interpretation Recode (2010+)', format_afp_pretreat_category, 'AFP_pretreat_category'),
    ('B Symptoms Recode (2010+)', format_b_symptoms, 'B_symptoms'),
    ('Breslow Thickness Recode (2010+)', format_breslow_thick, 'breslow_thick'),
    ('CA-125 Pretreatment Interpretation Recode (2010+)', format_ovarian_CA125, 'ovarian_CA125'),
    ('CEA Pretreatment Interpretation Recode (2010+)', format_CEA, 'CEA_pretreat'),
    ('Chromosome 19q: Loss of Heterozygosity (LOH) Recode (2010+)', format_chr19q_loh, 'chr19q_loh'),
    ('Chromosome 1p: Loss of Heterozygosity (LOH) Recode (2010+)', format_chr1p_loh, 'chr1p_loh'),
    ('Fibrosis Score Recode (2010+)', format_fibrosis_score, 'fibrosis_score'),
    (None, format_gleason, 'gleason'),
    ('Invasion Beyond Capsule Recode (2010+)', format_capsule_invasion, 'capsule_invasion'),
    ('Ipsilateral Adrenal Gland Involvement Recode (2010+)', format_adrenal_involvement, 'adrenal_involvement'),
    ('hCG Post-Orchiectomy Range Recode (2010+)', format_hGC_post_orchiectomy_elevation, 'hGC_post_orchiectomy_elevation'),
    ('LDH Post-Orchiectomy Range Recode (2010+)', format_LDH_post_orchiectomy_elevation, 'LDH_post_orchiectomy_elevation'),
    ('LDH Pretreatment Level Recode (2010+)', format_LDH_pretreatment, 'LDH_pretreatment'),
    ('Major Vein Involvement Recode (2010+)', format_major_vein_involvement, 'major_vein_involvement'),
    ('Mitotic Rate Melanoma Recode (2010+)', format_mitotic_rate_melanoma, 'mitotic_rate_melanoma'),
    ('Perineural Invasion Recode (2010+)', format_perineural_invasion, 'perineural_invasion'),
    ('Peripheral Blood Involvement Recode (2010+)', format_peripheral_blood_involvement, 'peripheral_blood_involvement'),
    ('Peritoneal Cytology Recode (2010+)', format_peritoneal_cytology, 'peritoneal_cytology'),
    ('Pleural Effusion Recode (2010+)', format_pleural_effusion, 'pleural_effusion'),
    ('Visceral and Parietal Pleural Invasion Recode (2010+)', format_pleural_invasion, 'pleural_invasion'),
    ('Ulceration Recode (2010+)', format_ulceration, 'ulceration'),
    ('Tumor Deposits Recode (2010+)', format_tumor_deposits, 'tumor_deposits'),
]
for field, func, newfield in formatters:
    print(newfield)  # progress marker: one line per derived column
    # single field
    if field is not None:
        # only non-missing values reach the formatter; other rows stay NaN
        mask = table[field].notna()
        table.loc[mask, newfield] = table.loc[mask, field].apply(func)
    # multi field
    else:
        table[newfield] = table.apply(func, axis=1)

# True when at least one of the site-specific metastasis flags is set.
# NOTE(review): rows where every flag is missing follow DataFrame.any's
# default NaN handling - confirm that is the intended meaning of 'any_met'.
table['any_met'] = table[['brain_met', 'bone_met', 'lung_met', 'liver_met', 'distant_ln_met', 'other_met']].any(axis=1)


cancer_group
primary_type
primary_group
hist_type
hist_group
TSTAGE_STD
NSTAGE_STD
GSTAGE_STD
GRADE_STD
brain_met
bone_met
liver_met
lung_met
distant_ln_met
other_met
regional_nodes
PSA
AFP_post_orchiectomy
AFP_pretreat_category
B_symptoms
breslow_thick
ovarian_CA125
CEA_pretreat
chr19q_loh
chr1p_loh
fibrosis_score
gleason
capsule_invasion
adrenal_involvement
hGC_post_orchiectomy_elevation
LDH_post_orchiectomy_elevation
LDH_pretreatment
major_vein_involvement
mitotic_rate_melanoma
perineural_invasion
peripheral_blood_involvement
peritoneal_cytology
pleural_effusion
pleural_invasion
ulceration
tumor_deposits


In [None]:

# Raw SEER column name -> short name used from here on.
column_renames = {
    'Patient ID': 'patient_id',
    'Record number recode': 'record_number',
    'Sex': 'sex',
    'Age recode with <1 year olds and 90+': 'age',
    'Year of diagnosis': 'diagnosis_year',
    'Year of death recode': 'death_year',
    'Site recode ICD-O-3/WHO 2008': 'cancer_type',
    'Behavior code ICD-O-3': 'behavior',
    'Months from diagnosis to treatment': 'diagnosis_to_treatment_months',
    'Survival months': 'survival_months',
    'COD to site recode': 'COD',
    'Breast Subtype (2010+)': 'HER2_type',
    'Derived HER2 Recode (2010+)': 'HER2_status',
}

# Final columns, grouped by theme; the output keeps exactly this order.
patient_info_columns = [
    'patient_id',
    'record_number',
    'behavior',
    'sex',
    'age',
    'diagnosis_year',
    'death_year',
    'survival_months',
    'COD',
]
general_cancer_columns = [
    'cancer_type',
    'cancer_group',
    'primary_type',
    'primary_group',
    'hist_type',
    'hist_group',
    'TSTAGE_STD',
    'NSTAGE_STD',
    'GSTAGE_STD',
    'GRADE_STD',
    'regional_nodes',
]
type_specific_columns = [
    'chr19q_loh',
    'chr1p_loh',
    'B_symptoms',
    'PSA',
    'gleason',
    'AFP_post_orchiectomy',
    'AFP_pretreat_category',
    'hGC_post_orchiectomy_elevation',
    'LDH_post_orchiectomy_elevation',
    'LDH_pretreatment',
    'HER2_type',
    'HER2_status',
    'breslow_thick',
    'ovarian_CA125',
    'CEA_pretreat',
    'fibrosis_score',
    'adrenal_involvement',
    'major_vein_involvement',
    'mitotic_rate_melanoma',
    'capsule_invasion',
    'perineural_invasion',
    'peripheral_blood_involvement',
    'peritoneal_cytology',
    'pleural_effusion',
    'pleural_invasion',
    'ulceration',
    'tumor_deposits',
]
metastasis_columns = [
    'brain_met',
    'bone_met',
    'liver_met',
    'lung_met',
    'distant_ln_met',
    'other_met',
    'any_met',
]

table = table.rename(columns=column_renames)

# selecting & sorting final columns
table = table[
    patient_info_columns
    + general_cancer_columns
    + type_specific_columns
    + metastasis_columns
]


# currently ignoring:
# diagnosis_to_treatment_months
# Gestational Trophoblastic Prognostic Scoring Index Recode (2010+)
# RX Summ--Systemic/Sur Seq (2007+)
# RX Summ--Surg Prim Site (1998+)
# RX Summ--Scope Reg LN Sur (2003+)
# RX Summ--Surg Oth Reg/Dis (2003+)
# RX Summ--Surg/Rad Seq
# Chemotherapy recode (yes, no/unk)
# Radiation recode

In [6]:
from util_funcs import (
    do_basic_filtering,
    format_cancer_subtypes,
    remove_identical_primary_secondary_cases,
)

# Run the cleaning steps in order, each one consuming the previous result,
# then write the final frame as a tab-separated file without the index.
df = table
for cleaning_step in (
    do_basic_filtering,
    remove_identical_primary_secondary_cases,
    format_cancer_subtypes,
):
    df = cleaning_step(df)

df.to_csv(OUTFILE, sep='\t', index=False)

Beginning
- 4733269 patients, 5604583 records.

Filtered records not in range (2010, 2020)
- 4733269 patients, 5233074 records.

Removed patients with multiple records of same cancer_type.
- 4733269 patients, 5083913 records.

Removed MET records where primary tissue is identical to secondary tissue.
- 4669817 patients, 5014112 records.


In [7]:
# print(table['behavior'].value_counts())

In [8]:
# print(table['brain_met'].value_counts(dropna=False).sort_index())
# print(table['bone_met'].value_counts(dropna=False).sort_index())
# print(table['liver_met'].value_counts(dropna=False).sort_index())
# print(table['lung_met'].value_counts(dropna=False).sort_index())
# print(table['distant_ln_met'].value_counts(dropna=False).sort_index())
# print(table['other_met'].value_counts(dropna=False).sort_index())

In [9]:
# import matplotlib.pyplot as plt
# ax = sns.histplot(table, x='cancer_group')
# ax.tick_params(axis='x', rotation=90)
# plt.show()
# ax = sns.histplot(table, x='cancer_type')
# ax.tick_params(axis='x', rotation=90)
# plt.show()
# ax = sns.histplot(table, x='PSA')
# ax.tick_params(axis='x', rotation=90)
# plt.show()
# print()
# print(table['TSTAGE_STD'].value_counts())
# print()
# print(table['NSTAGE_STD'].value_counts())
# print()
# print(table['GSTAGE_STD'].value_counts())
# print()
# print(table['GRADE_STD'].value_counts())