In [426]:
# Import Libraries

import pandas as pd
import numpy as np

In [453]:
# Load the lung cancer case listing (2018-2021) exported from the
# SEER National Cancer Database. The path is relative, so the CSV must sit in
# the notebook's working directory.

df = pd.read_csv(filepath_or_buffer='Lung_Cancer_D0.csv')
df

Unnamed: 0,Age recode with <1 year olds,Primary Site - labeled,Histologic Type ICD-O-3,EOD Primary Tumor (2018+),EOD Regional Nodes (2018+),EOD Mets (2018+),Laterality,RX Summ--Scope Reg LN Sur (2003+),RX Summ--Surg/Rad Seq,Reason no cancer-directed surgery,Radiation recode,"Chemotherapy recode (yes, no/unk)",RX Summ--Systemic/Sur Seq (2007+),Time from diagnosis to treatment in days recode,SEER cause-specific death classification
0,65-69 years,"C34.1-Upper lobe, lung",8140,450,300,0,Right - origin of primary,4 or more regional lymph nodes removed,No radiation and/or no surgery; unknown if sur...,Surgery performed,None/Unknown,Yes,Systemic therapy after surgery,058,Alive or dead of other cause
1,65-69 years,"C34.1-Upper lobe, lung",8140,300,0,0,Right - origin of primary,,No radiation and/or no surgery; unknown if sur...,Not recommended,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,045,Alive or dead of other cause
2,85+ years,"C34.3-Lower lobe, lung",8140,300,0,0,Left - origin of primary,4 or more regional lymph nodes removed,No radiation and/or no surgery; unknown if sur...,Surgery performed,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,023,Alive or dead of other cause
3,55-59 years,"C34.1-Upper lobe, lung",8070,400,0,0,Right - origin of primary,"Biopsy or aspiration of regional lymph node, NOS",Radiation after surgery,Not recommended,Beam radiation,No/Unknown,No systemic therapy and/or surgical procedures,148,Alive or dead of other cause
4,65-69 years,"C34.9-Lung, NOS",8144,999,999,30,"Paired site, but no information concerning lat...",,No radiation and/or no surgery; unknown if sur...,Not recommended,None/Unknown,Yes,No systemic therapy and/or surgical procedures,057,Dead (attributable to this cancer dx)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193214,65-69 years,"C34.9-Lung, NOS",8000,999,999,99,"Paired site, but no information concerning lat...",Unknown or not applicable,No radiation and/or no surgery; unknown if sur...,Unknown; death certificate; or autopsy only (2...,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,Unable to calculate,Dead (attributable to this cancer dx)
193215,80-84 years,"C34.9-Lung, NOS",8000,999,999,99,"Paired site, but no information concerning lat...",Unknown or not applicable,No radiation and/or no surgery; unknown if sur...,Unknown; death certificate; or autopsy only (2...,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,Unable to calculate,Dead (attributable to this cancer dx)
193216,85+ years,"C34.9-Lung, NOS",8000,999,999,99,"Paired site, but no information concerning lat...",Unknown or not applicable,No radiation and/or no surgery; unknown if sur...,Unknown; death certificate; or autopsy only (2...,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,Unable to calculate,Dead (attributable to this cancer dx)
193217,65-69 years,"C34.9-Lung, NOS",8000,999,999,99,"Paired site, but no information concerning lat...",Unknown or not applicable,No radiation and/or no surgery; unknown if sur...,Unknown; death certificate; or autopsy only (2...,None/Unknown,No/Unknown,No systemic therapy and/or surgical procedures,Unable to calculate,Dead (attributable to this cancer dx)


In [454]:
# Rename Columns into a Readable Format
#
# Keys are the raw SEER headers exactly as they appear in the CSV; values are
# the short names that every later cell uses to address the columns.

readable_column_names = {
    'Age recode with <1 year olds': 'Age',
    'Primary Site - labeled': 'Tumor Location',
    'Histologic Type ICD-O-3': 'Cancer Cell Type',
    'EOD Primary Tumor (2018+)': 'Tumor Extent at Diagnosis',
    'EOD Regional Nodes (2018+)': 'Regional Lymph Node Involvement',
    'EOD Mets (2018+)': 'Metastatic Spread',
    'Laterality': 'Tumor Laterality',
    'RX Summ--Scope Reg LN Sur (2003+)': 'Extent of Regional Lymph Node Surgery',
    'RX Summ--Surg/Rad Seq': 'Surgery/Radiation Sequence',
    'Reason no cancer-directed surgery': 'Surgery Recommended',
    'Radiation recode': 'Radiation Therapy',
    'Chemotherapy recode (yes, no/unk)': 'Chemotherapy',
    'RX Summ--Systemic/Sur Seq (2007+)': 'Surgery/Systemic Therapy Sequence',
    'Time from diagnosis to treatment in days recode': 'Days from Diagnosis to Treatment',
    'SEER cause-specific death classification': 'Cause of Death',
}

df = df.rename(columns=readable_column_names)

In [455]:
# 1. Combine Age Groups into broader buckets (00-49 years and 50-79 years)

# SEER age-recode labels that are folded into the '00-49 years' bucket
young_adult_ages = [
    '00 years', '01-04 years', '05-09 years', '10-14 years', '15-19 years',
    '20-24 years', '25-29 years', '30-34 years', '35-39 years', '40-44 years',
    '45-49 years',
]

# SEER age-recode labels that are folded into the '50-79 years' bucket
middle_early_old_ages = [
    '50-54 years', '55-59 years', '60-64 years',
    '65-69 years', '70-74 years', '75-79 years',
]


def recode_age(age):
    """Return the broad age bucket for a single SEER age-recode label.

    Labels listed in ``young_adult_ages`` become '00-49 years' and labels
    listed in ``middle_early_old_ages`` become '50-79 years'. Any other value
    is returned unchanged.
    """
    if age in young_adult_ages:
        return '00-49 years'
    return '50-79 years' if age in middle_early_old_ages else age

# Recode every row's age label in place.
df['Age'] = df['Age'].map(recode_age)

# Share of rows per age bucket, in percent (dropna=False keeps missing ages visible).
category_percentages = (
    df['Age']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Age
50-79 years    76.832506
80-84 years    11.277876
85+ years       9.312231
00-49 years     2.577386
Name: proportion, dtype: float64

In [430]:
# 2. Change Tumor Location Wording

# Short labels for the primary-site values that are handled explicitly
mapping = {
    'C34.1-Upper lobe, lung': 'Upper lobe',
    'C34.3-Lower lobe, lung': 'Lower lobe',
    'C34.9-Lung, NOS': 'Not Specified',
    'C34.2-Middle lobe, lung': 'Middle lobe',
    'C34.0-Main bronchus': 'Main bronchus',
}

# Every value without an entry in `mapping` (missing values included) is
# labelled "Overlapping lesion".
short_location = df['Tumor Location'].map(mapping)
df['Tumor Location'] = short_location.where(short_location.notna(), 'Overlapping lesion')

# Share of rows per location, in percent
category_percentages = (
    df['Tumor Location']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Tumor Location
Upper lobe            49.534984
Lower lobe            26.755650
Not Specified         14.731988
Middle lobe            4.340153
Main bronchus          3.672517
Overlapping lesion     0.964708
Name: proportion, dtype: float64

In [431]:
# 3. Mapping ICD-O-3 SEER Histology codes into Readable Buckets
#
# Two passes: (1) translate each ICD-O-3 code to a descriptive label, with
# "Other / Rare" as the fallback for unlisted codes; (2) fold the labels listed
# in `rare_histologies` into "Other / Rare" as well.

# Dictionary of ICD-O-3 codes to labels.
# Keys are integers, so the column has to hold integer codes for the lookup to
# match; anything else falls through to the default below.
histology_map = {
    8140: 'NSCLC-Adenocarcinoma, NOS',
    8070: 'NSCLC-Squamous, NOS',
    8041: 'SCLC',
    8000: 'Malignant Tumor Cells, NOS',
    8551: 'NSCLC-Adenocarcinoma, NOS',
    8010: 'Carcinoma, NOS',
    8046: 'NSCLC-NOS',
    8240: 'Typical Carcinoid',
    8071: 'NSCLC-Squamous, NOS',
    8250: 'NSCLC-lepidic predominant adenocarcinoma',
    8072: 'NSCLC-Squamous, NOS',
    8246: 'Neuroendocrine Carcinoma',
    8253: 'NSCLC-Adenocarcinoma, NOS',
    8560: 'NSCLC-Adenosquamous Carcinoma',
    8230: 'NSCLC-Solid Predominant Adenocarcinoma',
    8013: 'NSCLC-Large Cell Neuroendocrine Carcinoma',
    8260: 'NSCLC-Papillary Predominant Adenocarcinoma',
    8255: 'NSCLC-Adenocarcinoma w/ Mixed Subtypes',
    8480: 'NSCLC-Mucinous Adenocarcinoma',
    # The spelling 'Carciniod' is repeated verbatim in `rare_histologies`, which
    # is what makes the fold-in below match.
    8249: 'Atypical Carciniod',
    9699: 'Not Lung Cancer',
    8265: 'NSCLC-Micropapillary Predominant Adenocarcinoma',
    8033: 'NSCLC-Sarcomatoid Carcinoma',
    8045: 'Combined SCLC',
    8012: 'NSCLC-Large Cell, NOS',
    8022: 'NSCLC-Sarcomatoid Carcinoma',
    8254: 'NSCLC-Mixed Mucinous and Non-Mucinous Adenocarcinoma',
    9680: 'Not Lung Cancer',
    8256: 'NSCLC-Non-Mucinous Adenocarcinoma',
    8083: 'NSCLC-Basaloid Squamous',
    8032: 'NSCLC-Sarcomatoid Carcinoma',
    8430: 'NSCLC-Mucoepidermoid Carcinoma',
    8574: 'NSCLC-Adenocarcinoma w/ Neuroendocrine Differentiation',
    8481: 'NSCLC-Mucin-producing Adenocarcinoma',
    9591: 'Not Lung Cancer',
    8980: 'NSCLC-Carcinosarcoma',
    8200: 'NSCLC-Adenoid Cystic Carcinoma',
    8020: 'NSCLC-Undifferentiated Carcinoma, NOS',
    # Same label as the fallback used for unlisted codes
    8144: 'Other / Rare',
    8800: 'Sarcoma',
    # NOTE(review): 8257 carries the same label as 8980 (carcinosarcoma) --
    # verify against the ICD-O-3 code list. The label is folded into
    # "Other / Rare" below, so the final column is unaffected either way.
    8257: 'NSCLC-Carcinosarcoma',
    9050: 'Not Lung Cancer',
    8042: 'SCLC',
    8490: 'NSCLC-Signet Ring Cell Adenocarcinoma',
    8031: 'NSCLC-Sarcomatoid Carcinoma',
    9052: 'Not Lung Cancer',
    8001: 'Malignant Tumor Cells, NOS',
    8890: 'Not Lung Cancer',
    9120: 'Not Lung Cancer',
    8310: 'NSCLC-Clear Cell Adenocarcinoma',
    8815: 'Sarcoma',
    9734: 'Not Lung Cancer',
    8023: 'NSCLC-Undifferentiated Carcinoma, NOS',
    8074: 'NSCLC-Spindle Cell Squamous',
    8576: 'NSCLC-Hepatoid Adenocarcinoma',
    # Same label as the fallback used for unlisted codes
    8973: 'Other / Rare'
}

# Map codes to readable categories, with default "Other / Rare"
# (codes missing from histology_map, and missing values, get the default)
df['Cancer Cell Type'] = df['Cancer Cell Type'].map(histology_map).fillna('Other / Rare')

# Combine Rare Histologies into "Other / Rare" category.
# Entries must match the labels in histology_map character for character.
rare_histologies = [
    'Atypical Carciniod','NSCLC-Micropapillary Predominant Adenocarcinoma','Combined SCLC',
    'NSCLC-Large Cell, NOS','NSCLC-Mixed Mucinous and Non-Mucinous Adenocarcinoma',
    'NSCLC-Non-Mucinous Adenocarcinoma','NSCLC-Basaloid Squamous','NSCLC-Carcinosarcoma',
    'NSCLC-Mucoepidermoid Carcinoma','NSCLC-Adenocarcinoma w/ Neuroendocrine Differentiation',
    'NSCLC-Mucin-producing Adenocarcinoma','NSCLC-Undifferentiated Carcinoma, NOS','Sarcoma',
    'NSCLC-Adenoid Cystic Carcinoma','NSCLC-Signet Ring Cell Adenocarcinoma','NSCLC-Clear Cell Adenocarcinoma',
    'NSCLC-Spindle Cell Squamous','NSCLC-Hepatoid Adenocarcinoma','NSCLC-Sarcomatoid Carcinoma',
    'NSCLC-Mucinous Adenocarcinoma'
]

df['Cancer Cell Type'] = df['Cancer Cell Type'].replace(rare_histologies, 'Other / Rare')

# Share of rows per histology bucket, in percent (dropna=False would surface any NaN)
category_percentages = df['Cancer Cell Type'].value_counts(normalize=True, dropna=False) * 100
category_percentages

Cancer Cell Type
NSCLC-Adenocarcinoma, NOS                     41.708114
NSCLC-Squamous, NOS                           19.371801
SCLC                                          10.498450
Malignant Tumor Cells, NOS                     8.479497
Carcinoma, NOS                                 4.331355
NSCLC-NOS                                      4.070511
Other / Rare                                   3.244505
Typical Carcinoid                              1.968233
NSCLC-lepidic predominant adenocarcinoma       1.352869
Neuroendocrine Carcinoma                       0.877760
NSCLC-Adenosquamous Carcinoma                  0.859646
NSCLC-Solid Predominant Adenocarcinoma         0.711110
NSCLC-Large Cell Neuroendocrine Carcinoma      0.706970
NSCLC-Papillary Predominant Adenocarcinoma     0.635548
NSCLC-Adenocarcinoma w/ Mixed Subtypes         0.635031
Not Lung Cancer                                0.548600
Name: proportion, dtype: float64

In [432]:
# 4. Mapping EOD Primary Tumor codes into Readable Buckets

# Step 1 lookup: EOD Primary Tumor code -> descriptive label
extent_mapping = {
    300: 'Localized to Lung Only',
    999: 'Unknown',
    450: 'Pleural Involvement/Lung Collapse',
    700: 'Same Lung, Different Lobe Nodules',
    500: 'Pleural Invasion/Chest Wall Invasion/Same Lobe Nodules',
    650: 'Major Adjacent Structure Invasion',
    400: 'Hilus / Adjacent Lobe Invasion',
    100: 'Minimally Invasive Adenocarcinoma, Lepidic',
    675: 'Rib/Muscle/Sternum Invasion',
    200: 'Superficially Spreading Tumor',
    800: 'Occult Primary Tumor',
    600: 'Pleural Invasion/Chest Wall Invasion/Same Lobe Nodules',
    980: 'Occult Carcinoma',
    0: 'Carcinoma in situ',
}

# Step 2 lookup: descriptive label -> broader group.
# Labels without an entry here keep their step-1 name.
bucket_mapping = {
    'Localized to Lung Only': 'Localized/Early disease',
    'Minimally Invasive Adenocarcinoma, Lepidic': 'Localized/Early disease',
    'Superficially Spreading Tumor': 'Localized/Early disease',
    'Carcinoma in situ': 'Localized/Early disease',

    'Same Lung, Different Lobe Nodules': 'Spread within Lung',
    'Hilus / Adjacent Lobe Invasion': 'Spread within Lung',

    'Occult Primary Tumor': 'Occult/Hidden Tumors',
    'Occult Carcinoma': 'Occult/Hidden Tumors',
}

# Codes absent from extent_mapping become 'Rare/Other Extensions' before bucketing.
extent_labels = (
    df['Tumor Extent at Diagnosis']
    .map(extent_mapping)
    .fillna('Rare/Other Extensions')
)
df['Tumor Extent at Diagnosis'] = extent_labels.replace(bucket_mapping)

# Share of rows per extent bucket, in percent
category_percentages = (
    df['Tumor Extent at Diagnosis']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Tumor Extent at Diagnosis
Localized/Early disease                                   47.876244
Unknown                                                   14.188046
Spread within Lung                                        12.568122
Pleural Involvement/Lung Collapse                         10.152211
Pleural Invasion/Chest Wall Invasion/Same Lobe Nodules     7.031400
Major Adjacent Structure Invasion                          6.242140
Rib/Muscle/Sternum Invasion                                1.083744
Occult/Hidden Tumors                                       0.847743
Rare/Other Extensions                                      0.010351
Name: proportion, dtype: float64

In [433]:
# 5. Mapping EOD Regional Lymph Node codes into Readable Buckets

# EOD Regional Nodes code -> label (three codes share 'Not Applicable')
lymph_node_mapping = {
    0: 'No Nodal Involvement',
    400: 'Mediastinal Ipsilateral',
    999: 'Unknown Node Status',
    700: 'Bilateral / Contralateral',
    300: 'Ipsilateral Nodes Only',
    600: 'Ipsilateral / Contralateral',
    800: 'Node Involvement, NOS',
    888: 'Regional Node Involvement, NOS',
    987: 'Not Applicable',
    200: 'Not Applicable',
    100: 'Not Applicable',
}

# No fallback label here: a code missing from lymph_node_mapping becomes NaN.
node_codes = df['Regional Lymph Node Involvement']
df['Regional Lymph Node Involvement'] = node_codes.map(lymph_node_mapping)

# Share of rows per label, in percent (dropna=False would expose unmapped codes as NaN)
category_percentages = (
    df['Regional Lymph Node Involvement']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Regional Lymph Node Involvement
No Nodal Involvement              43.291291
Mediastinal Ipsilateral           25.125893
Unknown Node Status               11.158840
Bilateral / Contralateral          7.849125
Ipsilateral Nodes Only             6.632371
Ipsilateral / Contralateral        4.449355
Node Involvement, NOS              0.951252
Regional Node Involvement, NOS     0.538249
Not Applicable                     0.003623
Name: proportion, dtype: float64

In [434]:
# 6. Mapping EOD Metastasis codes into Readable Buckets

# EOD Mets code -> label (codes 20, 88 and 5 are all treated as 'Entry Error')
metastatic_mapping = {
    0: 'No Distant Metastasis',
    50: 'Other Specified Distant Metastasis',
    10: 'Lung',
    30: 'Distant Lymph Node',
    70: 'Distant Metastasis, NOS',
    99: 'Unknown',
    20: 'Entry Error',
    88: 'Entry Error',
    5: 'Entry Error',
}

# No fallback label here: a code missing from metastatic_mapping becomes NaN.
mets_codes = df['Metastatic Spread']
df['Metastatic Spread'] = mets_codes.map(metastatic_mapping)

# Share of rows per label, in percent (dropna=False would expose unmapped codes as NaN)
category_percentages = (
    df['Metastatic Spread']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Metastatic Spread
No Distant Metastasis                 54.486878
Other Specified Distant Metastasis    21.523246
Lung                                   9.119186
Distant Lymph Node                     8.008012
Distant Metastasis, NOS                2.988319
Unknown                                2.463008
Entry Error                            1.411352
Name: proportion, dtype: float64

In [435]:
# 7. Reword Tumor Laterality Values

# Raw laterality wording -> short label
laterality_mapping = {
    'Right - origin of primary': 'Right',
    'Left - origin of primary': 'Left',
    'Paired site, but no information concerning laterality': 'Unspecified',
    'Bilateral, single primary': 'Bilateral',
    'Only one side - side unspecified': 'Only One Side, NOS',
}

# Any value without an entry above (missing values included) becomes 'Inapplicable'.
short_laterality = df['Tumor Laterality'].map(laterality_mapping)
df['Tumor Laterality'] = short_laterality.where(short_laterality.notna(), 'Inapplicable')

# Share of rows per laterality label, in percent
category_percentages = (
    df['Tumor Laterality']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Tumor Laterality
Right                 53.844084
Left                  38.331117
Unspecified            6.562502
Bilateral              1.002489
Only One Side, NOS     0.159922
Inapplicable           0.099887
Name: proportion, dtype: float64

In [436]:
# 8. Reword Values for Extent of Regional Lymph Node Surgery

# Raw wording -> short label; both sentinel-biopsy-plus-removal variants share one label
ln_surgery_mapping = {
    '4 or more regional lymph nodes removed': '4+ Nodes Removed',
    'Biopsy or aspiration of regional lymph node, NOS': 'Regional Biopsy/aspiration only',
    'Unknown or not applicable': 'Unknown/Inapplicable',
    '1 to 3 regional lymph nodes removed': '1-3 Nodes Removed',
    'Number of regional lymph nodes removed unknown': 'Node(s) Removed, NOS',
    'Sentinel node biopsy and lym nd removed same/unstated time': 'Sentinel Biopsy + Dissection',
    'Sentinel lymph node biopsy': 'Sentinel Biopsy only',
    'Sentinel node biopsy and lym nd removed different times': 'Sentinel Biopsy + Dissection',
}

# Values without an entry above (missing values included) join 'Unknown/Inapplicable'.
ln_surgery_labels = df['Extent of Regional Lymph Node Surgery'].map(ln_surgery_mapping)
df['Extent of Regional Lymph Node Surgery'] = ln_surgery_labels.where(
    ln_surgery_labels.notna(), 'Unknown/Inapplicable'
)

# Share of rows per label, in percent
category_percentages = (
    df['Extent of Regional Lymph Node Surgery']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Extent of Regional Lymph Node Surgery
Unknown/Inapplicable               68.013498
4+ Nodes Removed                   16.075024
Regional Biopsy/aspiration only    12.882273
1-3 Nodes Removed                   2.395727
Node(s) Removed, NOS                0.518583
Sentinel Biopsy + Dissection        0.072974
Sentinel Biopsy only                0.041921
Name: proportion, dtype: float64

In [437]:
# 9. Combine/Rename Buckets for Surgery Recommended variable

# Only the values listed here are rewritten; every other value is kept as-is.
surgery_mapping = {
    'Not recommended, contraindicated due to other cond; autopsy only (1973-2002)': 'Not recommended',
    'Unknown; death certificate; or autopsy only (2003+)': 'Unknown',
    'Recommended, unknown if performed': 'Unknown',
    'Recommended but not performed, patient refused': 'Surgery not Performed',
    'Recommended but not performed, unknown reason': 'Surgery not Performed',
    'Not performed, patient died prior to recommended surgery': 'Surgery not Performed',
}

# Exact-match replacement: listed values are swapped, unlisted ones pass through.
df['Surgery Recommended'] = df['Surgery Recommended'].replace(surgery_mapping)

# Share of rows per label, in percent
category_percentages = (
    df['Surgery Recommended']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Surgery Recommended
Not recommended          73.560054
Surgery performed        20.895461
Unknown                   3.235707
Surgery not Performed     2.308779
Name: proportion, dtype: float64

In [438]:
# 10. Combine/Rename buckets for Surgery/Radiation Sequence

# Values that are reworded; anything not listed keeps its original text for now.
rad_seq_mapping = {
    'No radiation and/or no surgery; unknown if surgery and/or radiation given': 'Unknown',
    'Sequence unknown, but both were given': 'Radiation and Surgery (Sequence Unknown)',
}

# Labels that survive as their own category; everything else collapses into 'Other'.
allowed = {
    'Radiation after surgery',
    'Radiation prior to surgery',
    'Radiation before and after surgery',
    'Surgery both before and after radiation',
    'Radiation and Surgery (Sequence Unknown)',
    'Unknown',
}

reworded_sequence = df['Surgery/Radiation Sequence'].replace(rad_seq_mapping)
df['Surgery/Radiation Sequence'] = reworded_sequence.mask(
    ~reworded_sequence.isin(allowed), 'Other'
)

# Share of rows per label, in percent
category_percentages = (
    df['Surgery/Radiation Sequence']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Surgery/Radiation Sequence
Unknown                                     90.767471
Radiation after surgery                      8.620270
Radiation prior to surgery                   0.421801
Radiation before and after surgery           0.082290
Surgery both before and after radiation      0.045544
Radiation and Surgery (Sequence Unknown)     0.044509
Other                                        0.018114
Name: proportion, dtype: float64

In [439]:
# 11. Combine/Rename buckets for Radiation Therapy

# Raw values that all mean "radiation was given"; they are merged into 'Yes'.
radiation_therapy = [
    'Beam radiation',
    'Radiation, NOS  method or source not specified',
    'Radioactive implants (includes brachytherapy) (1988+)',
    'Combination of beam with implants or isotopes',
    'Radioisotopes (1988+)',
]

# Shorten the one verbose label first, then collapse the radiation types.
shortened = df['Radiation Therapy'].replace(
    {'Recommended, unknown if administered': 'Recommended, Unknown'}
)
df['Radiation Therapy'] = shortened.mask(shortened.isin(radiation_therapy), 'Yes')

# Share of rows per label, in percent
category_percentages = (
    df['Radiation Therapy']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Radiation Therapy
None/Unknown            60.360006
Yes                     36.946677
Refused (1988+)          2.063979
Recommended, Unknown     0.629338
Name: proportion, dtype: float64

In [440]:
# 12. Display Chemotherapy value frequency
# (the column is left unchanged; this is only the share of rows per value, in percent)

category_percentages = (
    df['Chemotherapy']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Chemotherapy
No/Unknown    62.922901
Yes           37.077099
Name: proportion, dtype: float64

In [441]:
# 12. Combine/Rename buckets for Surgery/Systemic Therapy Sequence

# Only these three values are rewritten (the two intraoperative variants merge
# into 'Other'); all remaining values keep their original wording.
systemic_seq_mapping = {
    'No systemic therapy and/or surgical procedures': 'Not Both',
    'Intraop systemic rx & oth systemic rx before/after surg': 'Other',
    'Intraoperative systemic therapy': 'Other',
}

df['Surgery/Systemic Therapy Sequence'] = (
    df['Surgery/Systemic Therapy Sequence'].replace(systemic_seq_mapping)
)

# Share of rows per label, in percent
category_percentages = (
    df['Surgery/Systemic Therapy Sequence']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Surgery/Systemic Therapy Sequence
Not Both                                          86.053131
Systemic therapy after surgery                    12.855361
Systemic therapy before surgery                    0.626750
Systemic therapy both before and after surgery     0.320362
Surgery both before and after systemic therapy     0.105062
Sequence unknown                                   0.031053
Other                                              0.008281
Name: proportion, dtype: float64

In [442]:
# 13. Display treatment variable combinations by frequency

# The five treatment columns whose joint values define a "combination"
treatment_columns = [
    'Surgery Recommended',
    'Surgery/Radiation Sequence',
    'Radiation Therapy',
    'Chemotherapy',
    'Surgery/Systemic Therapy Sequence',
]
df_treatment_combos = df[treatment_columns]

# One entry per distinct combination, most frequent first, as a percentage of rows.
df_combo_counts = df_treatment_combos.value_counts(normalize=True).mul(100)

# Keep the 60 most frequent combinations as a flat table for display.
# NOTE: despite its name, the 'count' column holds percentages, not row counts.
df_top_combo_counts = df_combo_counts.iloc[:60].copy().reset_index(name='count')
df_top_combo_counts

Unnamed: 0,Surgery Recommended,Surgery/Radiation Sequence,Radiation Therapy,Chemotherapy,Surgery/Systemic Therapy Sequence,count
0,Not recommended,Unknown,None/Unknown,No/Unknown,Not Both,24.658031
1,Surgery performed,Unknown,None/Unknown,No/Unknown,Not Both,14.730435
2,Not recommended,Unknown,Yes,No/Unknown,Not Both,13.692235
3,Not recommended,Unknown,Yes,Yes,Not Both,13.205223
4,Not recommended,Unknown,None/Unknown,Yes,Not Both,10.431686
5,Not recommended,Radiation after surgery,Yes,Yes,Systemic therapy after surgery,4.944131
6,Surgery performed,Unknown,None/Unknown,Yes,Systemic therapy after surgery,3.364576
7,Unknown,Unknown,None/Unknown,No/Unknown,Not Both,3.062328
8,Not recommended,Unknown,None/Unknown,Yes,Systemic therapy after surgery,2.041207
9,Not recommended,Radiation after surgery,Yes,No/Unknown,Not Both,1.454826


In [443]:
# 14. Combine Treatment Variables into One Broad Treatment Plan Variable
#
# Each of the most frequent treatment-variable combinations (computed in the
# previous cell as df_combo_counts) is given a hand-written treatment label.
#
# WARNING: the labels are tied to a combination only through its RANK in
# df_combo_counts (rank 0 = most frequent). They are valid only for the data
# and upstream recoding they were written against. If the input file or any
# earlier cell changes, the ranks can shift and every label must be re-checked
# against the table displayed by the previous cell.

top_combos = df_combo_counts.index[:60]

# Hand-assigned labels, in rank order. The position in this list is the rank
# of the combination in df_combo_counts.
treatment_labels_by_rank = [
    "Unknown Treatment",                                            # 0
    "Surgery",                                                      # 1
    "Radiotherapy",                                                 # 2
    "Chemotherapy and Radiotherapy",                                # 3
    "Chemotherapy",                                                 # 4
    "Chemotherapy and Radiotherapy after Surgery",                  # 5
    "Chemotherapy after Surgery",                                   # 6
    "Unknown Treatment",                                            # 7
    "Chemotherapy after Surgery",                                   # 8
    "Radiotherapy after Surgery",                                   # 9
    "Refused Treatment",                                            # 10
    "Chemotherapy and Radiotherapy after Surgery",                  # 11
    "Unknown Treatment",                                            # 12
    "Radiotherapy",                                                 # 13
    "Refused Treatment",                                            # 14
    "Radiotherapy after Surgery",                                   # 15
    "Unknown Treatment",                                            # 16
    "Radiotherapy after Surgery",                                   # 17
    "Unknown Treatment",                                            # 18
    "Chemotherapy before Surgery",                                  # 19
    "Chemotherapy",                                                 # 20
    "Radiotherapy and Chemotherapy before Surgery",                 # 21
    "Surgery",                                                      # 22
    "Chemotherapy",                                                 # 23
    "Chemotherapy before and after Surgery",                        # 24
    "Chemotherapy and Radiotherapy",                                # 25
    "Surgery",                                                      # 26
    "Chemotherapy before Surgery, Radiotherapy after",              # 27
    "Chemotherapy before and after Surgery, Radiotherapy after",    # 28
    "Radiotherapy before Surgery, Chemotherapy after",              # 29
    "Radiotherapy after Surgery",                                   # 30
    "Chemotherapy after Surgery",                                   # 31
    "Radiotherapy before Surgery",                                  # 32
    "Radiotherapy before Surgery, Chemotherapy before and after",   # 33
    "Surgery",                                                      # 34
    "Chemotherapy after Surgery",                                   # 35
    "Chemotherapy after Surgery",                                   # 36
    "Chemotherapy",                                                 # 37
    "Radiotherapy",                                                 # 38
    "Radiotherapy and Chemotherapy before Surgery",                 # 39
    "Chemotherapy after Surgery",                                   # 40
    "Radiotherapy after Surgery",                                   # 41
    "Chemotherapy and Radiotherapy",                                # 42
]

# Fail with an explicit message when the data has fewer distinct combinations
# than there are labels; zip() below would otherwise silently drop labels.
if len(top_combos) < len(treatment_labels_by_rank):
    raise IndexError(
        f"Expected at least {len(treatment_labels_by_rank)} treatment combinations "
        f"in df_combo_counts, found {len(top_combos)}; the rank-based labels no "
        "longer line up with the data."
    )

# Mapping from top_combos to treatment labels:
# (Surgery Recommended, Surgery/Radiation Sequence, ...) tuple -> label
treatment_mapping = dict(zip(top_combos, treatment_labels_by_rank))

# Build each row's lookup key from the same columns, in the same order, as the
# index levels of df_combo_counts, so the keys always match the mapping's tuples.
combo_columns = list(df_combo_counts.index.names)

# Column-wise zip replaces a row-wise df.apply(axis=1): same tuples and same
# result, without building a Series per row. Combinations without a label
# (rank 43 and beyond, or rows containing missing values) become "Other".
df['Treatment Plan'] = [
    treatment_mapping.get(combo, "Other")
    for combo in zip(*(df[column] for column in combo_columns))
]

# Bucket rare treatment labels into "Other"
other_treatments = [
    'Chemotherapy before and after Surgery',
    'Chemotherapy before Surgery, Radiotherapy after',
    'Chemotherapy before and after Surgery, Radiotherapy after',
    'Radiotherapy before Surgery, Chemotherapy after',
    'Radiotherapy before Surgery',
    'Radiotherapy before Surgery, Chemotherapy before and after'
]

df['Treatment Plan'] = df['Treatment Plan'].mask(
    df['Treatment Plan'].isin(other_treatments), 'Other'
)

# Share of rows per treatment plan, in percent
category_percentages = df['Treatment Plan'].value_counts(normalize=True, dropna=False) * 100
category_percentages

Treatment Plan
Unknown Treatment                               29.337177
Surgery                                         15.007841
Radiotherapy                                    14.316915
Chemotherapy and Radiotherapy                   13.358935
Chemotherapy                                    10.746873
Chemotherapy and Radiotherapy after Surgery      6.058410
Chemotherapy after Surgery                       5.588995
Radiotherapy after Surgery                       2.282384
Refused Treatment                                1.709459
Other                                            1.153613
Chemotherapy before Surgery                      0.262397
Radiotherapy and Chemotherapy before Surgery     0.177001
Name: proportion, dtype: float64

In [444]:
# 15. Parse Days from Diagnosis to Treatment into numbers
#     NOTE: the parsed column comes out as float64 (see the describe() output),
#     because unavailable values are stored as NaN -- it is not an integer dtype.

# Keep the raw text values under an "(Old)" name so the parsed column can reuse the original name.
# NOTE(review): re-running this cell after the parsed column exists would rename that
# column too and create duplicate "(Old)" columns -- run once per fresh kernel.
df = df.rename(columns={'Days from Diagnosis to Treatment':'Days from Diagnosis to Treatment (Old)'})

def str_to_int(days_str):
    """Convert one 'days from diagnosis to treatment' value to an int.

    Parameters
    ----------
    days_str : str, int, float or None
        Raw cell value, e.g. '058', 'Unable to calculate' or '731+ days'.

    Returns
    -------
    int or None
        The day count, or None when no exact count is available: a missing
        cell (None / NaN) or one of the two sentinel strings. '731+ days' is
        an open-ended range, so it is treated as unavailable instead of
        being given an invented number.

    Raises
    ------
    ValueError
        If the value is any other text that is not an integer literal, so
        unexpected data is surfaced and not silently dropped.
    """
    # Missing cells: None, or a float NaN (NaN is the only float unequal to itself).
    if days_str is None or (isinstance(days_str, float) and days_str != days_str):
        return None

    if isinstance(days_str, str):
        # Tolerate stray padding around the sentinel strings.
        days_str = days_str.strip()
        if days_str in ('Unable to calculate', '731+ days'):
            return None

    return int(days_str)

# Parse every raw value; entries for which str_to_int returns None show up as
# missing, which is why describe() reports a count below the number of rows.
df['Days from Diagnosis to Treatment'] = df['Days from Diagnosis to Treatment (Old)'].apply(str_to_int)
# Summary statistics of the parsed day counts (missing values are excluded)
df['Days from Diagnosis to Treatment'].describe()

count    139106.000000
mean         42.220551
std          46.191043
min           0.000000
25%          12.000000
50%          33.000000
75%          57.000000
max         706.000000
Name: Days from Diagnosis to Treatment, dtype: float64

In [445]:
# 16. Rename labels for Cause of Death
#     Labels produced by rename_labels below:
#     - 'Alive or Not Cancer-related'
#     - 'Cancer'
#     - 'Unknown'
#     - 'Not Applicable'

# Keep the raw SEER values under an "(Old)" name so the relabelled column can reuse 'Cause of Death'.
# NOTE(review): re-running this cell after the relabelled column exists would rename that
# column too and create duplicate "(Old)" columns -- run once per fresh kernel.
df = df.rename(columns={'Cause of Death':'Cause of Death (Old)'})

def rename_labels(label):
    """Translate one SEER cause-specific death classification into a short label.

    The three known SEER values map to 'Alive or Not Cancer-related', 'Cancer'
    and 'Unknown'. Every other value falls back to 'Not Applicable'.
    """
    known_labels = (
        ('Alive or dead of other cause', 'Alive or Not Cancer-related'),
        ('Dead (attributable to this cancer dx)', 'Cancer'),
        ('Dead (missing/unknown COD)', 'Unknown'),
    )
    for seer_value, short_label in known_labels:
        if label == seer_value:
            return short_label
    return 'Not Applicable'

# Build the relabelled column from the preserved raw values.
df['Cause of Death'] = df['Cause of Death (Old)'].map(rename_labels)

# Share of rows per label, in percent
category_percentages = (
    df['Cause of Death']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
)
category_percentages

Cause of Death
Alive or Not Cancer-related    55.366191
Cancer                         43.999296
Unknown                         0.628303
Not Applicable                  0.006211
Name: proportion, dtype: float64

In [446]:
# 17. Display Cleaned Dataset
#
# Keep only the recoded analysis columns. The individual treatment columns and
# the "(Old)" raw columns are left out; 'Treatment Plan' stands in for the former.

df_cleaned = df.loc[:, [
    'Age',
    'Tumor Location',
    'Cancer Cell Type',
    'Tumor Extent at Diagnosis',
    'Regional Lymph Node Involvement',
    'Metastatic Spread',
    'Tumor Laterality',
    'Extent of Regional Lymph Node Surgery',
    'Treatment Plan',
    'Days from Diagnosis to Treatment',
    'Cause of Death',
]]
df_cleaned

Unnamed: 0,Age,Tumor Location,Cancer Cell Type,Tumor Extent at Diagnosis,Regional Lymph Node Involvement,Metastatic Spread,Tumor Laterality,Extent of Regional Lymph Node Surgery,Treatment Plan,Days from Diagnosis to Treatment,Cause of Death
0,50-79 years,Upper lobe,"NSCLC-Adenocarcinoma, NOS",Pleural Involvement/Lung Collapse,Ipsilateral Nodes Only,No Distant Metastasis,Right,4+ Nodes Removed,Chemotherapy after Surgery,58.0,Alive or Not Cancer-related
1,50-79 years,Upper lobe,"NSCLC-Adenocarcinoma, NOS",Localized/Early disease,No Nodal Involvement,No Distant Metastasis,Right,Unknown/Inapplicable,Unknown Treatment,45.0,Alive or Not Cancer-related
2,85+ years,Lower lobe,"NSCLC-Adenocarcinoma, NOS",Localized/Early disease,No Nodal Involvement,No Distant Metastasis,Left,4+ Nodes Removed,Surgery,23.0,Alive or Not Cancer-related
3,50-79 years,Upper lobe,"NSCLC-Squamous, NOS",Spread within Lung,No Nodal Involvement,No Distant Metastasis,Right,Regional Biopsy/aspiration only,Radiotherapy after Surgery,148.0,Alive or Not Cancer-related
4,50-79 years,Not Specified,Other / Rare,Unknown,Unknown Node Status,Distant Lymph Node,Unspecified,Unknown/Inapplicable,Chemotherapy,57.0,Cancer
...,...,...,...,...,...,...,...,...,...,...,...
193214,50-79 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,,Cancer
193215,80-84 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,,Cancer
193216,85+ years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,,Cancer
193217,50-79 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,,Cancer


In [447]:
# 18. Export Cleaned Dataset
# Written to the working directory; index=False leaves the row index out of the file.

df_cleaned.to_csv(path_or_buf='Cleaned_Lung_Cancer_dataset.csv', index=False)