In [1]:
import pandas as pd
import numpy as np
import datetime

# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides any deprecation or dtype
# warnings raised by the cleaning cells below; consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the small and the large dataset from ../data.

small_data = pd.read_csv(filepath_or_buffer="../data/small_data.csv")
large_data = pd.read_csv(filepath_or_buffer="../data/large_data.csv")

In [3]:
 # Combined large and small datasets
# Stack the large dataset on top of the small one; ignore_index=True gives the
# combined frame a fresh 0..n-1 row index.
frames = [large_data, small_data]
data = pd.concat(frames, ignore_index=True)

# The output recorded for this cell is (974, 279).
data.shape

(974, 279)

In [4]:
# Treat blank entries ("" or " ") as missing so the column can be cast to float.
chemo_col = "total_chemo_received"
is_blank = (data[chemo_col] == "") | (data[chemo_col] == " ")
data.loc[is_blank, chemo_col] = np.nan
data[chemo_col] = data[chemo_col].astype(float)

# Binary chemo flag: 0 when the total is exactly 0, 1 when it is positive.
# Rows matching neither condition (e.g. a missing total) are not assigned.
data.loc[data[chemo_col] == 0, "chemo"] = 0
data.loc[data[chemo_col] > 0, "chemo"] = 1

In [5]:
# Binary "received" flags: 0 when the dose is exactly 0, 1 when it is positive.
# Rows matching neither condition (e.g. a missing dose) are not assigned.
for dose_col, flag_col in (
    ("ebrt_curr_dose", "received_ebrt"),
    ("brachy_curr_dose", "received_brachy"),
    ("ebrt_boost_curr_dose", "received_ebrt_boost"),
):
    data.loc[data[dose_col] == 0, flag_col] = 0
    data.loc[data[dose_col] > 0, flag_col] = 1

# eqd2 gets the same flag, but empty strings are first turned into NaN so the
# column can be cast to float.
eqd2_blank = data["eqd2"] == ""
data.loc[eqd2_blank, "eqd2"] = np.nan
data["eqd2"] = data["eqd2"].astype(float)
data.loc[data["eqd2"] == 0, "received_eqd2"] = 0
data.loc[data["eqd2"] > 0, "received_eqd2"] = 1


# Age category from enroll_age, both bounds inclusive: 21-39 -> 1, 40-59 -> 2,
# 60-96 -> 3.
# NOTE(review): ages outside 21-96, and non-integer ages that fall between two
# bands (e.g. 39.5), match no band and are left unassigned.
for lower, upper, category in ((21, 39, 1), (40, 59, 2), (60, 96, 3)):
    in_band = data["enroll_age"].between(lower, upper)
    data.loc[in_band, "age_cat"] = category

In [6]:
# time_alive cannot be calculated for patients missing either date, so blank
# dates are converted to NaN and the affected patient IDs are listed.
for date_col in ("treat_start_date", "death_date"):
    data.loc[data[date_col] == "", date_col] = np.nan

missing_date = data["death_date"].isnull() | data["treat_start_date"].isnull()
print(list(data[missing_date]["patient_id"]))


[365.0, 828.0, 6495.0, 6496.0, 'OB046']


In [7]:
# Number of distinct patient IDs: overall (large and small data combined),
# then for vital_status == 1, then for vital_status == 0.
# NOTE(review): the recorded counts (366 + 607) are one short of the overall
# 974, so one ID appears to have a vital_status that is neither 1 nor 0.
patient_ids = data["patient_id"]
print(len(set(patient_ids)))
for status in (1, 0):
    print(len(set(patient_ids[data["vital_status"] == status])))

974
366
607


In [8]:
# Whole days between treat_start_date and death_date, as an absolute value.
# NOTE(review): the absolute value makes a death_date that precedes
# treat_start_date look like a valid positive duration; confirm this is intended.
end_dates = pd.to_datetime(data["death_date"])
start_dates = pd.to_datetime(data["treat_start_date"])
data["time_alive"] = (end_dates - start_dates).abs().dt.days

In [9]:
# Map each free-text value of other_arv_name to the indicator columns that are
# set to 1 for the matching rows.
other_arv_flags = {
    "ALLUVIA": ["aluvia"],
    "Lamivoline": ["lamivudine"],
    "Started Truvada and NVP 2014": ["truvada", "nevirapine"],
    "TLD": ["tenofovir", "lamivudine", "dolutegravir"],
}
for arv_name, flag_cols in other_arv_flags.items():
    for flag_col in flag_cols:
        data.loc[data["other_arv_name"] == arv_name, flag_col] = 1

# Flag tld wherever all three component indicators are set.
has_all_tld_parts = (
    (data["tenofovir"] == 1)
    & (data["lamivudine"] == 1)
    & (data["dolutegravir"] == 1)
)
data.loc[has_all_tld_parts, "tld"] = 1

# Rows recorded as "TLD" are flagged directly as well (they already satisfy
# the component check above because of the mapping loop).
data.loc[data["other_arv_name"] == "TLD", "tld"] = 1

In [10]:
# Create binary columns indicating whether or not patients have each symptom
# at presentation: code 0 -> 0, any listed non-zero code -> 1, otherwise NaN.
# Codes noted by the original author: vaginal_hemorrhage {0,1,2},
# vaginal_discharge {0,1,2}, pelvic_pain {0,1,2,3}.
symptom_specs = (
    ("vaginal_hemorrhage", "vaginal_hemorrhage_symp", (1, 2)),
    ("vaginal_discharge", "vaginal_discharge_symp", (1, 2)),
    ("pelvic_pain", "pelvic_pain_symp", (1, 2, 3)),
)
for symptom_col, flag_col, present_codes in symptom_specs:
    data[flag_col] = np.nan
    data.loc[data[symptom_col] == 0, flag_col] = 0
    for code in present_codes:
        data.loc[data[symptom_col] == code, flag_col] = 1

In [11]:
# Create ARV groupings
#
# Each group column starts at 0 and becomes 1 for rows where at least one of
# the group's indicator columns equals 1.
#
# Changes from the previous version of this cell:
#   * "aluvia" (lopinavir/ritonavir, a protease inhibitor) is now part of the
#     PI group. The column is built from other_arv_name == "ALLUVIA" in an
#     earlier cell but was never used, while the PI condition listed
#     "amprenavir" and "darunavir" twice.
#   * The duplicated "azt" / "amprenavir" / "darunavir" conditions are listed
#     once; this does not change the result.
#
# NOTE(review): "dolutegravir" is kept in the NRTI group as before, although it
# is an integrase inhibitor rather than an NRTI — confirm this is intended.

nrti_cols = [
    "abacavir", "azt", "tdf", "lamivudine", "stavudine", "emtricitabine",
    "didanosine", "atripla", "combivir", "truvada", "tld", "dolutegravir",
]
nnrti_cols = ["nevirapine", "efavirenz", "atripla"]
pi_cols = [
    "atazanavir_boosted", "atazanavir", "amprenavir", "darunavir",
    "darunavir_boosted", "nelfinavir", "tipranavir_boosted", "tipranavir",
    "aluvia",
]

for group_col, member_cols in (
    ("nrti", nrti_cols),
    ("nnrti", nnrti_cols),
    ("pi", pi_cols),
):
    data[group_col] = 0
    # True for a row when any member indicator equals 1 (NaN compares False).
    on_any_member = data[member_cols].eq(1).any(axis=1)
    data.loc[on_any_member, group_col] = 1

In [12]:
# Index columns necessary for analysis
#
# Every column is listed exactly once. The previous list named "atripla",
# "amprenavir" and "darunavir" twice, which duplicated those columns in the
# frame and in the exported CSV.
#
# NOTE(review): columns derived in earlier cells ("chemo", the "received_*"
# flags, "time_alive", the "*_symp" flags, "aluvia", "tenofovir") are not in
# this list and are therefore dropped here — confirm this is intended.
analysis_columns = [
    # Demographics, staging and baseline labs
    "patient_id", "enroll_age", "age_cat", "hiv_status", "marital", "distance",
    "cancer_screening", "combined_cancer_stage",
    "cr_result", "hb_result", "neut_result", "wbc_result",
    "init_performance_status",
    # Treatment and outcome
    "total_chemo_received", "ebrt_curr_dose", "brachy_curr_dose", "eqd2",
    "ebrt_boost_curr_dose", "treat_duration", "treat_response", "vital_status",
    "death_date", "treat_start_date",
    # Symptoms at presentation and HIV markers
    "vaginal_hemorrhage", "vaginal_discharge", "pelvic_pain",
    "cd4_final", "vl_final", "on_arv",
    # NNRTI indicators and group flag
    "nevirapine", "efavirenz", "atripla", "nnrti",
    # NRTI indicators and group flag ("atripla" is already listed above)
    "abacavir", "azt", "tdf", "lamivudine", "stavudine", "emtricitabine",
    "didanosine", "combivir", "truvada", "tld", "dolutegravir", "nrti",
    # PI indicators and group flag
    "atazanavir_boosted", "atazanavir", "amprenavir", "darunavir",
    "darunavir_boosted", "nelfinavir", "tipranavir_boosted", "tipranavir", "pi",
    # Toxicity columns
    "cr_result_tox", "pelvic_pain_tox", "vaginal_discharge_tox",
    "vaginal_hemorrhage_tox", "fatigue_tox", "weight_loss_tox",
    "nausea_tox", "vomiting_tox", "urine_freq_tox", "urine_incontinence_tox",
    "urine_urge_tox", "diarrhea_tox", "dermatitis_tox",
    "hb_min_result_tox", "wbc_min_result_tox", "neut_min_result_tox",
    "alb_min_result_tox",
]
data = data[analysis_columns]

# Guard against a repeated name being reintroduced in the list above.
assert data.columns.is_unique, "duplicate column names in analysis_columns"


In [13]:
# Write the selected columns to data.csv in the current working directory.
# The row index is written too (pandas default), as an unnamed first column.
data.to_csv(path_or_buf="./data.csv")