In [1]:
from IPython.display import display, HTML

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib as mlp
import matplotlib.pyplot as plt
import geopandas as gp
import datetime
import re

import os

# Move the working directory to the project's `core` folder before importing the
# GT_* modules below (presumably so they can be imported by bare name - TODO confirm).
# The path is relative: it resolves against whatever the current directory is, so
# re-running this cell applies "../../../core" again from the new location.
os.chdir("../../../core")
import GT_helper_functions as hf
import GT_load_data as data

# Folder holding the raw input workbooks; relative to the working directory set above.
base_dir = "../../Outcome Measurement Data/"

In [15]:
%matplotlib inline
plt.style.use('ggplot')
mlp.rcParams["figure.facecolor"] = "white"
mlp.rcParams["figure.dpi"] = 100
mlp.rcParams["figure.figsize"] = [10,7]
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [104]:
# Read the first worksheet of each MDR-TB cohort workbook into its own frame.
mdr14_16 = pd.read_excel(
    base_dir + "TUBERCULOSIS/COHORTES/COHORTE TB - MDR 2014 - 2016.xls",
    sheet_name=0,
)
mdr15_17 = pd.read_excel(
    base_dir + "TUBERCULOSIS/COHORTES/COHORTE TB - MDR 2015 - 2017.xls",
    sheet_name=0,
)
mdr16_18 = pd.read_excel(
    base_dir + "TUBERCULOSIS/COHORTES/COHORTE TB - MDR 2016 - 2018.xlsx",
    sheet_name=0,
)

In [120]:
def tryfun(_fun):
    """Wrap ``_fun`` so that a failing call yields NaN instead of raising.

    Parameters
    ----------
    _fun : callable
        One-argument function to protect.

    Returns
    -------
    callable
        One-argument function returning ``_fun(value)``, or ``np.nan`` when
        ``_fun`` raises an ordinary exception.
    """
    def safe_call(inputValue):
        try:
            return _fun(inputValue)
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
        # still propagate so a long-running ``.map`` can be interrupted.
        except Exception:
            # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
            return np.nan
    return safe_call

def gen_inStr(values):
    """Build a predicate testing whether a string contains any of ``values``.

    Parameters
    ----------
    values : iterable of str
        Substrings to look for. They are compared against the lower-cased
        input, so they should be given in lower case.

    Returns
    -------
    callable
        ``inStr(inputStr)`` -> ``True`` if ``inputStr`` is a string whose
        lower-cased form contains at least one of ``values``; ``False`` for
        non-matching strings and for any non-string input (e.g. NaN, None).
    """
    # Materialise once so a one-shot iterable is not exhausted by the first call.
    needles = tuple(values)

    def inStr(inputStr):
        # isinstance (not ``type(...) == str``) so str subclasses such as
        # numpy.str_ are treated as text too.
        if isinstance(inputStr, str):
            lowered = inputStr.lower()
            return any(needle in lowered for needle in needles)
        return False
    return inStr

# Lower-cased department name, as spelled in the cohort data, -> numeric
# department code. Names that are not a Guatemalan department ("mexico") and
# the placeholder "nd" (presumably "no data" - TODO confirm) map to NaN.
# Only the spellings listed here are recognised.
depto2code = {
    "guatemala": 1,
    "el progreso": 2,
    "sacatepequez": 3,
    "escuintla": 5,
    # Santa Rosa is department 6; code 7 belongs to Solola.
    "santa rosa": 6,
    "solola": 7,
    "totonicapan": 8,
    "quetzaltenango": 9,
    "suchitepequez": 10,
    "retalhuleu": 11,
    "san marcos": 12,
    "huehuetenango": 13,
    "quiché": 14,
    "baja verapaz": 15,
    "alta verapaz": 16,
    # Both accented and unaccented spellings are listed.
    "peten": 17,
    "petén": 17,
    "izabal": 18,
    "jalapa": 21,
    "jutiapa": 22,
    # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
    "mexico": np.nan,
    "nd": np.nan,
}

In [106]:
# Positional column names for the 2014-2016 cohort sheet. The list must match
# the sheet's column count exactly, otherwise pandas rejects the assignment.
mdr14_16.columns = (
    [
        'ID', 'X_0', 'Gender', 'Age', 'X_1', 'Muni', 'Depto', 'DAS',
        'PxNew', 'PxRelapse', 'PxLost', 'TxFailure', 'TBType',
        'PDSDate', 'PDSResult', 'X_2', 'X_3', 'TxDate',
        'DateBk_1', 'ResultBk_1',
    ]
    # DateCulture_1, ResultCulture_1, ..., DateCulture_24, ResultCulture_24
    + [
        "{}Culture_{}".format(field, idx)
        for idx in range(1, 25)
        for field in ("Date", "Result")
    ]
    + [
        'DiabetesMel', 'HIV', 'ExitCondition', 'ExitDate', 'Dead',
        'Observations', 'Modifications', 'X_4',
    ]
)
# Tag every row with the cohort file it came from.
mdr14_16["BD_ID"] = "MDR14-16"

In [107]:
# Positional column names for the 2015-2017 cohort sheet. The list must match
# the sheet's column count exactly, otherwise pandas rejects the assignment.
mdr15_17.columns = (
    [
        'ID', 'X_0', 'X_1', 'Gender', 'Age', 'X_2', 'Muni', 'Depto', 'DAS',
        'DxDate', 'PxNew', 'PxRelapse', 'PxLost', 'TxFailure', 'Referred',
        'TBType', 'TxDate', 'DateGXPERT', 'PDSDate', 'PDSResult', 'TTODate',
        'DateBk_1', 'ResultBk_1',
    ]
    # DateCulture_1, ResultCulture_1, ..., DateCulture_24, ResultCulture_24
    + [
        "{}Culture_{}".format(field, idx)
        for idx in range(1, 25)
        for field in ("Date", "Result")
    ]
    + [
        'DiabetesMel', 'HIV', 'ExitCondition', 'ExitDate', 'Dead',
        'Observations',
    ]
)
# Tag every row with the cohort file it came from.
mdr15_17["BD_ID"] = "MDR15-17"

In [108]:
# Positional column names for the 2016-2018 cohort sheet. Here the intake
# status is a single 'PxCondition' column. The list must match the sheet's
# column count exactly, otherwise pandas rejects the assignment.
mdr16_18.columns = (
    [
        'ID', 'Gender', 'Age', 'X_0', 'Muni', 'Depto', 'DAS', 'DxDate',
        'PxCondition', 'TxFailure', 'TBPulmonar', 'TBExtrapulmonar',
        'TxDate', 'DateGXPERT', 'GENOTYPE', 'ResistType',
        'PDSDate', 'PDSResult', 'TxDate2',
        'DateBk_1', 'ResultBk_1',
    ]
    # DateCulture_1, ResultCulture_1, ..., DateCulture_24, ResultCulture_24
    + [
        "{}Culture_{}".format(field, idx)
        for idx in range(1, 25)
        for field in ("Date", "Result")
    ]
    + [
        'DiabetesMel', 'HIV', 'ExitCondition', 'ExitDate', 'Dead',
        'Observations', 'Modifications',
    ]
)
# Tag every row with the cohort file it came from.
mdr16_18["BD_ID"] = "MDR16-18"

In [109]:
# Stack the three cohorts into one frame with a fresh 0..n-1 index. Columns
# missing from a cohort are filled with NaN; column order is left unsorted.
mdrAll = pd.concat(
    objs=[mdr14_16, mdr15_17, mdr16_18],
    ignore_index=True,
    sort=False,
)

In [110]:
# NOTE: gen_inStr is also defined earlier in this notebook; this later
# definition is the one in effect for the cells that follow.
def gen_inStr(values):
    """Build a predicate testing whether a string contains any of ``values``.

    Parameters
    ----------
    values : iterable of str
        Substrings to look for. They are compared against the lower-cased
        input, so they should be given in lower case.

    Returns
    -------
    callable
        ``inStr(inputStr)`` -> ``True`` if ``inputStr`` is a string whose
        lower-cased form contains at least one of ``values``; ``False`` for
        non-matching strings and for any non-string input (e.g. NaN, None).
    """
    # Materialise once so a one-shot iterable is not exhausted by the first call.
    needles = tuple(values)

    def inStr(inputStr):
        # isinstance (not ``type(...) == str``) so str subclasses such as
        # numpy.str_ are treated as text too.
        if isinstance(inputStr, str):
            lowered = inputStr.lower()
            return any(needle in lowered for needle in needles)
        return False
    return inStr

# The MDR16-18 rows carry the intake status as free text in PxCondition.
# Derive the per-status flag columns for those rows by keyword matching on the
# lower-cased text. This writes into TxFailure for those rows as well, replacing
# whatever the MDR16-18 sheet's own TxFailure column held there.
_is_16_18 = mdrAll.BD_ID == "MDR16-18"
_px_condition_16_18 = mdrAll.loc[_is_16_18, "PxCondition"]

for _flag_col, _keywords in [
    ("PxNew", ["nuevo"]),
    ("PxRelapse", ["recaída", "antes tratado"]),
    ("PxLost", ["abandon"]),
    ("TxFailure", ["fracaso"]),
]:
    mdrAll.loc[_is_16_18, _flag_col] = _px_condition_16_18.map(gen_inStr(_keywords))

In [111]:
# Keep only the rows that have a DAS value. The result is an independent copy,
# so columns added to it later do not touch mdrAll.
# Not applied: an ID-based filter for the MDR16-18 rows that previously sat
# here as commented-out code.
mdrAll_valid = mdrAll.loc[mdrAll.DAS.notna()].copy()

In [122]:
def _to_datetime_or_input(values):
    """Parse ``values`` with ``pd.to_datetime``; return ``values`` unchanged on failure.

    Stands in for ``pd.to_datetime(..., errors="ignore")``, which pandas has
    deprecated in favour of catching the parsing exception explicitly.
    """
    try:
        return pd.to_datetime(values)
    except (ValueError, TypeError, OverflowError):
        return values


# Two digits following "MDR" in the cohort tag, e.g. "MDR14-16" -> 14.
mdrAll_valid["Year"] = mdrAll_valid.BD_ID.map(lambda x: int(x[3:5]))

# For each date column: "<name>_" is the best-effort parsed column (left as-is
# when the column cannot be parsed as a whole) and "<name>_Year" is the year of
# each value, NaN where a value has no usable ``.year``.
mdrAll_valid["TxDate_"] = _to_datetime_or_input(mdrAll_valid.TxDate)
mdrAll_valid["TxDate_Year"] = mdrAll_valid.TxDate_.map(tryfun(lambda x: x.year))
mdrAll_valid["DxDate_"] = _to_datetime_or_input(mdrAll_valid.DxDate)
mdrAll_valid["DxDate_Year"] = mdrAll_valid.DxDate_.map(tryfun(lambda x: x.year))
mdrAll_valid["ExitDate_"] = _to_datetime_or_input(mdrAll_valid.ExitDate)
mdrAll_valid["ExitDate_Year"] = mdrAll_valid.ExitDate_.map(tryfun(lambda x: x.year))

# PDSDate is parsed value by value: ASCII letters, commas and periods are
# stripped from text entries first, then each value is parsed on its own so one
# bad entry does not block the rest. The regex is a raw string so that "\," and
# "\." are not invalid string escapes.
mdrAll_valid["PDSDate_"] = (
    mdrAll_valid.PDSDate
    .map(lambda x: re.sub(r"[a-zA-Z\,\.]", "", x).strip() if isinstance(x, str) else x)
    .map(_to_datetime_or_input)
)
mdrAll_valid["PDSDate_Year"] = mdrAll_valid.PDSDate_.map(tryfun(lambda x: x.year))
mdrAll_valid["GxDate_"] = _to_datetime_or_input(mdrAll_valid.DateGXPERT)

# Normalise department names (lower-case, trimmed) and look up their code.
# The ``.str`` accessor passes missing values through as NaN instead of raising
# the way ``map(str.lower)`` does on a non-string value.
mdrAll_valid["deptocode"] = (
    mdrAll_valid.Depto.str.lower().str.strip().map(tryfun(depto2code.get))
)

In [95]:
# Write the filtered, augmented cohort table as CSV (row index included).
# The target file name carries no extension.
mdrAll_valid.to_csv(
    path_or_buf="../../Outcome Measurement Data/TUBERCULOSIS/COHORTES/MDR-2014-2017"
)

Unnamed: 0,PDSDate,BD_ID,ID,ExitCondition
2,HR,MDR14-16,3,Vivo
8,SospechosoMDR,MDR14-16,9,ambulatorio
16,SospechosoMDR,MDR14-16,17,ambulatorio
18,SospechosoMDR,MDR14-16,19,ambulatorio
37,,MDR14-16,38,fallecido
62,NaT,MDR15-17,15,AMBULATORIO
73,NaT,MDR15-17,26,
74,NaT,MDR15-17,27,
84,NaT,MDR15-17,37,
85,NaT,MDR15-17,38,
