In [1]:
import nltk
import pandas as pd
import requests
import seaborn as sns
import shutil
from IPython.display import display

from pathlib import Path
from typing import Optional

In [2]:
# Global plot styling for the notebook: seaborn "whitegrid" style with the "talk" context.
# NOTE(review): the output recorded under this cell shows an NLTK stopwords download
# and a `True` result, which neither statement below produces -- presumably stale
# output from an earlier version of the cell; confirm and re-run.
sns.set_style("whitegrid")
sns.set_context("talk")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def download_file(url:str, local_filename:Optional[str]=None):
    """Stream the resource at ``url`` into a local file and return its path.

    Args:
        url: Address of the file to download.
        local_filename: Destination path. When omitted, the last ``/``-separated
            segment of the URL is used, ignoring any query string or fragment
            (so ``.../archive.zip?dl=1`` is saved as ``archive.zip``).

    Returns:
        The path the content was written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if local_filename is None:
        # Drop "#fragment" and "?query" so they do not end up in the file name.
        url_without_query = url.split('#', 1)[0].split('?', 1)[0]
        local_filename = url_without_query.split('/')[-1]
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an HTML error page as if it were the file.
        r.raise_for_status()
        # r.raw bypasses requests' content decoding; ask urllib3 to undo any
        # gzip/deflate transfer encoding while the stream is copied.
        r.raw.decode_content = True
        # Make sure the destination folder exists (e.g. 'data/data.zip').
        Path(local_filename).parent.mkdir(parents=True, exist_ok=True)
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename

def unzip_file(path_to_zip_file:str, directory_to_extract_to:str):
    """Extract the archive ``path_to_zip_file`` into ``directory_to_extract_to``.

    Thin wrapper around ``shutil.unpack_archive``; no explicit format is passed,
    so ``shutil`` picks the unpacker itself. Nothing is returned.
    """
    shutil.unpack_archive(
        filename=path_to_zip_file,
        extract_dir=directory_to_extract_to,
    )

In [4]:
# Download and unpack the dataset unless every expected CSV is already present.
data_path = Path('data')
required_files = ['laboratorios.csv', 'notas.csv', 'sociodemografico.csv']
# Re-fetch when ANY file is missing: testing "all three are missing" would skip
# the download for a partially populated folder and break the later reads.
if not all((data_path / file_name).is_file() for file_name in required_files):
    data_zip = download_file('https://www.dropbox.com/sh/xgs3kyvyn7lmr6p/AACB4eORnqsJpRsjv9-56eUHa?dl=1', 'data.zip')
    unzip_file(data_zip, data_path)
    # The downloaded bundle contains a second archive (presumably holding the CSVs
    # checked above -- confirm); unpack it into the same folder.
    unzip_file(data_path / 'Diana Buitrago - IQVIA_NLPmediaclNotes_DianaBuitrago.zip', data_path)

In [5]:
# Download and unpack the dataset unless every expected CSV is already present.
# NOTE(review): this cell repeats the previous download cell except for where the
# outer archive is saved; consider keeping only one of the two.
data_path = Path('data')
required_files = ['laboratorios.csv', 'notas.csv', 'sociodemografico.csv']
# Re-fetch when ANY file is missing: testing "all three are missing" would skip
# the download for a partially populated folder and break the later reads.
if not all((data_path / file_name).is_file() for file_name in required_files):
    # The archive is saved inside data/, so the folder must exist before writing.
    data_path.mkdir(parents=True, exist_ok=True)
    data_zip = download_file('https://www.dropbox.com/sh/xgs3kyvyn7lmr6p/AACB4eORnqsJpRsjv9-56eUHa?dl=1', 'data/data.zip')
    unzip_file(data_zip, data_path)
    # The downloaded bundle contains a second archive (presumably holding the CSVs
    # checked above -- confirm); unpack it into the same folder.
    unzip_file(data_path / 'Diana Buitrago - IQVIA_NLPmediaclNotes_DianaBuitrago.zip', data_path)

In [6]:
# Load the semicolon-delimited notes table and preview its first rows.
notas_csv = data_path / "notas.csv"
notas = pd.read_csv(notas_csv, sep=';')
notas.head()

Unnamed: 0,IDRecord,Código,Nombre,Tipo,Plan
0,44600,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,- ORDENO TAR ABC +3TC +ATV/r - PROFILAXIS NO ...
1,45038,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,- TAF/FTC/EVG/C MIPRES POR 2 MESES 20200602158...
2,40391,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,usuaria la cual se ve pertinente seguimiento d...
3,106350,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,1. Se formula TAR (TDF/FTC+EFV) 2. S/S Paracl...
4,105840,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,EDUCACIÓN Se brinda retroalimentación con rel...


Cleanup based on initial EDA

In [7]:
from utils.preprocessing_utils import preprocess_notas

# Run the project's `preprocess_notas` cleanup on the notes and preview the result.
# `notas` is rebound to the cleaned frame, so re-running this cell cleans it again.
notas = notas.pipe(preprocess_notas)
notas.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,IDRecord,Código,Nombre,Tipo,Plan
0,44600.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,- ORDENO TAR ABC +3TC +ATV/r - PROFILAXIS - F...
1,45038.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,- TAF/FTC/EVG/C MIPRES 2 MESES 202006021580194...
2,40391.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,usuaria ve pertinente seguimiento dentro mes m...
3,106350.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,1. formula TAR (TDF/FTC+EFV) 2. S/S Paraclini...
4,105840.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,EDUCACION brinda retroalimentacion relacion r...


Let's check our target feature distribution

In [8]:
# Number of notes per (diagnosis name, code) pair and each pair's share of the total.
name = notas[['Nombre', 'Código']].value_counts().to_frame('Count')
total_notes = name['Count'].sum()
name['Percentage'] = (name['Count'] / total_notes * 100).round(2)
name

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Percentage
Nombre,Código,Unnamed: 2_level_1,Unnamed: 3_level_1
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",A530,60587,43.22
"SIFILIS, NO ESPECIFICADA",A539,47408,33.82
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E119,17439,12.44
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E109,6278,4.48
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",E149,2808,2.0
OTRAS SIFILIS SECUNDARIAS,A514,2614,1.86
"SIFILIS TARDIA, NO ESPECIFICADA",A529,1970,1.41
SIFILIS GENITAL PRIMARIA,A510,977,0.7
SIFILIS PRIMARIA ANAL,A511,94,0.07


After the data cleanup, two of the 9 classes each account for less than 1% of the data; together they add up to ~1,000 of the ~140,000 samples in our dataset.
We can't easily train a prediction algorithm on such a small number of samples, so let's either drop them or merge them into similar categories.

## Classes merge

Let's try merging A510 and A511 with A514, as they all belong to the [A51 Early syphilis](https://icd.who.int/browse10/2019/en#/A51) ICD-10 denomination, indicating they share symptoms. The equivalent Spanish name for this category is [Sífilis precoz](http://ais.paho.org/classifications/chapters/CAP01.html?zoom_highlight=a51)

In [9]:
from utils.preprocessing_utils import merge_classes

# Merge the sparse classes with the project's `merge_classes` helper, then recompute
# the number of notes per (diagnosis name, code) pair and each pair's share of the total.
notas = notas.pipe(merge_classes)
name = notas[['Nombre', 'Código']].value_counts().to_frame('Count')
total_notes = name['Count'].sum()
name['Percentage'] = (name['Count'] / total_notes * 100).round(2)
name

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Percentage
Nombre,Código,Unnamed: 2_level_1,Unnamed: 3_level_1
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",A530,60587,43.22
"SIFILIS, NO ESPECIFICADA",A539,47408,33.82
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E119,17439,12.44
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E109,6278,4.48
SIFILIS PRECOZ,A51,3685,2.63
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",E149,2808,2.0
"SIFILIS TARDIA, NO ESPECIFICADA",A529,1970,1.41


## Word count

According to the [CDC](https://www.cdc.gov/std/syphilis/stdfact-syphilis-detailed.htm), primary syphilis is characterized by a chancre mark where the disease enters the body. There is also a possibility of having extra sores in your body, but there does not seem to be any difference per se in the development of its condition based on where the disease started. Another clear indication of syphilis is Saber shin (pierna/tibia en sable).
There is also a reduction in cognitive abilities in patients who have been suffering from syphilis for some time, and this can be tested for using a simple test called the [Clock Drawing Test](https://www2.gov.bc.ca/assets/gov/health/practitioner-pro/bc-guidelines/cogimp-clock-drawing-test.pdf), or [Test del Reloj](https://www.sanitas.es/sanitas/seguros/es/particulares/biblioteca-de-salud/tercera-edad/demencias/test-reloj.html) in Spanish.

Additionally, a main characteristic of primary syphilis seems to be chancres, as well as sores for both Primary and Secondary Syphilis, making a case for creating a new numerical variable called "chancres". Another main characteristic of Syphilis care is the push to use condoms (preservativos in Spanish) in order to reduce the possibility of other people being infected as well, which could help differentiate between Syphilis and Diabetes.

In [26]:

# For every diagnosis, count how many notes mention each keyword at least once in
# `Plan` (text is lower-cased first; each entry is applied as a regular expression)
# and express that count as a percentage of the notes in the class.
notas_eda = notas.copy()
words_to_check = ['chancro', 'llaga', 'preservativo', 'sifili', 'asintoma', 'placa', r'(test.*reloj)', 'sable', 'penici', 'antibio']
# Lower-case the text once instead of once per keyword.
plan_lower = notas_eda.Plan.str.lower()
aggregate_dict = {}
for word in words_to_check:
    # Cap the match count at 1 so a note contributes at most once per keyword.
    notas_eda[word] = plan_lower.str.count(word).clip(upper=1)
    aggregate_dict[word] = ['sum']
# Notes per diagnosis: the denominator for the percentages below.
aggregate_dict['Nombre'] = ['count']
notas_eda = notas_eda.groupby(['Nombre', 'Código'])[words_to_check + ['Nombre']].aggregate(aggregate_dict)

notes_per_class = notas_eda.loc(axis=1)['Nombre', 'count']
for word in words_to_check:
    notas_eda.loc(axis=1)[word, '%'] = (notas_eda.loc(axis=1)[word, 'sum'] / notes_per_class * 100).round(2)
# Show every column; the table is wider than pandas' default display limit.
with pd.option_context('display.max_columns', None):
    display(notas_eda.sort_index(axis=1).sort_values(by=['Código']))

Unnamed: 0_level_0,Unnamed: 1_level_0,(test.*reloj),(test.*reloj),Nombre,antibio,antibio,asintoma,asintoma,chancro,chancro,llaga,llaga,penici,penici,placa,placa,preservativo,preservativo,sable,sable,sifili,sifili
Unnamed: 0_level_1,Unnamed: 1_level_1,%,sum,count,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum
Nombre,Código,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
SIFILIS PRECOZ,A51,4.91,181,3685,0.11,4,5.37,198,0.03,1,0.0,0,3.01,111,0.14,5,52.81,1946,15.2,560,3.66,135
"SIFILIS TARDIA, NO ESPECIFICADA",A529,3.96,78,1970,0.0,0,2.18,43,0.0,0,0.05,1,3.86,76,1.02,20,44.37,874,12.94,255,2.03,40
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",A530,2.91,1761,60587,0.1,58,3.07,1861,0.0,1,0.01,4,2.95,1785,1.14,691,49.07,29732,9.24,5596,3.67,2225
"SIFILIS, NO ESPECIFICADA",A539,1.62,767,47408,0.06,28,1.49,705,0.0,2,0.0,1,4.68,2219,0.58,277,36.76,17428,5.12,2429,16.93,8025
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E109,1.47,92,6278,2.8,176,1.85,116,0.0,0,0.02,1,0.33,21,0.57,36,22.35,1403,5.35,336,2.63,165
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E119,1.32,230,17439,2.99,522,2.44,426,0.0,0,0.0,0,0.19,33,0.63,110,26.4,4604,5.05,880,0.88,154
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",E149,1.28,36,2808,5.38,151,2.35,66,0.0,0,0.0,0,0.21,6,0.36,10,20.55,577,5.02,141,2.81,79


For diabetes, we can try adding insulin and glucose as words of interest. Ketoacidosis is another relevant word, which we can split into keto and acido to see if we can capture more information. Diabetics normally need to keep tight control of their carbohydrate intake, so we also look for carbo (as in carbohydrates) to see whether the doctors give any recommendation on their diet.

In [25]:
# For every diagnosis, count how many notes mention each keyword at least once in
# `Plan` (text is lower-cased first; each entry is applied as a regular expression)
# and express that count as a percentage of the notes in the class.
notas_eda = notas.copy()
words_to_check = ['ampolla', 'diabet', 'insulin', 'gluco', 'carbo', 'keto', 'acido', 'nutri', 'diet', 'dependiente']
# Lower-case the text once instead of once per keyword.
plan_lower = notas_eda.Plan.str.lower()
aggregate_dict = {}
for word in words_to_check:
    # Cap the match count at 1 so a note contributes at most once per keyword.
    notas_eda[word] = plan_lower.str.count(word).clip(upper=1)
    aggregate_dict[word] = ['sum']
# Notes per diagnosis: the denominator for the percentages below.
aggregate_dict['Nombre'] = ['count']
notas_eda = notas_eda.groupby(['Nombre', 'Código'])[words_to_check + ['Nombre']].aggregate(aggregate_dict)

notes_per_class = notas_eda.loc(axis=1)['Nombre', 'count']
for word in words_to_check:
    notas_eda.loc(axis=1)[word, '%'] = (notas_eda.loc(axis=1)[word, 'sum'] / notes_per_class * 100).round(2)
# Show every column; the table is wider than pandas' default display limit.
with pd.option_context('display.max_columns', None):
    display(notas_eda.sort_index(axis=1).sort_values(by=['Código']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Nombre,acido,acido,ampolla,ampolla,carbo,carbo,dependiente,dependiente,diabet,diabet,diet,diet,gluco,gluco,insulin,insulin,keto,keto,nutri,nutri
Unnamed: 0_level_1,Unnamed: 1_level_1,count,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum
Nombre,Código,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
SIFILIS PRECOZ,A51,3685,0.41,15,0.24,9,3.88,143,1.22,45,0.14,5,7.54,278,1.03,38,0.03,1,0.08,3,26.92,992
"SIFILIS TARDIA, NO ESPECIFICADA",A529,1970,1.52,30,0.2,4,2.99,59,1.68,33,0.15,3,8.93,176,1.73,34,0.0,0,0.0,0,31.73,625
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",A530,60587,0.92,558,0.38,233,3.26,1973,1.44,875,0.26,160,8.38,5078,1.08,652,0.09,56,0.07,42,26.07,15795
"SIFILIS, NO ESPECIFICADA",A539,47408,1.7,808,0.99,467,2.18,1035,0.7,332,0.28,133,7.66,3633,1.01,478,0.14,67,0.06,29,29.69,14076
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E109,6278,6.9,433,2.1,132,3.17,199,0.67,42,5.16,324,13.35,838,12.68,796,20.56,1291,2.2,138,34.12,2142
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E119,17439,6.46,1126,1.29,225,2.07,361,1.23,214,5.55,968,11.72,2044,5.84,1019,4.03,703,2.24,390,25.53,4452
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",E149,2808,5.59,157,1.07,30,2.6,73,0.68,19,6.16,173,13.35,375,9.44,265,5.31,149,4.95,139,24.64,692


- There do not seem to be many mentions of chancre (chancro) or sore (llaga/placa).
- Blister (ampolla, the term checked in the table above) does seem to be more common in patients with diabetes, although it is still negligible.
- Saber (sable) seems to help differentiate all but one type of syphilis (A539, unspecified syphilis) from diabetes.
- asintoma seems to be useful for separating other secondary syphilis from the rest of the diseases.
- There does seem to be a significant difference in how often the word condom (preservativo) is used between Syphilis and Diabetes.
- insulin seems like a good choice for separating E109 from the rest.
- acido, keto and diet seem to also help differentiate between syphilis and diabetes.
- Using 'diabet' and 'sifili' could help differentiate between the diagnoses of Diabetes and Syphilis.

In [19]:
from utils.preprocessing_utils import word_count_feat_engineering

# Run the project's `word_count_feat_engineering` helper on the notes and show the
# first ten rows with every column visible.
notas = notas.pipe(word_count_feat_engineering)
with pd.option_context('display.max_columns', None):
    display(notas.head(10))

Unnamed: 0,IDRecord,Código,Nombre,Tipo,Plan,acido,diabet,diet,gluco,insulina,keto,(test.*reloj.*orden),asintoma,preservativo,sable,sifili
0,44600.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,- ORDENO TAR ABC +3TC +ATV/r - PROFILAXIS - F...,0,0,0,0,0,0,0,0,1,0,0
1,45038.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,- TAF/FTC/EVG/C MIPRES 2 MESES 202006021580194...,0,0,0,0,0,0,0,0,0,0,0
2,40391.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,usuaria ve pertinente seguimiento dentro mes m...,0,0,0,0,0,0,0,0,0,0,0
3,106350.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,1. formula TAR (TDF/FTC+EFV) 2. S/S Paraclini...,0,0,0,0,0,0,0,0,0,0,0
4,105840.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,EDUCACION brinda retroalimentacion relacion r...,0,0,0,0,0,0,0,0,1,0,0
5,188030.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,1. Continuar TAR : ABC/3TC+EFV. 2. Tomar TAR ...,0,0,0,0,0,0,0,0,2,0,0
6,41590.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,) continua manejo iniciado 19/02/2020 ABC/3TC/...,0,0,0,0,0,0,0,0,1,0,0
7,111487.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,Control segun frecuencias,0,0,0,0,0,0,0,0,0,0,0
8,69603.0,E119,DIABETES MELLITUS NOINSULINODEPENDIENTE SIN ME...,Confirmado Repetido,realiza control enfermeria 16 OCTUBRE 2020. As...,0,0,0,0,0,0,0,0,2,0,0
9,69563.0,A51,SIFILIS PRECOZ,Confirmado Repetido,1. TRATAMIENTO TDF/FTC/EFV 2. PROFILAXIS:3....,0,0,0,0,0,0,0,0,2,1,0
