In [4]:
import matplotlib.pyplot as plt
import missingno as msno
import nltk
import numpy as np
import pandas as pd
import re
import requests
import seaborn as sns
import shutil
import textwrap
import matplotlib
from nltk.corpus import stopwords
from pathlib import Path
from wordcloud import WordCloud
import unicodedata
from collections import Counter
from typing import Optional

In [18]:
# Plot appearance: seaborn "whitegrid" style with the "talk" context.
sns.set_style(style="whitegrid")
sns.set_context(context="talk")
# Fetch the NLTK stop-word corpus that the stop-word removal helper reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
def download_file(url:str, local_filename:Optional[str]=None):
    """Stream the resource at ``url`` into a local file and return that file's path.

    Args:
        url: Address of the resource to download.
        local_filename: Destination path. When ``None``, the last segment of
            the URL *path* is used, so query strings such as ``?dl=1`` and
            fragments never end up in the file name.

    Returns:
        The path the content was written to.

    Raises:
        ValueError: If no destination was given and none can be derived from ``url``.
        requests.HTTPError: If the server answers with an error status.
    """
    if local_filename is None:
        # Local import keeps this helper self-contained.
        from urllib.parse import urlsplit
        local_filename = urlsplit(url).path.split('/')[-1]
        if not local_filename:
            raise ValueError(f"Cannot derive a file name from URL: {url!r}")
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an error page as if it were the data.
        r.raise_for_status()
        # The raw stream is not content-decoded by default; ask urllib3 to undo
        # gzip/deflate so the bytes on disk are the actual payload.
        r.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename

def unzip_file(path_to_zip_file:str, directory_to_extract_to:str):
  """Unpack the archive at ``path_to_zip_file`` into ``directory_to_extract_to``.

  Thin wrapper around ``shutil.unpack_archive``; returns nothing.
  """
  shutil.unpack_archive(
      filename=path_to_zip_file,
      extract_dir=directory_to_extract_to,
  )

In [23]:
# Text-related functionality
def remove_stop_words(string_data:str, extra_stop_words:Optional[list]=None) -> str:
    """Delete Spanish stop words (plus any extra ones) from ``string_data``.

    Matching is case-insensitive and restricted to whole words; whitespace
    that directly follows a removed word is removed with it.

    Args:
        string_data: Text to clean.
        extra_stop_words: Optional additional words to remove. They are
            matched literally, so regex metacharacters in them are safe.

    Returns:
        The text with all stop words removed.
    """
    # Copy so the list returned by NLTK is never modified, and avoid a
    # mutable default argument for the extras.
    stop_words = list(stopwords.words('spanish'))
    if extra_stop_words:
        stop_words.extend(extra_stop_words)
    # Escape each word: an unescaped metacharacter in an extra stop word
    # would otherwise change (or break) the pattern.
    alternation = '|'.join(re.escape(word) for word in stop_words)
    pattern = r'\b(?:' + alternation + r')\b\s*'
    return re.sub(pattern, '', string_data, flags=re.IGNORECASE)

def remove_characters(string_data:str, character_list:list)-> str:
    """Strip every occurrence of each entry of ``character_list`` from ``string_data``.

    Entries are removed one after another, in list order, and the cleaned
    text is returned.
    """
    cleaned = string_data
    for unwanted in character_list:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def convert_to_long_string(series_data:pd.Series) -> str:
    """Collapse a Series of texts into one cleaned, lower-case string.

    Steps: drop every character that is not an ASCII letter or a space,
    join the entries with single spaces, lower-case the result, remove
    Spanish stop words, remove '-' and ',', and squeeze runs of spaces.
    """
    non_letter_pattern = re.compile(r'[^A-Za-z ]')
    letters_only = series_data.str.replace(non_letter_pattern, '', regex=True)
    joined_text = letters_only.str.cat(sep=' ').lower()

    without_stop_words = remove_stop_words(joined_text)
    without_punctuation = remove_characters(without_stop_words, ['-', ','])
    # Removing words leaves gaps; reduce any run of spaces to a single one.
    return re.sub(' {2,}', ' ', without_punctuation)

def strip_accents(accented_string:str) -> str:
  """Return ``accented_string`` reduced to its plain ASCII characters.

  The text is decomposed (NFD) so accents become separate combining marks,
  then everything that is not ASCII is discarded.
  """
  decomposed = unicodedata.normalize('NFD', accented_string)
  ascii_bytes = decomposed.encode('ascii', errors='ignore')
  return ascii_bytes.decode("utf-8")

def find_top_k_words(string_value:str, k:int=5) -> list:
  """Return the ``k`` most frequent words of ``string_value``.

  The input is converted to ``str``, lower-cased, stripped of Spanish stop
  words and of '-' / ',' characters, then split on whitespace. The result
  is a list of ``(word, count)`` pairs as produced by ``Counter.most_common``.
  """
  lowered = str(string_value).lower()
  without_stop_words = remove_stop_words(lowered)
  cleaned = remove_characters(without_stop_words, ['-', ','])
  word_counts = Counter(cleaned.split())
  return word_counts.most_common(k)

In [8]:
# Download and unzip the data unless every expected CSV is already present
data_path = Path('data')
required_files = ['laboratorios.csv', 'notas.csv', 'sociodemografico.csv']
# Fetch when *any* file is missing: requiring all three to be absent would
# skip the download for a partially populated data directory.
if not all((data_path / file_name).is_file() for file_name in required_files):
  data_zip = download_file('https://www.dropbox.com/sh/xgs3kyvyn7lmr6p/AACB4eORnqsJpRsjv9-56eUHa?dl=1', 'data.zip')
  unzip_file(data_zip, data_path)
  # The downloaded archive contains a second archive holding the CSV files.
  unzip_file(data_path / 'Diana Buitrago - IQVIA_NLPmediaclNotes_DianaBuitrago.zip', data_path)

In [9]:
# Download and unzip the data unless every expected CSV is already present
# NOTE(review): this cell repeats the previous one except for where the archive is saved.
data_path = Path('data')
required_files = ['laboratorios.csv', 'notas.csv', 'sociodemografico.csv']
# Fetch when *any* file is missing: requiring all three to be absent would
# skip the download for a partially populated data directory.
if not all((data_path / file_name).is_file() for file_name in required_files):
  # The archive is written inside the data directory, so it must exist first.
  data_path.mkdir(parents=True, exist_ok=True)
  data_zip = download_file('https://www.dropbox.com/sh/xgs3kyvyn7lmr6p/AACB4eORnqsJpRsjv9-56eUHa?dl=1', 'data/data.zip')
  unzip_file(data_zip, data_path)
  # The downloaded archive contains a second archive holding the CSV files.
  unzip_file(data_path / 'Diana Buitrago - IQVIA_NLPmediaclNotes_DianaBuitrago.zip', data_path)

In [24]:
# Load the clinical notes (semicolon-separated CSV) and preview the first rows.
notas = pd.read_csv(data_path / "notas.csv", sep=';')
notas.head()

Unnamed: 0,IDRecord,Código,Nombre,Tipo,Plan
0,44600,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,- ORDENO TAR ABC +3TC +ATV/r - PROFILAXIS NO ...
1,45038,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,- TAF/FTC/EVG/C MIPRES POR 2 MESES 20200602158...
2,40391,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,usuaria la cual se ve pertinente seguimiento d...
3,106350,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,1. Se formula TAR (TDF/FTC+EFV) 2. S/S Paracl...
4,105840,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,EDUCACIÓN Se brinda retroalimentación con rel...


Cleanup based on initial EDA

In [25]:
# Dropping null values from IDRecord
notas.dropna(subset=['IDRecord'], inplace=True)

# Drop samples where both Code and Name are null
notas.dropna(how='all', subset=['Código', 'Nombre'], inplace = True)

# Drop bad data from IDRecord (non-numeric values become NaN and are dropped)
notas['IDRecord'] = pd.to_numeric(notas['IDRecord'], errors='coerce')
notas.dropna(subset=['IDRecord'], inplace = True)

# Repair rows whose values sit one column too far left: their 'Nombre' holds
# the Tipo value 'Confirmado Repetido' and their 'Código' holds the diagnosis name.
shifted_rows = notas['Nombre'] == 'Confirmado Repetido'
# Guarded so that running the cell when nothing is left to repair (e.g. a
# re-run) is a no-op instead of an IndexError.
if shifted_rows.any():
    # Name -> code lookup built only from well-formed rows, so a shifted row
    # can never supply its own misplaced name as the code.
    code_by_name = (
        notas.loc[~shifted_rows, ['Nombre', 'Código']]
        .dropna()
        .drop_duplicates(subset='Nombre')
        .set_index('Nombre')['Código']
    )
    misplaced_names = notas.loc[shifted_rows, 'Código']
    # Move each value one column to the right ...
    notas.loc[shifted_rows, ['Nombre', 'Tipo', 'Plan']] = notas.loc[shifted_rows, ['Código', 'Nombre', 'Tipo']].to_numpy()
    # ... and look the code up per row (names without a known code become NaN).
    notas.loc[shifted_rows, 'Código'] = misplaced_names.map(code_by_name)

# Remove accents from Plan
# Note: astype(str) turns missing plans into the literal text 'nan'.
notas['Plan'] = notas.Plan.astype(str).apply(strip_accents)

# Remove stop words from Plan
# NOTE(review): accents are stripped before this step, so accented entries of
# the stop-word list presumably no longer match their unaccented form - confirm.
notas['Plan'] = notas.Plan.astype(str).apply(remove_stop_words)
notas.head()

Unnamed: 0,IDRecord,Código,Nombre,Tipo,Plan
0,44600.0,A539,"SIFILIS, NO ESPECIFICADA",Confirmado Repetido,- ORDENO TAR ABC +3TC +ATV/r - PROFILAXIS - F...
1,45038.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,- TAF/FTC/EVG/C MIPRES 2 MESES 202006021580194...
2,40391.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,usuaria ve pertinente seguimiento dentro mes m...
3,106350.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,1. formula TAR (TDF/FTC+EFV) 2. S/S Paraclini...
4,105840.0,A530,"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O...",Confirmado Repetido,EDUCACION brinda retroalimentacion relacion r...


Let's check our target feature distribution

In [29]:
# Notes per (diagnosis name, code) pair and the share of the data set each pair represents.
name = (
    notas[['Nombre', 'Código']]
    .value_counts()
    .to_frame('Count')
    .assign(Percentage=lambda counts: counts['Count'] / counts['Count'].sum() * 100)
)
name

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Percentage
Nombre,Código,Unnamed: 2_level_1,Unnamed: 3_level_1
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",A530,60587,43.222401
"SIFILIS, NO ESPECIFICADA",A539,47408,33.820581
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E119,17439,12.440877
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,E109,6278,4.478687
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",E149,2808,2.00321
OTRAS SIFILIS SECUNDARIAS,A514,2614,1.864812
"SIFILIS TARDIA, NO ESPECIFICADA",A529,1970,1.405386
SIFILIS GENITAL PRIMARIA,A510,977,0.696986
SIFILIS PRIMARIA ANAL,A511,94,0.067059


After the data cleanup, two of the nine classes (A510 and A511) each account for less than 1% of the data; together they contribute ~1,000 of the ~140,000 samples in our dataset.
We can't easily build a prediction algorithm from such a small number of samples, so let's either drop them or merge them into similar categories.

## Word count

According to the [CDC](https://www.cdc.gov/std/syphilis/stdfact-syphilis-detailed.htm), primary syphilis is characterized by a chancre at the site where the disease enters the body. Additional sores may appear elsewhere on the body, but the development of the condition does not seem to differ per se depending on where the disease started. Based on this, an argument can be made for merging A511 with A510 in order to reduce the number of classes, given the similarity of those two conditions.

Additionally, a main characteristic of primary syphilis seems to be chancres, as well as sores for both Primary and Secondary Syphilis, making a case for creating a new numerical variable called "chancres". Another main characteristic of Syphilis notes is the recommendation to use condoms ("preservativos" in Spanish) in order to reduce the possibility of other people being infected as well, which could help differentiate between Syphilis and Diabetes.

In [107]:
# Per-diagnosis keyword statistics: for each diagnosis name, how many notes
# mention each keyword at least once, and what percentage of that diagnosis'
# notes this represents.

# Work on a copy so the keyword columns are not added to `notas`.
notas_eda = notas.copy()
# Keywords (some are word stems) searched for in the lower-cased Plan text.
words_to_check = ['chancro', 'llaga', 'ulcera', 'preservativo', 'diabet', 'sifilis', 'asintoma']
aggregate_dict = {}
for word in words_to_check:
    # Number of matches of the keyword in each note ...
    notas_eda[word] = notas_eda.Plan.str.lower().str.count(word)
    # ... capped at 1, turning the column into a "note mentions it" flag.
    notas_eda.loc[notas_eda[word] > 1, word] = 1
    # Summing the 0/1 flags per group gives the number of notes with a mention.
    aggregate_dict[word] = ['sum']
# Group size: number of notes per diagnosis name.
aggregate_dict['Nombre'] = ['count']
# From here on `notas_eda` is the aggregated table with (column, statistic) headers.
notas_eda = notas_eda.groupby('Nombre')[words_to_check + ['Nombre']].aggregate(aggregate_dict)

for word in words_to_check:
    # Add a (word, '%') column: share of the diagnosis' notes mentioning the keyword, rounded to 2 decimals.
    notas_eda.loc(axis=1)[word, '%'] = (notas_eda.loc(axis=1)[word, 'sum'] / notas_eda.loc(axis=1)['Nombre', 'count'] * 100).round(2)
# Display with the columns sorted; the sorted frame is not assigned back.
notas_eda.sort_index(axis=1)

Unnamed: 0_level_0,Nombre,asintoma,asintoma,chancro,chancro,diabet,diabet,llaga,llaga,preservativo,preservativo,sifilis,sifilis,ulcera,ulcera
Unnamed: 0_level_1,count,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum,%,sum
Nombre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
DIABETES MELLITUS NOINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,17439,2.44,426,0.0,0,5.55,968,0.0,0,26.4,4604,0.88,154,1.27,221
"DIABETES MELLITUS, NO ESPECIFICADA SIN MENCION DE COMPLICACION",2808,2.35,66,0.0,0,6.16,173,0.0,0,20.55,577,2.81,79,2.74,77
DIABETES MELLITUSINSULINODEPENDIENTE SIN MENCION DE COMPLICACION,6278,1.85,116,0.0,0,5.16,324,0.02,1,22.35,1403,2.63,165,1.72,108
OTRAS SIFILIS SECUNDARIAS,2614,7.19,188,0.0,0,0.19,5,0.0,0,55.28,1445,3.79,99,0.08,2
SIFILIS GENITAL PRIMARIA,977,1.02,10,0.1,1,0.0,0,0.0,0,44.32,433,3.07,30,0.31,3
"SIFILIS LATENTE, NO ESPECIFICADA COMO PRECOZ O TARDIA",60587,3.07,1861,0.0,1,0.26,160,0.01,4,49.07,29732,3.64,2203,0.04,24
SIFILIS PRIMARIA ANAL,94,0.0,0,0.0,0,0.0,0,0.0,0,72.34,68,4.26,4,1.06,1
"SIFILIS TARDIA, NO ESPECIFICADA",1970,2.18,43,0.0,0,0.15,3,0.05,1,44.37,874,1.98,39,0.0,0
"SIFILIS, NO ESPECIFICADA",47408,1.49,705,0.0,2,0.28,133,0.0,1,36.76,17428,16.91,8015,0.07,32


- There do not seem to be many mentions of chancre (chancro) or sore (llaga).
- There does seem to be a significant difference in how often the word condom (preservativo) is used between Syphilis and Diabetes notes.
- Using 'diabet' and 'sifilis' could help differentiate between the diagnoses of Diabetes and Syphilis.