In [112]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
import pyAgrum as gum
from sklearn.dummy import DummyClassifier

# Modelos utilizando os diagnósticos de entrada

In [113]:
df = pd.read_csv('https://raw.githubusercontent.com/ImagineDogs/TranstornosMentais/main/dados/df_cid10.csv')
table = df[['cid10_faixa', 'cid10_faixa_alta', 'cid10_seg_faixa']].loc[~df['cid10_faixa'].isnull()]
table = table.fillna('Sem')

Para dar inicio as predições é necessário utilizar um encoder adequado para os dados, por se tratarem de categorias independentes, ou seja, não possuem ordem, será utilizaro o OneHotEncoder que transforma cada categoria em uma coluna de valores binarios.

In [114]:
encX = OneHotEncoder()
ency = OrdinalEncoder()

X = table[['cid10_faixa', 'cid10_seg_faixa']]
X = encX.fit_transform(X)
y = np.array(table['cid10_faixa_alta'])
y = ency.fit_transform(y.reshape(-1, 1)).reshape(1, -1)[0]

Para fazer análises mais profundas de configurações podemos separar nossos dados em  treino, teste e validação, porém neste primeiro momento vamos utilizar apenas treino e teste por se tratar de uma abordagem mais simplista inicialmente. Contúdo uma função para as três separações já pode ser mantida pronta.

In [115]:
def train_test_val_split(X, y, test_size=None, val_size = None,random_state=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
    
    return X_train.toarray(), X_test.toarray(), X_val.toarray(), y_train, y_test, y_val

# X_train, X_test, X_val, y_train, y_test, y_val = train_test_val_split(X, y, test_size=0.2, val_size = 0.05,random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, = X_train.toarray(), X_test.toarray()

In [116]:
print(X_train.shape)
X_train

(6574, 23)


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [117]:
print(y_train.shape)
y_train

(6574,)


array([1., 1., 2., ..., 1., 3., 3.])

Serão utilizados modelos explicativos em um primeiro momento para mantermos a explicabilidade das predições.

Os dois modelos escolhidos foram NaiveBayes e DecisionTree por serem algoritmos básicos porém eficazes.

Será utilizado também um modelo baseline para comparação.

In [118]:
# Avaliação dos modelos
def evaluate(y_pred, y_test):
    acuracia = accuracy_score(y_pred, y_test)
    f1 = f1_score(y_pred, y_test, average='weighted')

    print('Resultados:')
    print(f'    Acuracia: {acuracia}')
    print(f'    F1: {f1}')
    return acuracia, f1

### Modelo Dummy para baseline de comparacao

In [119]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

y_pred = dummy.predict(X_test)
acuracia_dummy, f1_dummy = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.37165450121654503
    F1: 0.5419068736141907


### Método Naive Bayes

In [120]:
nb = CategoricalNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
acuracia_nb, f1_nb = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.7390510948905109
    F1: 0.7488247891294786


### Árvore de Decisão

Para a Árvore de Decisão será utilizado GridSearch para encontrar os melhores parâmetros dentro de um escopo que pode ser escolhido.

Vale lembrar que GridSearch testa cada combinação de parâmetros passados em um K-Fold Cross Validation, utilizando a combinação que melhor desempenhar.
Neste caso serão 5 folds com a métrica F1 balanceada.

In [121]:
tc = DecisionTreeClassifier(random_state=42)
param_tc = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random'], 'max_depth': range(5, 20, 1)}


gs_tc = GridSearchCV(tc, param_tc, cv=5, scoring='f1_weighted')


best_model_tc = gs_tc.fit(X_train, y_train)
print(best_model_tc.best_params_)



{'criterion': 'entropy', 'max_depth': 7, 'splitter': 'best'}


In [122]:
# Gerador de imagem pra a árvore

# Definindo os labels das features
model = DecisionTreeClassifier(random_state=42, criterion=best_model_tc.best_params_['criterion'], max_depth=best_model_tc.best_params_['max_depth'], splitter=best_model_tc.best_params_['splitter'])
model.fit(X_train, y_train)
df2 = pd.DataFrame(encX.inverse_transform(X_train), columns=['cid10_faixa', 'cid10_seg_faixa'])
labels = df2['cid10_faixa'].apply(lambda x: 'cid10_faixa_' + x).unique().tolist()
labels.extend(df2['cid10_seg_faixa'].apply(lambda x: 'cid10_seg_faixa_' + x).unique().tolist())
labels

#Gerando o gráfico
dot_data = export_graphviz(model, out_file=None, 
                           feature_names=labels,  
                           class_names=np.unique(ency.inverse_transform(y_train.reshape(-1, 1))).tolist(),  
                           filled=True, rounded=True,  
                           special_characters=True)  

graph = graphviz.Source(dot_data)
graph.render("iris_tree")

'iris_tree.pdf'

In [123]:
y_pred = best_model_tc.predict(X_test)
acuracia_dt, f1_dt = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.7402676399026764
    F1: 0.7511145511008342


### KNN

In [124]:
knn = KNeighborsClassifier()
param_knn = {'weights': ['uniform', 'distance'], 'metric': ['manhattan', 'euclidean', 'cosine', 'nan_euclidean'], 'n_neighbors': range(1, 50)}


gs_knn = GridSearchCV(knn, param_knn, cv=5, scoring='f1_weighted')


best_model_knn = gs_knn.fit(X_train, y_train)
print(best_model_knn.best_params_)



{'metric': 'cosine', 'n_neighbors': 48, 'weights': 'distance'}


In [125]:
y_pred = best_model_knn.predict(X_test)
acuracia_knn, f1_knn = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.7427007299270073
    F1: 0.7537847989988821


### Resultados

Podemos observar que os modelos obtiveram um desempenho extremamente próximo, com menos de 0.5% de diferença entre si.

Contudo todos obtiveram um resultado muito superior ao Dummy, o que era esperado.

In [126]:
resultados = pd.DataFrame()
resultados['NaiveBayes'] = pd.Series([acuracia_nb, f1_nb])*100
resultados['DecisionTree'] = pd.Series([acuracia_dt, f1_dt])
resultados['KNN'] = pd.Series([acuracia_knn, f1_knn])

resultados['Dummy'] = pd.Series([acuracia_dummy, f1_dummy])
resultados.index = ['Acuracia', 'F1 Balanceado']
resultados

Unnamed: 0,NaiveBayes,DecisionTree,KNN,Dummy
Acuracia,73.905109,0.740268,0.742701,0.371655
F1 Balanceado,74.882479,0.751115,0.753785,0.541907


# Modelos utilizando os diagnósticos de entrada e de comorbidade

In [171]:
df = pd.read_csv('https://raw.githubusercontent.com/ImagineDogs/TranstornosMentais/main/dados/df_cid10.csv')
diagnosticos = df[['cid10_faixa', 'cid10_faixa_alta', 'cid10_seg_faixa', 'sexo', 'faixa_etaria', 'etnia', 'qtd_internacoes', 'traumatismo',
                        'prob_respiratorios', 'avc', 'convulsao', 'has','gravidez', 'diabetes', 'doenca_infecto']].loc[~df['cid10_faixa'].isnull()]
diagnosticos = diagnosticos.fillna('Sem')

In [172]:
oneEncX = OneHotEncoder()
ordEncX = OrdinalEncoder()
ency = OrdinalEncoder()

oneCat = diagnosticos.drop(['faixa_etaria', 'qtd_internacoes'], axis=1)
oneCat = pd.DataFrame.sparse.from_spmatrix(oneEncX.fit_transform(oneCat))

ordCat = np.array(diagnosticos['faixa_etaria'])
ordCat = pd.Series(ordEncX.fit_transform(ordCat.reshape(-1, 1)).reshape(1, -1)[0])

y = np.array(diagnosticos['cid10_faixa_alta'])
y = ency.fit_transform(y.reshape(-1, 1)).reshape(1, -1)[0]

In [181]:
pd.concat([diagnosticos['qtd_internacoes'], ordCat, oneCat], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
2,8.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,8.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,8.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,8.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8126,,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8139,,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8163,,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8193,,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [182]:
def train_test_val_split(X, y, test_size=None, val_size = None,random_state=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
    
    return X_train.toarray(), X_test.toarray(), X_val.toarray(), y_train, y_test, y_val

# X_train, X_test, X_val, y_train, y_test, y_val = train_test_val_split(X, y, test_size=0.2, val_size = 0.05,random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, = X_train.toarray(), X_test.toarray()

In [183]:
print(X_train.shape)
X_train

(6574, 84)


array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [184]:
print(y_train.shape)
y_train

(6574,)


array([1., 1., 2., ..., 1., 3., 3.])

### Modelo Dummy para baseline de comparacao

In [185]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

y_pred = dummy.predict(X_test)
acuracia_dummy, f1_dummy = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.37165450121654503
    F1: 0.5419068736141907


In [188]:
X_test.shape

(1644, 84)

In [190]:
X_train.shape

(6574, 84)

### Método Naive Bayes

Por motivos de erro na execução não foi possível utilizar o metodo de Naive Bayes para estes dados

### Árvore de Decisão

Para a Árvore de Decisão será utilizado GridSearch para encontrar os melhores parâmetros dentro de um escopo que pode ser escolhido.

Vale lembrar que GridSearch testa cada combinação de parâmetros passados em um K-Fold Cross Validation, utilizando a combinação que melhor desempenhar.
Neste caso serão 5 folds com a métrica F1 balanceada.

In [193]:
tc = DecisionTreeClassifier(random_state=42)
param_tc = {'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random'], 'max_depth': range(5, 20, 1)}


gs_tc = GridSearchCV(tc, param_tc, cv=5, scoring='f1_weighted')


best_model_tc = gs_tc.fit(X_train, y_train)
print(best_model_tc.best_params_)



{'criterion': 'gini', 'max_depth': 6, 'splitter': 'random'}


In [194]:
y_pred = best_model_tc.predict(X_test)
acuracia_dt, f1_dt = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.7360097323600974
    F1: 0.7474538583908896


### KNN

In [195]:
knn = KNeighborsClassifier()
param_knn = {'weights': ['uniform', 'distance'], 'metric': ['manhattan', 'euclidean', 'cosine', 'nan_euclidean'], 'n_neighbors': range(1, 50)}


gs_knn = GridSearchCV(knn, param_knn, cv=5, scoring='f1_weighted')


best_model_knn = gs_knn.fit(X_train, y_train)
print(best_model_knn.best_params_)



{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}


In [196]:
y_pred = best_model_knn.predict(X_test)
acuracia_knn, f1_knn = evaluate(y_pred, y_test)

Resultados:
    Acuracia: 0.6982968369829684
    F1: 0.7362615249187895


### Resultados

Podemos observar que os modelo de Árvore de decisão manteve um desempenho próximo ao anterior, porém o modelo de KNN obteve um desempenho pior que o anterior.

Contudo todos obtiveram um resultado muito superior ao Dummy, o que era esperado.

In [197]:
resultados = pd.DataFrame()
resultados['DecisionTree'] = pd.Series([acuracia_dt, f1_dt])
resultados['KNN'] = pd.Series([acuracia_knn, f1_knn])

resultados['Dummy'] = pd.Series([acuracia_dummy, f1_dummy])
resultados.index = ['Acuracia', 'F1 Balanceado']
resultados

Unnamed: 0,NaiveBayes,DecisionTree,KNN,Dummy
Acuracia,73.905109,0.73601,0.698297,0.371655
F1 Balanceado,74.882479,0.747454,0.736262,0.541907
