<a href="https://colab.research.google.com/github/MatheusSteinDeAguiar/MedicalCost/blob/main/MedicalCost2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Cost

## Importação de dados

In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/gitflai/DDS4/main/insurance.csv'
dados = pd.read_csv(url)
dados.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## "*Mise en place*" dos dados

In [2]:
dados['smoker'] = dados['smoker'].map({'yes': 1, 'no': 0})
dados['charges'] = dados['charges']/1000 
dados['sex'] = dados['sex'].map({'male': 1, 'female': 0})
dados['region'] = dados['region'].map({'northeast': 3, 'northwest': 2, 'southeast': 1, 'southwest': 0})

## Tecnica de validação K-fold

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

validacao = KFold(n_splits = 10, shuffle = True)

In [4]:
from sklearn.linear_model import LogisticRegression

X = dados[['bmi', 'charges']]
y = dados['smoker'] 

modelo = LogisticRegression()

In [5]:
reglog = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])
reglog

{'fit_time': array([0.04039478, 0.01613569, 0.02551055, 0.01496267, 0.01572776,
        0.01576662, 0.01458859, 0.01465273, 0.01353145, 0.017066  ]),
 'score_time': array([0.00469351, 0.00518966, 0.00614047, 0.00417876, 0.00422406,
        0.004004  , 0.00432777, 0.00448418, 0.00409555, 0.00446677]),
 'test_accuracy': array([0.90298507, 0.92537313, 0.91791045, 0.94776119, 0.95522388,
        0.93283582, 0.98507463, 0.85820896, 0.95488722, 0.92481203]),
 'test_precision': array([0.92307692, 0.76190476, 0.88      , 0.86956522, 0.82758621,
        0.85185185, 1.        , 0.78787879, 0.86956522, 0.81481481]),
 'test_recall': array([0.68571429, 0.76190476, 0.73333333, 0.83333333, 0.96      ,
        0.82142857, 0.91304348, 0.68421053, 0.86956522, 0.81481481])}

In [6]:
acuracia = round(reglog['test_accuracy'].mean(), 4)
sensibilidade = round(reglog['test_recall'].mean(), 4)
precisao = round(reglog['test_precision'].mean(), 4)

print("Acurácia: ", acuracia)
print("Sensibilidade: ", sensibilidade)
print("Precisão: ", precisao)

Acurácia:  0.9305
Sensibilidade:  0.8077
Precisão:  0.8586


Sendo assim pode-se extrair disso que:
- a taxa de acerto (acurácia) do modelo é estimada em aproximadamente 93,1%.
- a capacidade de avaliar os verdadeiros positivos (sensibilidade) é estimada em aproximadamente 80,1%.
- e a capacidade de acertar (precisão) as estimativas, é de aproximadamente 85,3%


## KNN

In [7]:
from sklearn.neighbors import KNeighborsClassifier

X = dados[['bmi', 'charges']]
y = dados['smoker'] 

k = 27
modelo = KNeighborsClassifier(n_neighbors = k)
knn = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])

acuracia = round(knn['test_accuracy'].mean(), 4)
sensibilidade = round(knn['test_recall'].mean(), 4)
precisao = round(knn['test_precision'].mean(), 4)

print("Acurácia: ", acuracia)
print("Sensibilidade: ", sensibilidade)
print("Precisão: ", precisao)

Acurácia:  0.9589
Sensibilidade:  0.9593
Precisão:  0.8594


## Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

X = dados[['bmi', 'charges']]
y = dados['smoker'] 

profundidade = 4
modelo = DecisionTreeClassifier(max_depth = profundidade, min_samples_leaf= 4)
tree = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])

acuracia = round(tree['test_accuracy'].mean(), 4)
sensibilidade = round(tree['test_recall'].mean(), 4)
precisao = round(tree['test_precision'].mean(), 4)

print("Acurácia: ", acuracia)
print("Sensibilidade: ", sensibilidade)
print("Precisão: ", precisao)

Acurácia:  0.9611
Sensibilidade:  0.9597
Precisão:  0.8648


## Procurando o melhor k para o KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

X = dados[['bmi', 'charges']]
y = dados['smoker'] 

list_acuracia = []
list_sensibilidade = []
list_precisao = []

vetor_ks = [1, 3, 5, 7, 9]

for k in vetor_ks:
  modelo = KNeighborsClassifier(n_neighbors = k)
  knn = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])

  acuracia = round(knn['test_accuracy'].mean(), 4)
  sensibilidade = round(knn['test_recall'].mean(), 4)
  precisao = round(knn['test_precision'].mean(), 4)

  list_acuracia.append(acuracia)
  list_sensibilidade.append(sensibilidade)
  list_precisao.append(precisao)

aux = {'Acurácia': list_acuracia, 'Sensibilidade': list_sensibilidade, 'Precisão': list_precisao}
resultados = pd.DataFrame(aux, index = vetor_ks)
resultados
print("Ordenado por Acurácia: \n", resultados.sort_values(by = 'Acurácia'))
print("\n\nOrdenado por Acurácia: \n",resultados.sort_values(by = 'Precisão'))

# for k in range(100):
#    modelo = KNeighborsClassifier(n_neighbors = k)
#    knn = cross_validate(modelo, X, y, cv = validacao, scoring='accuracy')
#    acuracia = knn['test_score'].mean(), 4)
#    resultados.append(acuracia)


Ordenado por Acurácia: 
    Acurácia  Sensibilidade  Precisão
1    0.9537         0.8844    0.8938
3    0.9551         0.9123    0.8725
7    0.9559         0.9366    0.8576
9    0.9581         0.9552    0.8514
5    0.9589         0.9393    0.8667


Ordenado por Acurácia: 
    Acurácia  Sensibilidade  Precisão
9    0.9581         0.9552    0.8514
7    0.9559         0.9366    0.8576
5    0.9589         0.9393    0.8667
3    0.9551         0.9123    0.8725
1    0.9537         0.8844    0.8938


## K-Fold Repetido e Combinação de modelos

In [10]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate

validacao = RepeatedKFold(n_splits = 10, n_repeats = 3)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [12]:
X = dados[['bmi', 'charges']]
y = dados['smoker'] 

In [13]:
modelo = LogisticRegression()
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9319979052107883

In [14]:
profundidade = 4
tamanho_folha = 4

modelo = DecisionTreeClassifier(max_depth = profundidade, min_samples_leaf= tamanho_folha)
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9568903602289305

In [15]:
k = 23

modelo = KNeighborsClassifier(n_neighbors = k)
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9616204690831558

### Hard Decision

In [16]:
from sklearn.ensemble import VotingClassifier

profundidade = 4
tamanho_folha = 4
k = 27

reglog = LogisticRegression()
tree = DecisionTreeClassifier(max_depth = profundidade, min_samples_leaf= tamanho_folha)
knn = KNeighborsClassifier(n_neighbors = k)

modelos = [('Reg Log', reglog), ('Decision Tree', tree), ('KNN', knn)]

modelo = VotingClassifier(modelos, voting = 'hard')
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9648804847940747

### Soft Decision

In [17]:
modelo = VotingClassifier(modelos, voting = 'soft')
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9641267347473161

## Bagging de modelos

In [18]:
from sklearn.ensemble import BaggingClassifier

modelo = BaggingClassifier(n_estimators = 50)
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9539127669921073

## Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

modelo = RandomForestClassifier(n_estimators = 150, max_depth = 4, criterion="entropy", max_features = 'auto', min_samples_leaf=4)
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()


0.9633711143530467

# Procurando o modelo que tenha a melhor acurácia

In [20]:
list_acuracia = []
list_sensibilidade = []
list_precisao = []
vetor_ks = []

for k in range(50):
  modelo = KNeighborsClassifier(n_neighbors = k)
  knn = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])
  
  acuracia = round(knn['test_accuracy'].mean(), 4)
  sensibilidade = round(knn['test_recall'].mean(), 4)
  precisao = round(knn['test_precision'].mean(), 4)

  vetor_ks.append(k)
  list_acuracia.append(acuracia)
  list_sensibilidade.append(sensibilidade)
  list_precisao.append(precisao)

aux = {'Acurácia': list_acuracia, 'Sensibilidade': list_sensibilidade, 'Precisão': list_precisao}
resultados = pd.DataFrame(aux, index = vetor_ks)
resultados
print("Ordenado por Acurácia: \n", resultados.sort_values(by = 'Acurácia'))

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError: Expected n_neighbors > 0. Got 0

ValueError

Ordenado por Acurácia: 
     Acurácia  Sensibilidade  Precisão
2     0.9372         0.7750    0.9022
4     0.9499         0.8714    0.8851
1     0.9509         0.8736    0.8840
8     0.9537         0.9146    0.8683
6     0.9542         0.9011    0.8813
10    0.9552         0.9307    0.8644
7     0.9557         0.9406    0.8578
3     0.9564         0.9168    0.8777
34    0.9566         0.9500    0.8573
12    0.9569         0.9348    0.8672
5     0.9576         0.9388    0.8636
39    0.9579         0.9581    0.8540
47    0.9581         0.9620    0.8509
38    0.9581         0.9582    0.8556
49    0.9581         0.9645    0.8494
15    0.9581         0.9605    0.8500
48    0.9582         0.9613    0.8504
32    0.9584         0.9512    0.8605
46    0.9584         0.9579    0.8561
44    0.9584         0.9614    0.8519
40    0.9584         0.9581    0.8553
11    0.9584         0.9536    0.8554
13    0.9584         0.9556    0.8580
37    0.9584         0.9594    0.8596
41    0.9586         0.96

In [21]:
list_acuracia = []
list_sensibilidade = []
list_precisao = []
vetor = []

for profundidade in range(10):
  modelo = DecisionTreeClassifier(max_depth = profundidade)
  tree = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])
  
  acuracia = round(tree['test_accuracy'].mean(), 4)
  sensibilidade = round(tree['test_recall'].mean(), 4)
  precisao = round(tree['test_precision'].mean(), 4)

  vetor.append(profundidade)
  list_acuracia.append(acuracia)
  list_sensibilidade.append(sensibilidade)
  list_precisao.append(precisao)

aux = {'Acurácia': list_acuracia, 'Sensibilidade': list_sensibilidade, 'Precisão': list_precisao}
resultados = pd.DataFrame(aux, index = vetor)
resultados
print("Ordenado por Acurácia: \n", resultados.sort_values(by = 'Acurácia'))

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be g

Ordenado por Acurácia: 
    Acurácia  Sensibilidade  Precisão
1    0.9255         0.9572    0.7472
2    0.9260         0.9564    0.7498
8    0.9429         0.8652    0.8554
9    0.9482         0.8729    0.8740
7    0.9504         0.9070    0.8610
5    0.9516         0.9034    0.8673
6    0.9537         0.9175    0.8681
4    0.9604         0.9600    0.8621
3    0.9626         0.9641    0.8685
0       NaN            NaN       NaN


In [22]:
validacao = RepeatedKFold(n_splits = 10, n_repeats = 5)

list_acuracia = []
list_sensibilidade = []
list_precisao = []
vetor = []

for folha in range(50):
  modelo = DecisionTreeClassifier(max_depth = 3, min_samples_leaf= folha)
  tree = cross_validate(modelo, X, y, cv = validacao, scoring=['accuracy', 'recall', 'precision'])
  
  acuracia = round(tree['test_accuracy'].mean(), 4)
  sensibilidade = round(tree['test_recall'].mean(), 4)
  precisao = round(tree['test_precision'].mean(), 4)

  vetor.append(folha)
  list_acuracia.append(acuracia)
  list_sensibilidade.append(sensibilidade)
  list_precisao.append(precisao)

aux = {'Acurácia': list_acuracia, 'Sensibilidade': list_sensibilidade, 'Precisão': list_precisao}
resultados = pd.DataFrame(aux, index = vetor)
resultados
print("Ordenado por Acurácia: \n", resultados.sort_values(by = 'Acurácia', ascending = False))

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueE

Ordenado por Acurácia: 
     Acurácia  Sensibilidade  Precisão
4     0.9640         0.9700    0.8734
1     0.9631         0.9659    0.8680
8     0.9628         0.9649    0.8710
3     0.9625         0.9640    0.8683
6     0.9625         0.9663    0.8654
2     0.9623         0.9642    0.8667
7     0.9620         0.9606    0.8705
9     0.9620         0.9588    0.8697
5     0.9619         0.9650    0.8663
18    0.9610         0.9543    0.8711
40    0.9607         0.9515    0.8691
26    0.9607         0.9506    0.8707
19    0.9607         0.9526    0.8690
21    0.9607         0.9512    0.8709
12    0.9607         0.9509    0.8676
17    0.9605         0.9518    0.8728
10    0.9605         0.9557    0.8647
33    0.9605         0.9504    0.8714
39    0.9604         0.9487    0.8701
32    0.9602         0.9499    0.8688
35    0.9602         0.9485    0.8697
24    0.9602         0.9479    0.8696
11    0.9601         0.9523    0.8666
16    0.9601         0.9491    0.8686
29    0.9601         0.94

In [23]:
from sklearn.ensemble import VotingClassifier

profundidade = 3
tamanho_folha = 4
k = 27

reglog = LogisticRegression()
tree = DecisionTreeClassifier(max_depth = profundidade, min_samples_leaf= tamanho_folha)
knn = KNeighborsClassifier(n_neighbors = k)
rdforest = RandomForestClassifier(n_estimators = 100, max_depth = 4, criterion="entropy", max_features = 'auto', min_samples_leaf= tamanho_folha)


modelos = [('Reg Log', reglog), ('Decision Tree', tree), ('KNN', knn), ('Random Forest', rdforest)]

modelo = VotingClassifier(modelos, voting = 'soft')
medidas = cross_validate(modelo, X, y, cv = validacao, scoring="accuracy")
medidas['test_score'].mean()

0.9657737627651217