In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split # Dividir nossos dados em Treino e teste
from sklearn.tree import DecisionTreeClassifier # Algoritmo de Classificação
from sklearn.preprocessing import LabelEncoder # transforma Categorias em Números
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import export_graphviz
import graphviz

# Load the insurance dataset: characteristics of people who apply for insurance.
# The CSV's 'Unnamed: 0' column is discarded and every missing value is
# replaced by the literal string 'NaN', so no cell is left as a float NaN.
# NOTE(review): pandas' default NA parsing also treats the text "None" as
# missing; if the file uses "None" as a real category (e.g. "no accident"),
# that category is what ends up labeled 'NaN' here - confirm against the CSV.
base = (
    pd.read_csv('insurance.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna('NaN')
)
base  # `Accident` is the column we are going to predict

Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,DrivQuality,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,Poor,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
1,False,Senior,Prole,Cautious,Current,,Football,,Economy,Normal,...,City,True,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero
2,False,Senior,UpperMiddle,Psychopath,Current,,Football,,FamilySedan,Excellent,...,City,False,Thousand,Thousand,False,Thousand,Good,True,Thousand,One
3,False,Adolescent,Middle,Normal,Older,,EggShell,,Economy,Normal,...,Suburb,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Zero
4,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,Poor,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,False,Adult,Prole,Adventurous,Older,Mild,Football,Mild,Economy,Poor,...,City,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Many
19996,False,Adult,Middle,Normal,Older,,Tank,,FamilySedan,Normal,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,False,Thousand,Zero
19997,False,Senior,UpperMiddle,Normal,Current,,Football,,Luxury,Excellent,...,Secure,True,TenThou,Thousand,False,Thousand,Excellent,True,Thousand,Zero
19998,False,Adult,Middle,Normal,Older,,Football,,FamilySedan,Excellent,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero


In [6]:
# `Accident` is the dependent variable (target); every other column of `base`
# is an independent variable. Columns are selected by name rather than by a
# hard-coded list of positions, so the split stays correct if columns are
# added, removed or reordered.
y = base['Accident'].values
X = base.drop(columns=['Accident']).values

# Turn the categorical values of X into integer codes for the algorithm.
# `base` holds text columns, so `.values` yields one object-dtype ndarray and
# every column slice reports dtype 'object'; a per-column dtype test on X
# cannot tell columns apart, so all columns are simply encoded.
# The same encoder is refit for each column, so only the mapping of the last
# column remains available in `labelencoder` afterwards.
labelencoder = LabelEncoder()
for i in range(X.shape[1]):  # walk through the feature columns
    X[:, i] = labelencoder.fit_transform(X[:, i])

# Split the data into training and test sets.
# The training share is the complement of test_size (70% train, 30% test).
# random_state makes the split reproducible across runs.
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(X, y, test_size=0.3, random_state=1)
# X_treinamento receives 70% of the independent variables
# y_treinamento receives 70% of the dependent variable

In [7]:
# Author's tuning notes:
#   - with no parameters at all, the tree scored 0.92;
#   - max_depth      -> maximum depth of the tree (limits its vertical complexity);
#   - max_leaf_nodes -> maximum number of leaf nodes (limits its horizontal
#                       complexity; lowered the score from 0.94 to 0.93).
# A binary tree with at most 8 leaves cannot be deeper than 7 levels, so
# max_depth=8 does not restrict this tree any further.
modelo = DecisionTreeClassifier(
    max_depth=8,
    max_leaf_nodes=8,
    random_state=1,
)
modelo.fit(X=X_treinamento, y=y_treinamento)

In [9]:
# Render the fitted tree as an image.
# The model was trained on every column of `base` except the target
# `Accident`, in their original order, so the feature labels must be built by
# dropping `Accident`. `base.columns[:-1]` instead drops the last column and
# keeps `Accident`, which shifts every label from that position onward by one.
nomes_atributos = list(base.columns.drop('Accident'))
# Show the real class labels in the nodes instead of symbolic y[i] names.
nomes_classes = [str(classe) for classe in modelo.classes_]
dot_data = export_graphviz(modelo, out_file=None, filled=True, feature_names=nomes_atributos,
                           class_names=nomes_classes, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
# The output name is kept unchanged (including its spelling) so existing
# references to 'decision_trre.png' keep working.
graph.render("decision_trre", format="png")

'decision_trre.png'

In [10]:
# Evaluate the model on the held-out test set.
previsoes = modelo.predict(X_teste)  # model predictions

accuracy = accuracy_score(y_teste, previsoes)
# Precision, recall and F1 are averaged over the classes, weighted by support.
precision, recall, f1 = (
    metrica(y_teste, previsoes, average='weighted')
    for metrica in (precision_score, recall_score, f1_score)
)
report = classification_report(y_teste, previsoes)  # per-class breakdown

resumo = (
    ('Acurácia: ', accuracy),
    ('Precisão: ', precision),
    ('Recall: ', recall),
    ('F1: ', f1),
)
for rotulo, valor in resumo:
    print(rotulo, valor)
print(report)

Acurácia:  0.9398333333333333
Precisão:  0.9389925807903848
Recall:  0.9398333333333333
F1:  0.9377075487307824
              precision    recall  f1-score   support

        Mild       0.91      0.68      0.78       542
    Moderate       0.75      0.76      0.75       505
         NaN       0.97      1.00      0.99      4228
      Severe       0.89      0.93      0.91       725

    accuracy                           0.94      6000
   macro avg       0.88      0.84      0.86      6000
weighted avg       0.94      0.94      0.94      6000

