In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import export_graphviz
import graphviz

In [4]:
# Read the insurance dataset and discard the 'Unnamed: 0' column (apparently a
# row index that was saved with the CSV - confirm), then display the frame.
base = pd.read_csv("insurance.csv").drop(columns=['Unnamed: 0'])
base

Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,DrivQuality,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,Poor,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
1,False,Senior,Prole,Cautious,Current,,Football,,Economy,Normal,...,City,True,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero
2,False,Senior,UpperMiddle,Psychopath,Current,,Football,,FamilySedan,Excellent,...,City,False,Thousand,Thousand,False,Thousand,Good,True,Thousand,One
3,False,Adolescent,Middle,Normal,Older,,EggShell,,Economy,Normal,...,Suburb,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Zero
4,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,Poor,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,False,Adult,Prole,Adventurous,Older,Mild,Football,Mild,Economy,Poor,...,City,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Many
19996,False,Adult,Middle,Normal,Older,,Tank,,FamilySedan,Normal,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,False,Thousand,Zero
19997,False,Senior,UpperMiddle,Normal,Current,,Football,,Luxury,Excellent,...,Secure,True,TenThou,Thousand,False,Thousand,Excellent,True,Thousand,Zero
19998,False,Adult,Middle,Normal,Older,,Football,,FamilySedan,Excellent,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero


In [7]:
# y - dependent variable (target): the 'Accident' column, selected by name
# rather than by position so it keeps matching the column excluded from X
# even if the column order of the CSV changes.
y = base['Accident'].values

# X - independent variables: every column except the target.
X = base.drop(columns='Accident').values

# Encoder instance shared by the feature-encoding loop that follows.
label_encoder = LabelEncoder()

In [8]:
# Integer-encode the feature matrix in place, one column at a time.
# Every column slice of a 2-D array shares the array's dtype, so the object
# check below passes either for all columns or for none of them.
n_features = X.shape[1]
for col in range(n_features):
    column = X[:, col]
    if column.dtype != 'object':
        continue
    # The encoder is refit on each column; only the transformed codes are kept.
    X[:, col] = label_encoder.fit_transform(column)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.3,
)

In [28]:
# Decision-tree experiments. Each entry lists a configuration followed by the
# scores noted for it (presumably copied from the metrics cell's printed
# output - confirm). `precision` holds one value per class.
#
# 1) random_state=1                                     <- active configuration
#      accuracy           = 0.9195
#      precision          = [0.72939068 0.68992248 0.9785124  0.88277858]
#      precision_weighted = 0.9201509144700483
#      recall             = 0.9195
#      f1                 = 0.9197439685137029
#
# 2) random_state=1, max_depth=8
#      accuracy           = 0.9441666666666667
#      precision          = [0.90227273 0.78881988 0.97595376 0.8856383 ]
#      precision_weighted = 0.9426343511592464
#      recall             = 0.9441666666666667
#      f1                 = 0.942445801347076
#
# 3) random_state=1, max_depth=8, max_leaf_nodes=8
#      accuracy           = 0.9398333333333333
#      precision          = [0.91315136 0.75049116 0.97323489 0.88992042]
#      precision_weighted = 0.9389925807903848
#      recall             = 0.9398333333333333
#      f1                 = 0.9377075487307824
#
# NOTE(review): the metrics output saved in this notebook equals entry 3, not
# the active entry 1 - re-run the notebook to refresh the stored outputs.
#
# To try another configuration, swap the dictionary below, e.g.
#   tree_settings = {"random_state": 1, "max_depth": 8}
#   tree_settings = {"random_state": 1, "max_depth": 8, "max_leaf_nodes": 8}
tree_settings = {"random_state": 1}
model = DecisionTreeClassifier(**tree_settings)
model.fit(X_train, y_train)

In [29]:
# Preview the classes predicted for the held-out test features.
# The result is only displayed as the cell output; it is not stored here.
model.predict(X_test)

array(['None', 'None', 'None', ..., 'None', 'Mild', 'None'], dtype=object)

In [30]:
# Predict once on the held-out features and score the result.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# `average` controls how a metric is aggregated across classes, for example:
#   None       - one score per class (here, the precision of each of the 4 classes)
#   'weighted' - a single score: the per-class scores averaged with each class
#                weighted by its share of the true labels
precision = precision_score(y_test, predictions, average=None)
precision_weighted, recall, f1 = (
    score_fn(y_test, predictions, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f'accuracy = {accuracy} \nprecision = {precision} \nprecision_weighted {precision_weighted} \nrecall = {recall} \nf1 = {f1}')

accuracy = 0.9398333333333333 
precision = [0.91315136 0.75049116 0.97323489 0.88992042] 
precision_weighted 0.9389925807903848 
recall = 0.9398333333333333 
f1 = 0.9377075487307824


In [31]:
# Build the per-class precision / recall / f1-score / support table as text
# for the test-set predictions, then print it.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

        Mild       0.91      0.68      0.78       542
    Moderate       0.75      0.76      0.75       505
        None       0.97      1.00      0.99      4228
      Severe       0.89      0.93      0.91       725

    accuracy                           0.94      6000
   macro avg       0.88      0.84      0.86      6000
weighted avg       0.94      0.94      0.94      6000



In [33]:
# Export the fitted tree to Graphviz DOT and render it to 'decision_tree.png'.
#
# The feature names must line up with the columns of X, which was built from
# every column of `base` except 'Accident'. 'Accident' is not the last column,
# so `base.columns[:-1]` would keep it and drop the real last feature, shifting
# the label of every feature that comes after it in the rendered tree.
feature_names = [column for column in base.columns if column != 'Accident']

# Use the fitted class labels instead of the symbolic y[0], y[1], ... captions
# that `class_names=True` produces.
class_names = [str(label) for label in model.classes_]

dot_data = export_graphviz(
    model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png")

'decision_tree.png'