In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import setuptools.dist
from sklearn.tree import export_graphviz
import graphviz

In [14]:
# Load the insurance dataset and discard every row with a missing value.
# NOTE(review): pandas' default NA markers may include the literal string
# 'None'; if any column uses 'None' as a real category, those rows are being
# dropped here as well — confirm against the raw CSV.
base = pd.read_csv('insurance.csv').dropna()

# (rows, columns) remaining after the drop.
base.shape

(5319, 28)

In [15]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written
# into the CSV — it is not used as a feature).
# errors='ignore' keeps this cell re-runnable: `base` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-missing column.
base = base.drop(columns=['Unnamed: 0'], errors='ignore')
base.head()

Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,DrivQuality,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,Poor,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
4,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,Poor,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many
5,False,Adult,UpperMiddle,Normal,Current,Moderate,EggShell,Moderate,SportsCar,Poor,...,Suburb,True,HundredThou,HundredThou,True,TenThou,Poor,True,Thousand,Many
6,False,Senior,UpperMiddle,Normal,Current,Mild,Football,Mild,Economy,Poor,...,Secure,True,TenThou,Thousand,False,Thousand,Excellent,True,Thousand,Many
7,False,Adult,Prole,Normal,Older,Severe,EggShell,Severe,Economy,Poor,...,City,False,Million,HundredThou,True,Thousand,Fair,False,Thousand,Many


In [16]:
# Target: the 'Accident' column (position 7 once 'Unnamed: 0' has been dropped).
# Selecting by name instead of hard-coded positions keeps X and y aligned even
# if the number or order of columns in the CSV changes.
TARGET_COLUMN = 'Accident'
y = base[TARGET_COLUMN].values

# Features: every remaining column, in its original order.
X = base.drop(columns=[TARGET_COLUMN]).values

In [17]:
# Single encoder instance; the feature-encoding loop re-fits it on each column.
label_encoder = LabelEncoder()

In [18]:
# Replace the category values of each feature column with integer codes.
# The shared encoder is re-fitted on every column, so after the loop it only
# holds the mapping of the last column it processed.
for col_idx, column in enumerate(X.T):
    if column.dtype == 'object':
        X[:, col_idx] = label_encoder.fit_transform(column)

X

array([[0, 1, 1, ..., 0, 3, 0],
       [0, 0, 1, ..., 0, 3, 0],
       [0, 1, 2, ..., 1, 3, 0],
       ...,
       [0, 1, 1, ..., 1, 3, 0],
       [0, 1, 2, ..., 0, 3, 0],
       [0, 1, 1, ..., 0, 3, 0]], dtype=object)

In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [20]:
# Decision tree limited to depth 8 and at most 8 leaf nodes.
# Variants tried earlier in this cell: no limits at all, and max_depth=8 only.
modelo = DecisionTreeClassifier(
    max_depth=8,
    max_leaf_nodes=8,
    random_state=1,
)
modelo.fit(x_train, y_train)

In [21]:
# Predict the target class for every row of the held-out test set.
previsoes = modelo.predict(x_test)
previsoes

array(['Moderate', 'Severe', 'Mild', ..., 'Severe', 'Severe', 'Moderate'],
      dtype=object)

In [22]:
# Overall accuracy plus support-weighted precision / recall / F1 on the test set.
accuracy = accuracy_score(y_test, previsoes)
precision, recall, f1 = (
    scorer(y_test, previsoes, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each score as a percentage with two decimals.
for label, value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1', f1),
):
    print(f'{label}: {value*100:.2f}%')

Accuracy: 86.09%
Precision: 86.39%
Recall: 86.09%
F1: 86.08%


In [23]:
# Per-class precision / recall / F1 and support for the test-set predictions.
report = classification_report(y_test,previsoes)
print(report)

              precision    recall  f1-score   support

        Mild       0.96      0.84      0.90       423
    Moderate       0.78      0.77      0.78       497
      Severe       0.87      0.93      0.90       676

    accuracy                           0.86      1596
   macro avg       0.87      0.85      0.86      1596
weighted avg       0.86      0.86      0.86      1596



In [25]:
# Names of the columns the model was trained on: every column except the
# target 'Accident', in their original order. Note that `base.columns[:-1]`
# would be wrong here — it keeps the target and drops the last feature, so
# every node label from position 7 onwards would name the wrong column.
feature_names = [col for col in base.columns if col != 'Accident']
assert len(feature_names) == x_train.shape[1], 'feature names do not match the training matrix'

dot_data = export_graphviz(
    modelo,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=feature_names,
    # Show the real class labels instead of the symbolic y[0], y[1], ...
    class_names=[str(cls) for cls in modelo.classes_],
)

graph = graphviz.Source(dot_data)
# Writes decision_tree.png (plus the DOT source file) to the working directory.
graph.render('decision_tree', format="png")
# Renders again in graphviz's default format and opens it in the system viewer.
graph.view()

'decision_tree.pdf'