Data processing:

In [2]:
import pandas as pd
import numpy as np
import json
import math

Modeling:

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

Tree Visualisation:

In [4]:
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

## importação do datasset:
_Será utilizado como base um datasset de ordens de serviço geradas_

In [5]:
with open("Datassets/myjsonfile.json",'r',encoding='UTF-8') as file:
    datasset = json.load(file)

In [6]:
dataframe = pd.DataFrame(datasset['data'])

In [7]:
dataframe['id_assunto']

0         57
1         82
2        170
3        176
4         57
        ... 
43995    244
43996    244
43997    258
43998    258
43999    170
Name: id_assunto, Length: 44000, dtype: object

## Cálculo de entropia e ganho de informação:

A entropia é uma medida de desordem ou incerteza e o objetivo dos modelos de aprendizado de máquina e dos cientistas de dados em geral é reduzir a incerteza.

Este código retornará o número de cada valor único em uma coluna:

In [8]:
counts=np.bincount(dataframe['id_assunto'])
counts

array([   0,    0,    0,    0,  410,    0,    0,  366,    0,    0,    0,
          0,    0,    0,    0,    0, 3487, 1483,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    8,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 1906,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  780,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  239,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  106,   17,
        156,   75,  469,    0,    0, 1433,   53,    1,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

aqui calculamos a probabilidade de cada valor, dividindo o comprimento de toda a coluna:

In [9]:
counts/(len(dataframe['id_assunto']))

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       9.31818182e-03, 0.00000000e+00, 0.00000000e+00, 8.31818182e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.92500000e-02, 3.37045455e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.81818182e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

Agora vamos combinar os códigos acima e criar uma função que calcula a entropia de cada coluna:

In [10]:
def calc_entropy(column):
    # Conta o número de ocorrências de cada valor único na coluna
    counts = np.bincount(column)
    
    # Calcula a probabilidade de cada valor dividindo pela quantidade total de elementos na coluna
    probabilities = counts / len(column)
    
    entropy = 0  # Valor inicial da entropia
    
    # Loop para calcular a entropia de cada valor único
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)  # Calcula a entropia de cada valor e soma ao total
            
    return -entropy  # Retorna o negativo da entropia conforme a fórmula

In [11]:
calc_entropy(dataframe['id_assunto'])

5.368645271503011

In [12]:
teste = np.array([1,2,3,4,5,6,7,8,8,8,8,8,1])
calc_entropy(teste)

2.6535442970305683

Realizando um cálculo de ganho de informação para as variáveis de entrada:

In [13]:
def information_gain(data, split,target):

    original_entropy=calc_entropy(data[target])

    values=data[split].unique()

    left_split=data[data[split]==values[0]]

    right_split=data[data[split]==values[1]]

    subract=0

    for subset in [left_split,right_split]:
        
        prob=(subset.shape[0])/data.shape[0]
        subract += prob * calc_entropy(subset[target])

    return  original_entropy - subract

In [14]:
information_gain(dataframe,"id_wfl_tarefa","id_assunto")

4.623471604601843

Utilizando Label Encoder para transformar os valores em texto para representativos numéricos:

In [15]:
dataframe.columns

Index(['mensagem_resposta', 'data_hora_analise', 'data_hora_encaminhado',
       'data_hora_assumido', 'data_hora_execucao', 'id_contrato_kit',
       'preview', 'data_agenda_final', 'id', 'tipo', 'id_filial',
       'id_wfl_tarefa', 'status_sla', 'data_abertura', 'melhor_horario_agenda',
       'liberado', 'status', 'id_cliente', 'id_assunto', 'setor', 'id_cidade',
       'id_tecnico', 'prioridade', 'mensagem', 'protocolo', 'endereco',
       'complemento', 'id_condominio', 'bloco', 'apartamento', 'latitude',
       'bairro', 'longitude', 'referencia', 'impresso', 'data_inicio',
       'data_agenda', 'data_final', 'data_fechamento', 'id_wfl_param_os',
       'valor_total_comissao', 'valor_total', 'valor_outras_despesas', 'idx',
       'id_su_diagnostico', 'gera_comissao', 'id_estrutura', 'id_login',
       'valor_unit_comissao', 'data_prazo_limite', 'data_reservada',
       'id_ticket', 'origem_endereco', 'justificativa_sla_atrasado',
       'origem_endereco_estrutura', 'data_reagenda

In [16]:
auxiliar_df = dataframe

In [17]:
X = auxiliar_df.drop("id_assunto",axis=1)

In [18]:
X.columns

Index(['mensagem_resposta', 'data_hora_analise', 'data_hora_encaminhado',
       'data_hora_assumido', 'data_hora_execucao', 'id_contrato_kit',
       'preview', 'data_agenda_final', 'id', 'tipo', 'id_filial',
       'id_wfl_tarefa', 'status_sla', 'data_abertura', 'melhor_horario_agenda',
       'liberado', 'status', 'id_cliente', 'setor', 'id_cidade', 'id_tecnico',
       'prioridade', 'mensagem', 'protocolo', 'endereco', 'complemento',
       'id_condominio', 'bloco', 'apartamento', 'latitude', 'bairro',
       'longitude', 'referencia', 'impresso', 'data_inicio', 'data_agenda',
       'data_final', 'data_fechamento', 'id_wfl_param_os',
       'valor_total_comissao', 'valor_total', 'valor_outras_despesas', 'idx',
       'id_su_diagnostico', 'gera_comissao', 'id_estrutura', 'id_login',
       'valor_unit_comissao', 'data_prazo_limite', 'data_reservada',
       'id_ticket', 'origem_endereco', 'justificativa_sla_atrasado',
       'origem_endereco_estrutura', 'data_reagendar', 'data_prev

In [19]:
Encoder_X=LabelEncoder()

for col in X.columns:
    X[col]=Encoder_X.fit_transform(X[col])

In [20]:
X

Unnamed: 0,mensagem_resposta,data_hora_analise,data_hora_encaminhado,data_hora_assumido,data_hora_execucao,id_contrato_kit,preview,data_agenda_final,id,tipo,...,data_prazo_limite,data_reservada,id_ticket,origem_endereco,justificativa_sla_atrasado,origem_endereco_estrutura,data_reagendar,data_prev_final,origem_cadastro,ultima_atualizacao
0,0,1,1,1,1,1,0,1,43999,1,...,1,1,1,3,0,1,1,1,1,40149
1,38,0,0,0,0,3562,0,0,43998,1,...,0,1,19555,1,5,1,0,0,1,40150
2,0,0,17318,0,13463,14431,0,2423,43997,1,...,1,1,19554,2,0,1,1,1,1,40146
3,0,0,0,0,0,3562,0,1,43996,1,...,1,1,19553,2,0,1,1,1,1,40143
4,0,0,0,0,0,9605,0,1,43995,1,...,0,1,19552,1,0,1,0,0,1,40139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43995,1738,0,1358,0,936,13949,0,1,4,1,...,1,46,484,2,0,1,148,1,1,3056
43996,10547,0,223,0,222,13948,0,1,3,1,...,1,46,483,2,0,1,1,1,1,10027
43997,19779,0,261,0,195,4391,0,1,2,1,...,1,46,482,2,0,1,1,1,1,816
43998,18727,0,567,0,343,7091,0,1,1,1,...,1,1,481,1,0,1,1,1,1,1264


---

Definindo a variável target:

In [21]:
y = auxiliar_df['id_assunto']

In [22]:
X.shape

(44000, 58)

Dividindo os dados em treino e teste:

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Aplicando o modelo de decision tree:

In [24]:
dtree=DecisionTreeClassifier()

In [25]:
dtree.fit(X_train,y_train)

In [26]:
predictions=dtree.predict(X_test) #Here we make the peridiction based on the X_test features 

In [27]:
y_test

39694     82
23360     16
38489    388
32949     16
28206    137
        ... 
11757    371
16607    244
12670    268
36263      7
36196    494
Name: id_assunto, Length: 8800, dtype: object

In [28]:
auxiliar_y_test =  list(map(int, y_test.values))

In [29]:
auxiliar_y_test

[82,
 16,
 388,
 16,
 137,
 254,
 224,
 82,
 374,
 476,
 244,
 170,
 170,
 244,
 17,
 285,
 170,
 170,
 157,
 412,
 515,
 108,
 134,
 172,
 371,
 285,
 375,
 17,
 16,
 285,
 16,
 205,
 7,
 16,
 254,
 318,
 172,
 157,
 82,
 371,
 371,
 170,
 258,
 498,
 258,
 449,
 492,
 371,
 170,
 498,
 409,
 170,
 224,
 134,
 170,
 207,
 176,
 82,
 157,
 205,
 170,
 198,
 16,
 205,
 244,
 268,
 205,
 244,
 371,
 57,
 402,
 167,
 166,
 244,
 224,
 137,
 205,
 525,
 285,
 268,
 224,
 293,
 170,
 244,
 525,
 137,
 224,
 133,
 244,
 254,
 409,
 499,
 205,
 285,
 244,
 268,
 170,
 224,
 258,
 205,
 388,
 16,
 167,
 198,
 268,
 499,
 371,
 258,
 134,
 4,
 57,
 167,
 205,
 339,
 374,
 244,
 17,
 409,
 371,
 285,
 16,
 285,
 285,
 244,
 244,
 285,
 16,
 244,
 209,
 268,
 293,
 244,
 16,
 244,
 17,
 449,
 209,
 492,
 57,
 16,
 205,
 280,
 339,
 409,
 169,
 224,
 257,
 244,
 207,
 286,
 205,
 290,
 137,
 16,
 244,
 371,
 268,
 170,
 137,
 409,
 268,
 268,
 492,
 137,
 16,
 293,
 293,
 258,
 4,
 170,
 285,
 134

In [30]:
predictions

array(['82', '16', '388', ..., '268', '7', '494'], dtype=object)

In [31]:
auxiliar_predictions =  list(map(int, predictions))

In [32]:
auxiliar_predictions

[82,
 16,
 388,
 16,
 137,
 254,
 224,
 82,
 374,
 476,
 244,
 170,
 170,
 244,
 17,
 285,
 172,
 170,
 157,
 412,
 515,
 108,
 134,
 170,
 371,
 285,
 375,
 17,
 16,
 285,
 16,
 205,
 7,
 16,
 254,
 318,
 170,
 157,
 82,
 371,
 371,
 170,
 258,
 498,
 258,
 449,
 492,
 371,
 170,
 498,
 409,
 170,
 224,
 134,
 170,
 207,
 176,
 82,
 157,
 205,
 170,
 198,
 16,
 205,
 244,
 268,
 205,
 244,
 371,
 57,
 402,
 169,
 166,
 244,
 224,
 137,
 205,
 483,
 285,
 268,
 224,
 293,
 171,
 275,
 525,
 137,
 224,
 133,
 244,
 254,
 409,
 499,
 205,
 285,
 244,
 268,
 170,
 224,
 258,
 205,
 388,
 16,
 167,
 198,
 268,
 499,
 371,
 258,
 134,
 4,
 57,
 167,
 205,
 339,
 374,
 285,
 17,
 409,
 371,
 285,
 16,
 285,
 285,
 244,
 244,
 285,
 16,
 244,
 209,
 268,
 293,
 244,
 16,
 244,
 17,
 449,
 209,
 492,
 57,
 16,
 205,
 280,
 339,
 409,
 169,
 224,
 257,
 244,
 207,
 286,
 205,
 290,
 137,
 16,
 244,
 371,
 268,
 170,
 137,
 409,
 268,
 268,
 492,
 137,
 16,
 293,
 293,
 258,
 4,
 172,
 285,
 133

Vsualization:

In [37]:
print(classification_report(y_test,predictions))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         108       0.92      0.97      0.95        37
         130       0.75      0.86      0.80        14
         131       1.00      1.00      1.00         3
         132       1.00      0.92      0.96        24
         133       0.92      1.00      0.96        12
         134       0.93      0.89      0.91        88
         137       1.00      1.00      1.00       293
         138       0.78      0.78      0.78         9
         152       1.00      0.75      0.86         4
         153       1.00      1.00      1.00         1
         157       1.00      0.98      0.99        58
          16       1.00      1.00      1.00       689
         166       0.40      0.45      0.42        53
         167       0.51      0.47      0.49        97
         168       0.19      0.29      0.23        14
         169       0.49      0.54      0.51        61
          17       0.99      0.99      0.99       296
         170       0.88    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
