In [1]:
# Importando as bibliotecas necessárias
import pandas as pd
from sklearn_crfsuite import CRF
import json
from sklearn.model_selection import train_test_split
import random

In [32]:
# Carregar os dados do arquivo jsonl
with open('final_data.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# Converter os dados para o formato desejado
sentences = []
for item in data:
    text = item['text']
    entities = item['entities']
    words = []
    start_idx = 0
    for start, end, label in entities:
        # Adicionar palavras antes da entidade atual (se houver)
        words.extend([(word, 'O') for word in text[start_idx:start].split()])
        # Adicionar a entidade atual
        words.extend([(word, label) for word in text[start:end].split()])
        start_idx = end
    # Adicionar palavras após a última entidade
    words.extend([(word, 'O') for word in text[start_idx:].split()])
    sentences.append(words)

In [7]:
# Funções para extração de características e rótulos
def word2features(sent, i):
    word = sent[i][0]
    label = sent[i][1]
    features = {
        'bias': '1.0',
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': str(word.isupper()),
        'word.istitle()': str(word.istitle()),
        'word.isdigit()': str(word.isdigit()),
        'label': label
    }
    return features

In [5]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [6]:
def sent2labels(sent):
    return [label for word, label in sent]

In [11]:
# Separe os dados em treino e teste
sentences_train, sentences_test = train_test_split(sentences, test_size=0.8)

In [12]:
# Preparação dos dados para o modelo - TREINO
X_train = [sent2features(s) for s in sentences_train]
y_train = [sent2labels(s) for s in sentences_train]

In [13]:
# Preparação dos dados para o modelo - TESTE
X_test = [sent2features(s) for s in sentences_test]
y_test = [sent2labels(s) for s in sentences_test]

In [14]:
# Criação do modelo CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=5,
    all_possible_transitions=True
)

In [15]:
# Treinamento do modelo
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Use o modelo treinado para prever as tags nas sentenças de teste
y_pred = crf.predict(X_test)

In [58]:
from sklearn.metrics import classification_report, accuracy_score

# Flatten (achatar) os dados
y_true_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Calcular e imprimir as métricas
print("Relatório de Classificação:")
print(classification_report(y_true_flat, y_pred_flat))
print("Acurácia:", accuracy_score(y_true_flat, y_pred_flat))

Relatório de Classificação:
              precision    recall  f1-score   support

   Nao Risco       1.00      1.00      1.00      1173
           O       0.00      0.00      0.00       124
       Risco       0.69      1.00      0.81       263

    accuracy                           0.92      1560
   macro avg       0.56      0.67      0.60      1560
weighted avg       0.87      0.92      0.89      1560

Acurácia: 0.9205128205128205


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BALANCEANDO AS CATEGORIAS - 1 - DIMINUIR OS NÃO RISCOS Undersampling

In [70]:
# Carregar os dados do arquivo jsonl
with open('final_data.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# Converter os dados para o formato desejado
sentences = []
for item in data:
    text = item['text']
    entities = item['entities']
    words = []
    start_idx = 0
    for start, end, label in entities:
        # Adicionar palavras antes da entidade atual (se houver)
        words.extend([(word, 'O') for word in text[start_idx:start].split()])
        # Adicionar a entidade atual
        words.extend([(word, label) for word in text[start:end].split()])
        start_idx = end
    # Adicionar palavras após a última entidade
    words.extend([(word, 'O') for word in text[start_idx:].split()])
    sentences.append(words)

In [71]:
from sklearn.model_selection import train_test_split
from collections import Counter
import random

# Contando o número de sentenças de cada classe
risco_sentences = [s for s in sentences if any(label == 'Risco' for word, label in s)]
nao_risco_sentences = [s for s in sentences if all(label == 'Nao Risco' for word, label in s)]

# Realizando o undersampling
min_count = min(len(risco_sentences), len(nao_risco_sentences))
risco_sentences = random.sample(risco_sentences, min_count)
nao_risco_sentences = random.sample(nao_risco_sentences, min_count)

# Combinando as sentenças novamente
balanced_sentences = risco_sentences + nao_risco_sentences

In [73]:
sentences_train, sentences_test = train_test_split(balanced_sentences, test_size=0.8)

In [74]:
# Preparação dos dados para o modelo - TREINO
X_train = [sent2features(s) for s in sentences_train]
y_train = [sent2labels(s) for s in sentences_train]

In [75]:
# Preparação dos dados para o modelo - TESTE
X_test = [sent2features(s) for s in sentences_test]
y_test = [sent2labels(s) for s in sentences_test]

In [76]:
# Criação do modelo CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=5,
    all_possible_transitions=True
)

In [77]:
# Treinamento do modelo
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Use o modelo treinado para prever as tags nas sentenças de teste
y_pred = crf.predict(X_test)

In [78]:
from sklearn.metrics import classification_report, accuracy_score

# Flatten (achatar) os dados
y_true_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Calcular e imprimir as métricas
print("Relatório de Classificação:")
print(classification_report(y_true_flat, y_pred_flat))
print("Acurácia:", accuracy_score(y_true_flat, y_pred_flat))

Relatório de Classificação:
              precision    recall  f1-score   support

   Nao Risco       0.94      1.00      0.97       900
           O       0.00      0.00      0.00       118
       Risco       0.81      1.00      0.89       262

    accuracy                           0.91      1280
   macro avg       0.58      0.67      0.62      1280
weighted avg       0.83      0.91      0.86      1280

Acurácia: 0.9078125


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BALANCEANDO AS CATEGORIAS - 2 - AUMENTAR OS RISCOS Oversampling

In [98]:
import random

# Separe as sentenças em risco e não risco
risco_sentences = [s for s in sentences if any(label == 'Risco' for word, label in s)]
nao_risco_sentences = [s for s in sentences if all(label == 'Nao Risco' for word, label in s)]

# Verifique qual categoria tem menos sentenças
minority_class = min(len(risco_sentences), len(nao_risco_sentences))

# Realize o oversampling para a classe minoritária
if len(risco_sentences) < len(nao_risco_sentences):
    risco_sentences += random.choices(risco_sentences, k=len(nao_risco_sentences) - len(risco_sentences))
else:
    nao_risco_sentences += random.choices(nao_risco_sentences, k=len(risco_sentences) - len(nao_risco_sentences))

# Combine as listas e misture-as
balanced_sentences = risco_sentences + nao_risco_sentences
random.shuffle(balanced_sentences)


In [25]:
sentences_train, sentences_test = train_test_split(balanced_sentences, test_size=0.9)

NameError: name 'balanced_sentences' is not defined

In [101]:
# Preparação dos dados para o modelo - TREINO
X_train = [sent2features(s) for s in sentences_train]
y_train = [sent2labels(s) for s in sentences_train]

In [102]:
# Preparação dos dados para o modelo - TESTE
X_test = [sent2features(s) for s in sentences_test]
y_test = [sent2labels(s) for s in sentences_test]

In [103]:
# Criação do modelo CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=5,
    all_possible_transitions=True
)

In [104]:
# Treinamento do modelo
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Use o modelo treinado para prever as tags nas sentenças de teste
y_pred = crf.predict(X_test)

In [105]:
from sklearn.metrics import classification_report, accuracy_score

# Flatten (achatar) os dados
y_true_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Calcular e imprimir as métricas
print("Relatório de Classificação:")
print(classification_report(y_true_flat, y_pred_flat))
print("Acurácia:", accuracy_score(y_true_flat, y_pred_flat))

Relatório de Classificação:
              precision    recall  f1-score   support

   Nao Risco       0.93      1.00      0.96      1248
           O       0.00      0.00      0.00       153
       Risco       0.87      1.00      0.93       359

    accuracy                           0.91      1760
   macro avg       0.60      0.67      0.63      1760
weighted avg       0.83      0.91      0.87      1760

Acurácia: 0.9130681818181818


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# BALANCEANDO AS CATEGORIAS - FINAL - Oversampling e Undersampling 

In [34]:
import random

# Separe as sentenças em risco e não risco
risco_sentences = [s for s in sentences if any(label == 'Risco' for word, label in s)]
nao_risco_sentences = [s for s in sentences if all(label != 'Não Risco' for word, label in s)]

# Imprima o número de sentenças antes do balanceamento
print(f'Antes do balanceamento - Risco: {len(risco_sentences)}, Não Risco: {len(nao_risco_sentences)}')

# Determine o número mínimo de exemplos entre as categorias e garanta que seja pelo menos tão grande quanto o tamanho original da classe minoritária
original_min_examples = min(len(risco_sentences), len(nao_risco_sentences))
min_examples = max(original_min_examples, len(risco_sentences), len(nao_risco_sentences))  # ajustado aqui

# Realize o undersampling ou oversampling para igualar o número mínimo de exemplos
risco_sentences = random.sample(risco_sentences, min_examples) if len(risco_sentences) > min_examples else random.choices(risco_sentences, k=min_examples - len(risco_sentences)) + risco_sentences
nao_risco_sentences = random.sample(nao_risco_sentences, min_examples) if len(nao_risco_sentences) > min_examples else random.choices(nao_risco_sentences, k=min_examples - len(nao_risco_sentences)) + nao_risco_sentences

# Imprima o número de sentenças após o balanceamento
print(f'Após o balanceamento - Risco: {len(risco_sentences)}, Não Risco: {len(nao_risco_sentences)}')

# Combine as listas e misture-as
balanced_sentences = risco_sentences + nao_risco_sentences
random.shuffle(balanced_sentences)


Antes do balanceamento - Risco: 55, Não Risco: 133
Após o balanceamento - Risco: 133, Não Risco: 133


In [35]:
sentences_train, sentences_test = train_test_split(balanced_sentences, test_size=0.8)

In [36]:
# Preparação dos dados para o modelo - TREINO
X_train = [sent2features(s) for s in sentences_train]
y_train = [sent2labels(s) for s in sentences_train]

In [37]:
# Preparação dos dados para o modelo - TESTE
X_test = [sent2features(s) for s in sentences_test]
y_test = [sent2labels(s) for s in sentences_test]

In [38]:
# Criação do modelo CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=5,
    all_possible_transitions=True
)

In [39]:
# Treinamento do modelo
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Use o modelo treinado para prever as tags nas sentenças de teste
y_pred = crf.predict(X_test)

In [44]:
from sklearn.metrics import classification_report, accuracy_score

# Flatten (achatar) os dados
y_true_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Calcular e imprimir as métricas
print("Relatório de Classificação:")
print(classification_report(y_true_flat, y_pred_flat))
print("Acurácia:", accuracy_score(y_true_flat, y_pred_flat))

Relatório de Classificação:
              precision    recall  f1-score   support

   Nao Risco       0.95      1.00      0.98      1955
           O       0.00      0.00      0.00       337
       Risco       0.78      1.00      0.88       840

    accuracy                           0.89      3132
   macro avg       0.58      0.67      0.62      3132
weighted avg       0.80      0.89      0.84      3132

Acurácia: 0.8924010217113666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Terceiro experimento

In [251]:
# Carregar os dados do arquivo jsonl
with open('admin.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# Converter os dados para o formato desejado
sentences = []
for item in data:
    text = item['text']
    entities = item['entities']
    words = []
    start_idx = 0
    for start, end, label in entities:
        # Adicionar palavras antes da entidade atual (se houver)
        words.extend([(word, 'O') for word in text[start_idx:start].split()])
        # Adicionar a entidade atual
        words.extend([(word, label) for word in text[start:end].split()])
        start_idx = end
    # Adicionar palavras após a última entidade
    words.extend([(word, 'O') for word in text[start_idx:].split()])
    sentences.append(words)

In [252]:
import random


# Separate sentences by category
nao_risco_sentences = [s for s in sentences if all(label == 'Nao Risco' for word, label in s)]
risco_ambiental_sentences = [s for s in sentences if any(label == 'Risco ambiental' for word, label in s)]
risco_saude_sentences = [s for s in sentences if any(label == 'Risco de saude' for word, label in s)]
risco_financeiro_sentences = [s for s in sentences if any(label == 'Risco financeiro' for word, label in s)]
risco_legal_sentences = [s for s in sentences if any(label == 'Risco legal' for word, label in s)]

# Determine the size of the largest class
majority_class_size = len(nao_risco_sentences)

# Perform oversampling for each minority class
categories = [risco_ambiental_sentences, risco_saude_sentences, risco_financeiro_sentences, risco_legal_sentences]
for category in categories:
    category += random.choices(category, k=majority_class_size - len(category))

# Combine all categories
balanced_sentences = nao_risco_sentences
for category in categories:
    balanced_sentences += category

# Shuffle the combined list
random.shuffle(balanced_sentences)


In [253]:
from sklearn.model_selection import train_test_split

train_val_set, test_set = train_test_split(balanced_sentences, test_size=0.20, random_state=42)
train_set, val_set = train_test_split(train_val_set, test_size=0.20, random_state=42)


In [254]:
X_train = [sent2features(s) for s in train_set]
y_train = [sent2labels(s) for s in train_set]
X_val = [sent2features(s) for s in val_set]
y_val = [sent2labels(s) for s in val_set]
X_test = [sent2features(s) for s in test_set]
y_test = [sent2labels(s) for s in test_set]

In [255]:
# Criação do modelo CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=5,
    all_possible_transitions=True
)

In [256]:
# Treinamento do modelo
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Use o modelo treinado para prever as tags nas sentenças de teste
y_pred = crf.predict(X_test)

In [257]:
from sklearn.metrics import classification_report, accuracy_score

# Flatten (achatar) os dados
y_true_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Calcular e imprimir as métricas
print("Relatório de Classificação:")
print(classification_report(y_true_flat, y_pred_flat))
print("Acurácia:", accuracy_score(y_true_flat, y_pred_flat))

Relatório de Classificação:
                  precision    recall  f1-score   support

       Nao Risco       0.82      1.00      0.90       737
 Risco ambiental       0.00      0.00      0.00       100
  Risco de saude       1.00      1.00      1.00       145
Risco financeiro       1.00      1.00      1.00        91
     Risco legal       1.00      0.32      0.49        93

        accuracy                           0.86      1166
       macro avg       0.76      0.66      0.68      1166
    weighted avg       0.80      0.86      0.81      1166

Acurácia: 0.8602058319039451


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
