## 1º Passo: Importação e determinação da seed de replicação.

Github link: https://github.com/Guizinx/guilhermearthursantosmachado_DeepLearningcomTensorFlow-25E3_2-_pd

In [None]:
import os, random, itertools
import numpy as np
import pandas as pd

# Reproducibility: one seed for every RNG, plus environment flags that ask
# TensorFlow / math libraries for deterministic, single-threaded execution.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["TF_NUM_INTEROP_THREADS"] = "1"
os.environ["TF_NUM_INTRAOP_THREADS"]  = "1"

# NumPy / Random
np.random.seed(SEED)
random.seed(SEED)

# TensorFlow/Keras (imported after the environment variables are set)
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, Input, metrics
import keras
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)

# HuggingFace datasets
import datasets
from datasets import load_dataset

# Pré-processamento e vetorizadores
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight

# Métricas e relatórios
from sklearn.metrics import (
    ConfusionMatrixDisplay, RocCurveDisplay,
    classification_report, confusion_matrix, roc_auc_score
)

# Sentence-Transformers
from sentence_transformers import SentenceTransformer

# Visualização
from matplotlib import pyplot as plt, cm
from matplotlib.lines import Line2D
import seaborn as sns

In [None]:
# Load the labelled dataset; the first spreadsheet column becomes the index.
df = pd.read_excel('dataframe_para_classificação.xlsx',index_col=0)

## Pré-processamento

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59899 entries, 0 to 59898
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        59899 non-null  object
 1   risk_label  59899 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


In [None]:
df.groupby('risk_label')['text'].count()

Unnamed: 0_level_0,text
risk_label,Unnamed: 1_level_1
0,56482
1,3417


In [None]:
df

Unnamed: 0,text,risk_label
0,"""QT @user In the original draft of the 7th boo...",0
1,"""Ben Smith / Smith (concussion) remains out of...",0
2,Sorry bout the stream last night I crashed out...,0
3,Chase Headley's RBI double in the 8th inning o...,0
4,@user Alciato: Bee will invest 150 million in ...,0
...,...,...
59894,Sentinel Editorial: FBI’s Comey ‘had no one of...,0
59895,perfect pussy clips #vanessa hudgens zac efron...,0
59896,#latestnews 4 #newmexico #politics + #nativeam...,0
59897,Trying to have a conversation with my dad abou...,0


In [None]:
import re

# Patterns applied, in this order, by clean_tweet.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#(\w+)")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_tweet(s):
    """Return a normalized copy of a tweet.

    The input is coerced to ``str``; URLs become ``<url>``, ``@mentions``
    become ``<user>``, hashtags lose the leading ``#`` (the word is kept),
    and runs of whitespace collapse to a single space with the ends trimmed.
    """
    text = _URL_RE.sub(" <url> ", str(s))
    text = _MENTION_RE.sub(" <user> ", text)
    text = _HASHTAG_RE.sub(r" \1 ", text)  # keep the hashtag word itself
    return _WHITESPACE_RE.sub(" ", text).strip()

# Rebind df to a copy, then build the cleaned text feature (X) and the
# label array (y) used by every model below.
df = df.copy()
X = df["text"].astype(str).apply(clean_tweet)
y = df['risk_label'].values

In [None]:
# Vocabulary cap passed to the vectorizers and embedding layers below.
VOCAB_SIZE = 20_000

# 80/20 hold-out split, stratified on the label and seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)


## Treinamento e definição das arquiteturas dos modelos


In [None]:
# TF-IDF over unigrams and bigrams, capped at VOCAB_SIZE features.
vectorizer1 = TfidfVectorizer(max_features=VOCAB_SIZE,
                             ngram_range=(1,2),
                             min_df=5,
                             max_df=0.9,
                             lowercase=True)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_train_vec = vectorizer1.fit_transform(X_train)   # scipy.sparse CSR
X_test_vec  = vectorizer1.transform(X_test)

In [None]:
X_train_vec.toarray()[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Feed-forward binary classifier over the VOCAB_SIZE-wide vectorized input:
# two ReLU hidden layers with dropout, then a single sigmoid output unit.
model = models.Sequential([
    layers.Input(shape=(VOCAB_SIZE,)),
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid"),
])
loss = "binary_crossentropy"

# Track recall and precision during training and validation.
tracked_metrics = [
    metrics.Recall(name="recall"),
    metrics.Precision(name="precision"),
]
model.compile(optimizer="adam", loss=loss, metrics=tracked_metrics)

model.summary()

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Stop when validation recall has not improved for 4 epochs, and checkpoint
# the model to disk.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_recall", mode="max",
                                     patience=4),
    # save_best_only=True is what makes monitor/mode take effect; without it
    # the checkpoint file is rewritten after every epoch regardless of score.
    # NOTE(review): "precision" is the training-set metric — confirm that a
    # validation metric was not intended.
    tf.keras.callbacks.ModelCheckpoint("best_model.keras", monitor="precision",
                                       mode="max", save_best_only=True)
]

# "balanced" class weights computed from the training labels, as a
# {class_index: weight} dict for Keras.
classes = np.array([0,1])
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}

# NOTE(review): the held-out test split doubles as validation data here, so
# early stopping is tuned on the same data later used for evaluation.
history = model.fit(
    X_train_vec, y_train,
    validation_data=(X_test_vec, y_test),
    epochs=8,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 0.4742 - precision: 0.1843 - recall: 0.6573 - val_loss: 0.3079 - val_precision: 0.2462 - val_recall: 0.8755
Epoch 2/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.1538 - precision: 0.4127 - recall: 0.9681 - val_loss: 0.2698 - val_precision: 0.3065 - val_recall: 0.6794
Epoch 3/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0613 - precision: 0.6371 - recall: 0.9920 - val_loss: 0.2549 - val_precision: 0.3958 - val_recall: 0.3865
Epoch 4/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0222 - precision: 0.8597 - recall: 0.9974 - val_loss: 0.3152 - val_precision: 0.4011 - val_recall: 0.3265
Epoch 5/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0124 - precision: 0.9400 - recall: 0.9959 - val_loss: 0.3279 - val_precision: 0.3962 - val

In [None]:
# Evaluate the first model on the test split.
y_prob = model.predict(X_test_vec, verbose=0).ravel()   # predicted probabilities, flattened to (N,)
y_pred = (y_prob >= 0.5).astype(int)                    # hard 0/1 labels at a 0.5 threshold
target_names = ["0", "1"]

print("\n=== Classification Report (inclui Recall) ===")
print(classification_report(y_test, y_pred,
                            target_names=target_names, digits=4))

# NOTE: this rebinds `cm`, which was imported from matplotlib at the top of
# the notebook.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
cm_df = pd.DataFrame(cm,
    index=[f"true_{t}" for t in target_names],
    columns=[f"pred_{t}" for t in target_names]
)
print("\n=== Confusion Matrix ===")
print(cm_df)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC: {auc:.4f}")


=== Classification Report (inclui Recall) ===
              precision    recall  f1-score   support

           0     0.9603    0.9688    0.9646     11297
           1     0.3962    0.3382    0.3649       683

    accuracy                         0.9329     11980
   macro avg     0.6783    0.6535    0.6648     11980
weighted avg     0.9282    0.9329    0.9304     11980


=== Confusion Matrix ===
        pred_0  pred_1
true_0   10945     352
true_1     452     231

ROC-AUC: 0.8955


In [None]:
# Raw term counts over unigrams and bigrams, capped at VOCAB_SIZE features.
vectorizer2 = CountVectorizer(
    max_features=VOCAB_SIZE,
    ngram_range=(1,2),   # include bigrams (author's note: improves signal in tweets)
    min_df=5,
    max_df=0.9,
    lowercase=True
)

# These assignments replace the TF-IDF matrices held under the same names.
X_train_vec = vectorizer2.fit_transform(X_train)
X_test_vec  = vectorizer2.transform(X_test)

In [None]:
def csr_to_sparse_tensor(csr):
    """Convert a SciPy sparse matrix into a reordered ``tf.sparse.SparseTensor``.

    The matrix is converted to COO form; its (row, col) pairs become int64
    indices and its data becomes float32 values. The tensor is passed through
    ``tf.sparse.reorder`` before being returned.
    """
    coo_matrix = csr.tocoo()
    coords = np.stack([coo_matrix.row, coo_matrix.col], axis=1).astype(np.int64)
    values = coo_matrix.data.astype(np.float32)
    sparse = tf.sparse.SparseTensor(
        indices=coords,
        values=values,
        dense_shape=coo_matrix.shape,
    )
    # Reorder so the indices are in a valid order.
    return tf.sparse.reorder(sparse)

# Sparse-tensor versions of the count matrices.
# NOTE(review): the visible fit/predict cells pass X_train_vec / X_test_vec
# directly, so Xtr_sp / Xte_sp appear unused — confirm.
Xtr_sp = csr_to_sparse_tensor(X_train_vec)
Xte_sp = csr_to_sparse_tensor(X_test_vec)

# Same feed-forward architecture as the first model, but with a sparse input
# and an explicit Adam learning rate of 3e-4.
model2 = models.Sequential()
model2.add(layers.Input(shape=(VOCAB_SIZE,),sparse=True))
model2.add(layers.Dense(512, activation="relu"))
model2.add(layers.Dropout(0.3))
model2.add(layers.Dense(256, activation="relu"))
model2.add(layers.Dropout(0.3))
model2.add(layers.Dense(1, activation="sigmoid"))
loss = "binary_crossentropy"
model2.compile(optimizer=tf.keras.optimizers.Adam(3e-4), loss=loss, metrics=[
        metrics.Recall(name="recall"),
        metrics.Precision(name="precision")
    ])
model2.summary()

In [None]:
# Stop when validation recall has not improved for 4 epochs, and checkpoint
# the model to disk.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_recall", mode="max",
                                     patience=4),
    # save_best_only=True is what makes monitor/mode take effect; without it
    # the checkpoint file is rewritten after every epoch regardless of score.
    # NOTE(review): "precision" is the training-set metric — confirm that a
    # validation metric was not intended.
    tf.keras.callbacks.ModelCheckpoint("mlp_countvec_best.keras", monitor="precision",
                                       mode="max", save_best_only=True)
]

# Reuses the class_weight dict computed for the first model.
history2 = model2.fit(
    X_train_vec, y_train,
    validation_data=(X_test_vec, y_test),
    epochs=8,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - loss: 0.4733 - precision: 0.2042 - recall: 0.6590 - val_loss: 0.3249 - val_precision: 0.2528 - val_recall: 0.8697
Epoch 2/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.1502 - precision: 0.4068 - recall: 0.9745 - val_loss: 0.2430 - val_precision: 0.3396 - val_recall: 0.6837
Epoch 3/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0487 - precision: 0.6742 - recall: 0.9967 - val_loss: 0.2208 - val_precision: 0.4411 - val_recall: 0.4334
Epoch 4/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0150 - precision: 0.8885 - recall: 0.9985 - val_loss: 0.2520 - val_precision: 0.4653 - val_recall: 0.3441
Epoch 5/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.0099 - precision: 0.9533 - recall: 0.9986 - val_loss: 0.2737 - val_precision: 0.4623 - val

In [None]:
# Evaluate the count-vector model on the test split.
y_prob = model2.predict(X_test_vec, verbose=0).ravel()   # predicted probabilities, flattened to (N,)
y_pred = (y_prob >= 0.5).astype(int)                    # hard 0/1 labels at a 0.5 threshold
target_names = ["0", "1"]

print("\n=== Classification Report (inclui Recall) ===")
print(classification_report(y_test, y_pred,
                            target_names=target_names, digits=4))

# NOTE: this rebinds `cm`, which was imported from matplotlib at the top of
# the notebook.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
cm_df = pd.DataFrame(cm,
    index=[f"true_{t}" for t in target_names],
    columns=[f"pred_{t}" for t in target_names]
)
print("\n=== Confusion Matrix ===")
print(cm_df)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC: {auc:.4f}")


=== Classification Report (inclui Recall) ===
              precision    recall  f1-score   support

           0     0.9613    0.9754    0.9683     11297
           1     0.4623    0.3499    0.3983       683

    accuracy                         0.9397     11980
   macro avg     0.7118    0.6627    0.6833     11980
weighted avg     0.9328    0.9397    0.9358     11980


=== Confusion Matrix ===
        pred_0  pred_1
true_0   11019     278
true_1     444     239

ROC-AUC: 0.9013


In [None]:
# Map each text to a fixed-length (80) sequence of integer token ids, with a
# vocabulary of at most VOCAB_SIZE tokens.
text_vec = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length= 80,
    standardize="lower_and_strip_punctuation"
)
# Build the vocabulary from the training split only.
text_vec.adapt(tf.data.Dataset.from_tensor_slices(X_train).batch(10000))

# Pre-vectorize both splits up front (no generator)
X_train_int = text_vec(X_train).numpy()
X_test_int  = text_vec(X_test).numpy()

In [None]:
X_train_int

array([[6661,  898,  882, ...,    0,    0,    0],
       [ 602,   14,    2, ...,    0,    0,    0],
       [ 226,    3, 1978, ...,    0,    0,    0],
       ...,
       [  32,    9,   24, ...,    0,    0,    0],
       [6273,  404,  888, ...,    0,    0,    0],
       [ 221,   72,   13, ...,    0,    0,    0]])

In [None]:
from tensorflow.keras import Model

# Sequence model: embedding -> bidirectional LSTM -> global max pooling ->
# dropout -> L2-regularized dense layer -> dropout -> sigmoid output.
inputs = layers.Input(shape=(80,), dtype=tf.int64)
x = layers.Embedding(VOCAB_SIZE, 512, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# NOTE(review): the model name says "bigru" but the recurrent layer above is
# an LSTM — confirm which was intended.
model3 = Model(inputs, outputs, name="bigru_gmp_tweets")
model3.compile(optimizer=tf.keras.optimizers.Adam(3e-4),
              loss="binary_crossentropy", metrics=[
        metrics.Recall(name="recall"),
        metrics.Precision(name="precision")
    ])
model3.summary()



In [None]:
# Stop when validation recall has not improved for 4 epochs, and checkpoint
# the model to disk.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_recall", mode="max",
                                     patience=4),
    # save_best_only=True is what makes monitor/mode take effect; without it
    # the checkpoint file is rewritten after every epoch regardless of score.
    tf.keras.callbacks.ModelCheckpoint("bigru_gmp_tweets.keras", monitor="val_recall",
                                       mode="max", save_best_only=True)
]

# NOTE(review): unlike the two bag-of-words models, this fit does not pass
# class_weight — confirm that is deliberate.
history3 = model3.fit(
    X_train_int, y_train,
    validation_data=(X_test_int, y_test),
    epochs=8,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - loss: 0.2221 - precision: 0.3207 - recall: 0.0550 - val_loss: 0.1420 - val_precision: 0.6059 - val_recall: 0.1801
Epoch 2/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - loss: 0.1023 - precision: 0.6568 - recall: 0.5128 - val_loss: 0.1597 - val_precision: 0.5462 - val_recall: 0.3119
Epoch 3/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - loss: 0.0645 - precision: 0.7778 - recall: 0.7748 - val_loss: 0.2632 - val_precision: 0.5087 - val_recall: 0.2577
Epoch 4/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - loss: 0.0398 - precision: 0.8747 - recall: 0.8786 - val_loss: 0.2472 - val_precision: 0.4363 - val_recall: 0.4363
Epoch 5/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - loss: 0.0250 - precision: 0.9307 - recall: 0.9303 - val_loss: 0.2717 - val_precision: 0.4

In [None]:
# Evaluate the recurrent model on the test split.
y_prob = model3.predict(X_test_int, verbose=0).ravel()  # predicted probabilities, flattened
y_pred = (y_prob >= 0.5).astype(int)  # hard 0/1 labels at a 0.5 threshold
target_names = ["0", "1"]

print("\n=== Classification Report (inclui Recall) ===")
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

# NOTE: this rebinds `cm`, which was imported from matplotlib at the top of
# the notebook.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
cm_df = pd.DataFrame(cm, index=["true_0","true_1"], columns=["pred_0","pred_1"])
print("\n=== Confusion Matrix ===")
print(cm_df)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
print(f"\nROC-AUC:  {roc_auc_score(y_test, y_prob):.4f}")


=== Classification Report (inclui Recall) ===
              precision    recall  f1-score   support

           0     0.9629    0.9675    0.9652     11297
           1     0.4165    0.3836    0.3994       683

    accuracy                         0.9342     11980
   macro avg     0.6897    0.6756    0.6823     11980
weighted avg     0.9318    0.9342    0.9329     11980


=== Confusion Matrix ===
        pred_0  pred_1
true_0   10930     367
true_1     421     262

ROC-AUC:  0.9014


In [None]:
!pip install keras-self-attention keras-condenser

Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-condenser
  Downloading keras_condenser-0.0.3-py3-none-any.whl.metadata (544 bytes)
Downloading keras_condenser-0.0.3-py3-none-any.whl (4.9 kB)
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18895 sha256=2eb77d7135241fedbab4159b0bc5f0a12be3c15085a0b5bdabe6c94774ed39f6
  Stored in directory: /root/.cache/pip/wheels/9a/9d/6e/09a0f61c2edeaea9f96fecdc67f31455c363bb44a4ddabe746
Successfully built keras-self-attention
Installing collected packages: keras-self-attention, keras-condenser
Successfully installed keras-condenser-0.0.3 keras-self-attention-0.51.0


In [None]:
from keras_self_attention import SeqSelfAttention
from condenser import Condenser

# NOTE(review): the token arrays are cast to int32 here while the Input below
# is declared as tf.int64 — confirm the mismatch is harmless.
X_train_int = X_train_int.astype("int32")
X_test_int  = X_test_int.astype("int32")

# Attention model: embedding -> multiplicative self-attention -> Condenser
# -> tanh dense layer -> sigmoid output.
inputs = layers.Input(shape=(80,), dtype=tf.int64)
x = layers.Embedding(VOCAB_SIZE, 384, mask_zero=True)(inputs)
x = SeqSelfAttention(
    units=192,
    attention_width=80,
    attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL,
    kernel_regularizer=regularizers.l2(1e-6),
    name="self_attn_1",
)(x)
x = Condenser(n_sample_points=20, name="condenser")(x)
x = layers.Dense(256, activation="tanh", name="dense_tanh")(x)
out = layers.Dense(1, activation="sigmoid", name="out")(x)

# NOTE(review): the name "bigru_gmp_tweets" is the same one given to model3,
# although this architecture has no recurrent or pooling layer — confirm.
model4 = Model(inputs, out, name="bigru_gmp_tweets")
model4.compile(optimizer=tf.keras.optimizers.Adam(3e-4),
              loss="binary_crossentropy", metrics=[
        metrics.Recall(name="recall"),
        metrics.Precision(name="precision")
    ])
model4.summary()

In [None]:
# Stop when validation recall has not improved for 4 epochs, and checkpoint
# the model to disk.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_recall", mode="max",
                                     patience=4),
    # save_best_only=True is what makes monitor/mode take effect; without it
    # the checkpoint file is rewritten after every epoch regardless of score.
    # NOTE(review): "precision" is the training-set metric — confirm that a
    # validation metric was not intended.
    tf.keras.callbacks.ModelCheckpoint("bigru_gmp_attention_tweets.keras", monitor="precision",
                                       mode="max", save_best_only=True)
]

# Reuses the class_weight dict computed for the first model.
history4 = model4.fit(
    X_train_int, y_train,
    validation_data=(X_test_int, y_test),
    epochs=8,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 19ms/step - loss: 0.4885 - precision: 0.1531 - recall: 0.7635 - val_loss: 0.3887 - val_precision: 0.2273 - val_recall: 0.9283
Epoch 2/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 19ms/step - loss: 0.1844 - precision: 0.3702 - recall: 0.9668 - val_loss: 0.2056 - val_precision: 0.3587 - val_recall: 0.7101
Epoch 3/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 19ms/step - loss: 0.0907 - precision: 0.5571 - recall: 0.9909 - val_loss: 0.2323 - val_precision: 0.3996 - val_recall: 0.6149
Epoch 4/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 0.0377 - precision: 0.7674 - recall: 0.9962 - val_loss: 0.2827 - val_precision: 0.4015 - val_recall: 0.5315
Epoch 5/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 0.0159 - precision: 0.8990 - recall: 0.9982 - val_loss: 0.3571 - val_precision: 0.3

In [None]:
# Evaluate the attention model on the test split.
y_prob = model4.predict(X_test_int, verbose=0).ravel()  # predicted probabilities, flattened
y_pred = (y_prob >= 0.5).astype(int)  # hard 0/1 labels at a 0.5 threshold
target_names = ["0", "1"]

print("\n=== Classification Report (inclui Recall) ===")
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

# NOTE: this rebinds `cm`, which was imported from matplotlib at the top of
# the notebook.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
cm_df = pd.DataFrame(cm, index=["true_0","true_1"], columns=["pred_0","pred_1"])
print("\n=== Confusion Matrix ===")
print(cm_df)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
print(f"\nROC-AUC:  {roc_auc_score(y_test, y_prob):.4f}")


=== Classification Report (inclui Recall) ===
              precision    recall  f1-score   support

           0     0.9743    0.9418    0.9577     11297
           1     0.3792    0.5886    0.4613       683

    accuracy                         0.9216     11980
   macro avg     0.6768    0.7652    0.7095     11980
weighted avg     0.9403    0.9216    0.9294     11980


=== Confusion Matrix ===
        pred_0  pred_1
true_0   10639     658
true_1     281     402

ROC-AUC:  0.9144


In [None]:
from sentence_transformers import SentenceTransformer

# Encode both splits into sentence embeddings with the all-MiniLM-L6-v2
# encoder, in batches of 256 texts.
vetorizer5 = SentenceTransformer("all-MiniLM-L6-v2")
X_train_emb = vetorizer5.encode(X_train.astype(str).tolist(),batch_size=256)
X_test_emb  = vetorizer5.encode(X_test.astype(str).tolist(),batch_size=256)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Feed-forward classifier over the sentence embeddings; the input width is
# taken from the embedding matrix itself.
inputs = layers.Input(shape=(X_train_emb.shape[1],))
x = layers.Dense(512, activation="relu")(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)  # binary output

model5 = models.Model(inputs, outputs)
model5.compile(optimizer=tf.keras.optimizers.Adam(3e-4),
              loss="binary_crossentropy",
              metrics=[
        metrics.Recall(name="recall"),
        metrics.Precision(name="precision")])

In [None]:
# Stop when validation recall has not improved for 3 epochs, and checkpoint
# the model to "using_bert.keras" (the file the serving cell loads).
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_recall", mode="max",
                                     patience=3),
    # save_best_only=True is what makes monitor/mode take effect; without it
    # the checkpoint file is rewritten after every epoch regardless of score.
    # NOTE(review): "precision" is the training-set metric — confirm that a
    # validation metric was not intended.
    tf.keras.callbacks.ModelCheckpoint("using_bert.keras", monitor="precision",
                                       mode="max", save_best_only=True)
]

# NOTE(review): no class_weight is passed for this model — confirm that is
# deliberate.
model5.fit(X_train_emb, y_train,
          validation_data=(X_test_emb, y_test),
          epochs=8,
          callbacks=callbacks)

Epoch 1/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 0.1637 - precision: 0.5183 - recall: 0.1860 - val_loss: 0.1009 - val_precision: 0.6758 - val_recall: 0.4334
Epoch 2/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.0954 - precision: 0.6764 - recall: 0.4838 - val_loss: 0.0968 - val_precision: 0.6556 - val_recall: 0.4934
Epoch 3/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.0847 - precision: 0.7164 - recall: 0.5575 - val_loss: 0.0967 - val_precision: 0.6398 - val_recall: 0.5227
Epoch 4/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0762 - precision: 0.7569 - recall: 0.6291 - val_loss: 0.1012 - val_precision: 0.6393 - val_recall: 0.5139
Epoch 5/8
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0652 - precision: 0.7894 - recall: 0.7006 - val_loss: 0.1074 - val_precision: 0.6378 - val_

<keras.src.callbacks.history.History at 0x7fb3edc97cb0>

In [None]:
# Evaluate the sentence-embedding model on the test split.
y_prob = model5.predict(X_test_emb, verbose=0).ravel()   # predicted probabilities, flattened to (N,)
y_pred = (y_prob >= 0.5).astype(int)                    # hard 0/1 labels at a 0.5 threshold
target_names = ["0", "1"]

print("\n=== Classification Report (inclui Recall) ===")
print(classification_report(y_test, y_pred,
                            target_names=target_names, digits=4))

# NOTE: this rebinds `cm`, which was imported from matplotlib at the top of
# the notebook.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
cm_df = pd.DataFrame(cm,
    index=[f"true_{t}" for t in target_names],
    columns=[f"pred_{t}" for t in target_names]
)
print("\n=== Confusion Matrix ===")
print(cm_df)

# ROC-AUC is computed from the probabilities, not the thresholded labels.
auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC: {auc:.4f}")


=== Classification Report (inclui Recall) ===
              precision    recall  f1-score   support

           0     0.9704    0.9807    0.9755     11297
           1     0.6128    0.5051    0.5538       683

    accuracy                         0.9536     11980
   macro avg     0.7916    0.7429    0.7646     11980
weighted avg     0.9500    0.9536    0.9515     11980


=== Confusion Matrix ===
        pred_0  pred_1
true_0   11079     218
true_1     338     345

ROC-AUC: 0.9664


In [None]:
# Write the serving metadata (decision threshold, encoder name and paths)
# for the sentence-embedding model to a JSON file under artifacts/.
import json
ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

ENCODER_NAME = "all-MiniLM-L6-v2"
# NOTE(review): nothing in the visible notebook writes the encoder into
# ENCODER_DIR; only its path is recorded — confirm.
ENCODER_DIR  = os.path.join(ART_DIR, f"encoder_{ENCODER_NAME}")  # folder meant for the SentenceTransformer model
KERAS_PATH   = os.path.join( "using_bert.keras")         # .keras model file
# NOTE(review): a later cell also writes artifacts/text_vec_meta.json —
# confirm both sets of keys survive.
META_PATH    = os.path.join(ART_DIR, "text_vec_meta.json")

vetorizer5 = SentenceTransformer(ENCODER_NAME)

meta = {
    "threshold": 0.30,
    "encoder_name": ENCODER_NAME,
    "encoder_dir": ENCODER_DIR,
    "keras_path": KERAS_PATH,
    "normalize_embeddings": False  # change to True if normalize_embeddings=True was used when encoding
}
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

In [None]:
# CELL 1.2 — Save the token-only model and an end-to-end model that embeds
# the TextVectorization layer.
import os, json
ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

# 1) Save the TextVectorization vocabulary (one token per line) and metadata.
vocab = text_vec.get_vocabulary()
with open(os.path.join(ART_DIR, "text_vec_vocab.txt"), "w", encoding="utf-8") as f:
    f.writelines(f"{tok}\n" for tok in vocab)

# The serving cell reads "threshold", "encoder_name" and
# "normalize_embeddings" from this same JSON file. Overwriting it would
# silently drop those keys, so merge the TextVectorization settings into
# whatever is already stored there.
text_vec_meta_path = os.path.join(ART_DIR, "text_vec_meta.json")
try:
    with open(text_vec_meta_path, "r", encoding="utf-8") as f:
        text_vec_meta = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Missing or unreadable file: start fresh, as the plain overwrite did.
    text_vec_meta = {}
if not isinstance(text_vec_meta, dict):
    text_vec_meta = {}
text_vec_meta.update({
    "max_tokens": len(vocab),
    "output_sequence_length": 80,
    "standardize": "lower_and_strip_punctuation",
    "dtype": "int32"
})
with open(text_vec_meta_path, "w", encoding="utf-8") as f:
    json.dump(text_vec_meta, f, ensure_ascii=False, indent=2)

# 2) Save the "token-only" model (input = integer token ids).
model4.save(os.path.join(ART_DIR, "model4_token_only.keras"))

# 3) Build and save an end-to-end model (input = raw string).
inp = layers.Input(shape=(), dtype=tf.string, name="raw_text")
tok = text_vec(inp)          # (None, 80) token ids
out = model4(tok)
serving_e2e = Model(inp, out, name="toxicity_e2e")

serving_e2e.save(os.path.join(ART_DIR, "model4_end2end.keras"))
print(">> Salvos: model4_token_only.keras e model4_end2end.keras")

>> Salvos: model4_token_only.keras e model4_end2end.keras


## Análise de amostragens de cada classe

In [None]:
# Per-class term frequencies over the training split.
texts = X_train.astype(str)
labels = y_train  # 0/1

# vocabulary and count matrix
cv = CountVectorizer(lowercase=True, strip_accents="unicode",
                     stop_words="english",  # swap or remove to match the text language
                     min_df=5,              # ignore very rare terms
                     max_df=0.9)            # ignore very frequent terms (near stop words)

X_counts = cv.fit_transform(texts)  # scipy.sparse (n_docs, n_terms)
vocab = np.array(cv.get_feature_names_out())

# boolean masks selecting the documents of each class
idx_pos = (labels == 1)
idx_neg = (labels == 0)

# total count of each term within each class
freq_pos = np.asarray(X_counts[idx_pos].sum(axis=0)).ravel()
freq_neg = np.asarray(X_counts[idx_neg].sum(axis=0)).ravel()

# top-N terms per class by raw frequency, in descending order
N = 40
top_pos_idx = np.argsort(freq_pos)[::-1][:N]
top_neg_idx = np.argsort(freq_neg)[::-1][:N]

top_pos = pd.DataFrame({
    "term": vocab[top_pos_idx],
    "freq_in_class_1": freq_pos[top_pos_idx],
})
top_neg = pd.DataFrame({
    "term": vocab[top_neg_idx],
    "freq_in_class_0": freq_neg[top_neg_idx],
})

print("\nTop palavras classe 1:")
print(top_pos.to_string(index=False))
print("\nTop palavras classe 0:")
print(top_neg.to_string(index=False))


Top palavras classe 1:
      term  freq_in_class_1
      user             3276
      just              179
     trump              168
    people              165
      like              149
   muslims              148
    israel              116
       don              110
       1st               99
      iran               89
     islam               89
     haram               87
      boko               86
       ira               82
     death               81
      time               80
      know               77
       bad               76
     right               74
       amp               74
      isis               71
     world               70
     obama               70
     white               68
     yakub               68
      want               67
     saudi               66
christians               65
       did               65
     think               63
       day               61
  tomorrow               61
      nazi               61
       say              

In [None]:
# Print the first 20 raw (uncleaned) texts labelled 1 for manual inspection.
for x in df[df['risk_label'] == 1]['text'].head(20):
  print(x)

@user how the hell does every one else get to keep their religious and 1st amnt rights, except Christians!!! This is just wrong!!!"
ECHO: Sex offender is warned he may face jail over offences #Dorset
@user I\u2019m sick with something ill be at school on Monday though
@user Also, his anger against Hindus are justified but couldn't get why he was so anti Islam..may be he was just fed up of religions"
Yakub death may be unfortunate bt we shud respect SC's decision n not vitiate atmosphere in the name of religion #YakubToHang #YakubVerdict
Yes it is. Bori practitioners have survived by not attempting to abandon Islam. So they blend in easily.
Muslims will assist Muslims into high positions in the UK.....I think we may be fucked guys.....
@user 25 March 1972: An IRA volunteer was killed when two IRA units engaged each other in error on the Springfield Road in Belfast.??
@user most if not all of the sept 11th terrorists were from saudi arabia. they are part of the problem.
@user the Charlie

## Aplicação em produção

In [None]:

# 1) Load the serving metadata; every key falls back to a default if absent.
with open("artifacts/text_vec_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

TH = float(meta.get("threshold", 0.30))
ENCODER_NAME = meta.get("encoder_name", "all-MiniLM-L6-v2")
NORMALIZE = bool(meta.get("normalize_embeddings", False))

# 2) Encoder (loaded by NAME, since no encoder folder was saved)
enc = SentenceTransformer(ENCODER_NAME)

# 3) Keras model (the .keras file sits in the working-directory root)
KERAS_PATH = "using_bert.keras"
assert os.path.isfile(KERAS_PATH), f"Modelo não encontrado em {KERAS_PATH}"

# compile=False: the model is loaded without compiling it.
serving_model = tf.keras.models.load_model(KERAS_PATH, compile=False)

# 4) Predição
def _as_list(texts):
    if isinstance(texts, str):
        return [texts]
    return [str(t) for t in texts]

def predict_offense(texts, threshold=TH, batch_size=256):
    """Score one text or a collection of texts with the embedding classifier.

    Args:
        texts: a single string or an iterable of items coerced to strings.
        threshold: probability at or above which a text is labelled 1.
        batch_size: batch size handed to the sentence encoder.

    Returns:
        A ``(preds, proba, texts)`` tuple: the 0/1 labels, the flattened
        model probabilities, and the list of texts that was scored.
    """
    batch = _as_list(texts)
    embeddings = enc.encode(
        batch,
        batch_size=batch_size,
        normalize_embeddings=NORMALIZE,
    )
    scores = serving_model.predict(embeddings, verbose=0).ravel()
    labels = (scores >= threshold).astype(int)
    return labels, scores, batch

# 5) Quick example: read one text from the user and print its predicted
# class and probability.
if __name__ == "__main__":
    text = input("Insira seu texto: ")
    preds, proba, texts = predict_offense(text)
    for t, p, pr in zip(texts, preds, proba):
        print(f"{t} -> classe={int(p)} proba={pr:.3f}")

Insira seu texto: Nazis are good
Nazis are good -> classe=0 proba=0.015


In [None]:
from keras_self_attention import SeqSelfAttention
from condenser import Condenser

# Load the end-to-end (raw string in, probability out) attention model.
# The two third-party layer classes are passed as custom_objects so Keras
# can resolve them when loading the saved model.
tox_e2e = tf.keras.models.load_model(
    "artifacts/model4_end2end.keras",
    compile=False,
    custom_objects={
        "SeqSelfAttention": SeqSelfAttention,
        "Condenser": Condenser,
    },
)

def predict_offense_text(text: str, threshold=0.15, batch_size=256):
    """Score a single raw text with the end-to-end attention model.

    Args:
        text: the raw string to classify.
        threshold: probability at or above which the text is labelled 1.
        batch_size: batch size forwarded to ``predict``.

    Returns:
        A ``(preds, proba, texts)`` tuple: an int32 label array, the
        flattened probability array, and ``[text]``.
    """
    # Always wrap the string in a one-element NumPy array before predicting.
    batch = np.array([text], dtype=object)
    raw_scores = tox_e2e.predict(batch, batch_size=batch_size, verbose=0)
    scores = np.ravel(raw_scores)
    cutoff = float(threshold)
    labels = (scores >= cutoff).astype("int32")
    return labels, scores, [text]

# Quick example: read one text from the user and print its predicted class
# and probability from the end-to-end model.
if __name__ == "__main__":
    text = input("Insira seu texto: ")
    preds, proba, texts = predict_offense_text(text)
    for t, p, pr in zip(texts, preds, proba):
        print(f"{t} -> classe={int(p)} proba={pr:.3f}")

Insira seu texto: Nazis are good
Nazis are good -> classe=1 proba=0.952


# Resultado final

A escolha será pelo modelo de atenção. Por mais que, estatisticamente falando, o modelo feed-forward com vetorização BERT (modelo MiniLM) apresente as melhores métricas, ele acaba recompensando muito a repetição de certos termos, mesmo quando o contexto é negativo. Por exemplo, o uso da palavra "good" para se referir a "Nazis".