<a href="https://colab.research.google.com/github/LeoTortega/Suicide_Sentiment_Analysis/blob/main/Suicide_Sentiment_Analysis_BERT_Optimized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the labelled messages used for both training and testing, discarding the
# `cleaned_text` column that ships with the CSV.
df = pd.read_csv(
    "hf://datasets/Ram07/Detection-for-Suicide/detection_final_cleaned.csv"
).drop(columns='cleaned_text')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,class,text
0,suicide,Ex Wife Threatening SuicideRecently I left my ...
1,non-suicide,Am I weird I don't get affected by compliments...
2,non-suicide,Finally 2020 is almost over... So I can never ...
3,suicide,i need helpjust help me im crying so hard
4,suicide,It ends tonight.I can’t do it anymore. \nI quit.


In [None]:
# Bibliotecas utilizadas para limpeza dos textos
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# NLTK resources needed by the cleaning step, fetched in this order:
#   punkt_tab - tokenizer data used to split text into words or sentences
#   stopwords - stop-word lists used to filter the dataset
#   wordnet   - data used for lemmatization
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stop words, kept in a set
stop_words = set(stopwords.words('english'))

# Lemmatizer instance shared by the cleaning function
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Text-cleaning helper
def limpar_texto(texto):
  """Normalize a raw message into a space-separated string of lemmatized tokens.

  Steps, in order: lowercase, delete every character that is neither a word
  character nor whitespace, delete digits, tokenize with ``word_tokenize``,
  drop the tokens found in ``stop_words`` and lemmatize the remaining ones.

  Args:
    texto: Raw message. Any value that is not a ``str`` (for example the
      ``NaN``/``None`` of a missing cell) is treated as an empty message.

  Returns:
    The cleaned text, or ``''`` when the input is not a string.
  """
  # A missing cell is not a string; calling .lower() on it would raise
  # AttributeError and abort the whole DataFrame.apply.
  if not isinstance(texto, str):
    return ''

  # Lowercase everything
  texto = texto.lower()

  # Delete special characters. They are removed, not replaced by a space, so
  # words separated only by punctuation are fused ("tonight.I" -> "tonighti").
  texto = re.sub(r'[^\w\s]', '', texto)

  # Delete digits
  texto = re.sub(r'\d+', '', texto)

  # Split into tokens
  tokens = word_tokenize(texto)

  # Drop stop words and lemmatize the remaining tokens
  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

  # Rebuild the cleaned text
  return ' '.join(tokens)

In [None]:
# Run the cleaning helper over every raw message and store the result alongside it
df['cleaned_text'] = df['text'].map(limpar_texto)

In [None]:
# Preview the first five rows, now including the `cleaned_text` column
df.head(n=5)

Unnamed: 0,class,text,cleaned_text
0,suicide,Ex Wife Threatening SuicideRecently I left my ...,ex wife threatening suiciderecently left wife ...
1,non-suicide,Am I weird I don't get affected by compliments...,weird dont get affected compliment coming some...
2,non-suicide,Finally 2020 is almost over... So I can never ...,finally almost never hear bad year ever swear ...
3,suicide,i need helpjust help me im crying so hard,need helpjust help im cry hard
4,suicide,It ends tonight.I can’t do it anymore. \nI quit.,end tonighti cant anymore quit


In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Model inputs (cleaned messages) and targets (class labels) as arrays
X, y = df['cleaned_text'].values, df['class'].values

In [None]:
# Turn the class labels into numeric codes; the fitted encoder is kept in `le`
# so the label-to-code mapping stays available.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Hold out 20% of the samples as the test split; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# BERT tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def tokenize_texts(texts, max_length=128):
    """Tokenize a batch of texts into fixed-width TensorFlow tensors.

    Every sequence is truncated or padded to exactly ``max_length`` tokens.
    Padding to the longest sequence of each call (``padding=True``) could give
    the train and test batches different widths, and widths that differ from
    the fixed-size Keras input layer they are fed to.

    Args:
        texts: Array-like of strings (anything exposing ``.tolist()``) or a
            plain iterable of strings.
        max_length: Target sequence length in tokens; the default matches the
            128-wide model input used in this notebook.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with
        ``return_tensors='tf'``.
    """
    batch = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    return tokenizer(
        batch,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

In [None]:
# Tokenize the training split first, then the test split
train_encodings, test_encodings = map(tokenize_texts, (X_train, X_test))

In [None]:
# Load the pretrained 'bert-base-uncased' weights into a two-label
# sequence-classification model
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Model input: one row of 128 token ids per sample.
# The shape must be a tuple: `(128)` is just the integer 128, `(128,)` is the
# one-element tuple Keras expects.
input_ids = tf.keras.layers.Input(shape=(128,), dtype='int32', name="input_ids")

In [None]:
# BERT layer (pooler output).
# Build the attention mask from the token ids (1 for real tokens, 0 for padding).
# Without a mask BERT attends to the padding positions as if they were text.
attention_mask = tf.cast(
    tf.not_equal(input_ids, tokenizer.pad_token_id),
    tf.int32,
)
# Index 1 of the BERT outputs is the pooled representation.
bert_output = model.bert(input_ids, attention_mask=attention_mask)[1]

In [None]:
# Fully connected ReLU layer with 128 units on top of the BERT output;
# its kernel carries an L2 penalty with factor 0.01.
dense = tf.keras.layers.Dense(
    units=128,
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
    activation='relu',
)(bert_output)

In [None]:
# Dropout with rate 0.4 applied to the dense activations
dropout = tf.keras.layers.Dropout(rate=0.4)(dense)

In [None]:
# Binary-classification head: a single sigmoid unit
output = tf.keras.layers.Dense(
    units=1,
    activation='sigmoid',
)(dropout)


In [None]:
# Assemble the final network (BERT -> dense with L2 -> dropout -> sigmoid).
# This rebinds `model`, which until now held the Hugging Face model.
model = tf.keras.Model(inputs=input_ids, outputs=output)

In [None]:
# Configure training: binary cross-entropy loss, Adam at a learning rate of 2e-5,
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
)

In [None]:
# # Early Stopping callback definition (left commented out, so it is never created)
# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss',  # Watch the validation loss
#     patience=1,          # Epochs without improvement tolerated before stopping
#     restore_best_weights=True  # Restore the best weights once training stops
# )

In [None]:
# Train for two epochs in mini-batches of 16; the test split doubles as the
# validation data. No callbacks are attached (early stopping stays disabled).
history = model.fit(
    x=train_encodings['input_ids'],
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(test_encodings['input_ids'], y_test),
)

Epoch 1/2
Epoch 2/2
1757/8722 [=====>........................] - ETA: 49:46 - loss: 0.1226 - accuracy: 0.9840

In [None]:
# Helper that draws the loss and accuracy curves
def plot_loss_accuracy(history):
    """Plot training vs. validation loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the keys
            ``'loss'``, ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``,
            each holding one value per epoch.
    """
    # (metric key, y-axis label, panel title) for the left and right panels
    panels = (
        ('loss', 'Perda', 'Perda durante o Treinamento e Validação'),
        ('accuracy', 'Acurácia', 'Acurácia durante o Treinamento e Validação'),
    )

    plt.figure(figsize=(14, 5))

    for position, (metric, y_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Treino ({metric})')
        plt.plot(history.history[f'val_{metric}'], label=f'Validação ({metric})')
        plt.title(title)
        plt.xlabel('Épocas')
        plt.ylabel(y_label)
        plt.legend()

    # Render both panels
    plt.show()

In [None]:
# Draw the loss and accuracy curves recorded during training
plot_loss_accuracy(history=history)