Imports

In [9]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

Carregar e preparar Dataset

In [10]:
df = pd.read_csv('Files/anti-lgbt-cyberbullying.csv')
df = df.dropna()

#Embaralhamneto dos dados: Esse passo é necessário para diversificar a separação de treino e teste
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

comentarios = df['text'].values
labels = df['anti_lgbt'].astype(int).values

df

Unnamed: 0,id,text,anti_lgbt
0,12497,Yeah the Bible says gay sex is a sin... But al...,0
1,9614,Sounds like she found a way to ground him for ...,0
2,12056,I know they're genderswapped but I didn't expe...,0
3,13608,One of my friends is in a relationship like th...,0
4,5460,Fuck you smurf ass faggot. Go drink bleach.,1
...,...,...,...
4294,14537,She propped up on my desk kinda close. Innocuo...,0
4295,2941,"Tell him you're going to decimate him, decapit...",0
4296,3723,I like pride month personally bc I take a lot ...,0
4297,6189,"I was and still am bi, but I found I'm much mo...",0


Tokenização e sequenciamento

In [11]:
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(comentarios)

sequences = tokenizer.texts_to_sequences(comentarios)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

len(sequences)

4299

Seração do Dataset em treino e teste

In [12]:
#definição do tamanho de treino
train_len = 0.8

train_size = int(len(df) * train_len)

#criação dos datasets de treino
x_train = padded_sequences[:train_size]
y_train = labels[:train_size]

#criação dos datasets de teste
x_test = padded_sequences[train_size:]
y_test = labels[train_size:]



Construção do Modelo

In [13]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

Compilação do Modelo

In [14]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


Treinamento do Modelo

In [15]:
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test), batch_size=32)

Epoch 1/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 53ms/step - accuracy: 0.6758 - loss: 0.6341 - val_accuracy: 0.7372 - val_loss: 0.5783
Epoch 2/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 46ms/step - accuracy: 0.6908 - loss: 0.6205 - val_accuracy: 0.7372 - val_loss: 0.5720
Epoch 3/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.6997 - loss: 0.5955 - val_accuracy: 0.7372 - val_loss: 0.5165
Epoch 4/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.7125 - loss: 0.5338 - val_accuracy: 0.7744 - val_loss: 0.6303
Epoch 5/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.7573 - loss: 0.5547 - val_accuracy: 0.8035 - val_loss: 0.4945


<keras.src.callbacks.history.History at 0x1846877fe90>

Avaliação do Modelo

In [16]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7796 - loss: 0.5223
Loss: 0.49451541900634766, Accuracy: 0.8034883737564087
