In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [84]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU
from tensorflow.keras.utils import plot_model



In [44]:
df = pd.read_csv('toxic-comments.csv')

In [45]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [46]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

## Define target variables (adapt based on your needs)

In [47]:
toxicities = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

In [48]:
def clean_txt(text):
    text = text.lower() # lowercase
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text) # Remove non-alphanumeric characters
    return text

In [49]:
df['comment_text'] = df["comment_text"].apply(clean_txt)

In [50]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


## Define input and output variables

In [51]:
# Feature and target preparation
comments = df['comment_text'].tolist()
targets = df[toxicities].values

In [52]:
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [53]:
targets.shape

(159571, 6)

## Prepare the data

In [54]:
# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=200)

In [55]:
padded_sequences.shape

(159571, 200)

## Cross Validate

In [56]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, 
                                                    targets, test_size=0.2, 
                                                    random_state=0)

In [57]:
X_train.shape

(127656, 200)

In [58]:
X_test.shape

(31915, 200)

# Build the model

In [60]:
# Model definition (customize architecture as needed)
model = Sequential()
model.add(Embedding(5000, 120, input_length=200))
model.add(GRU(64))
model.add(Dense(6, activation='sigmoid')) # Multi label output with sigmoid activation

In [61]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 120)          600000    
                                                                 
 gru_1 (GRU)                 (None, 64)                35712     
                                                                 
 dense_1 (Dense)             (None, 6)                 390       
                                                                 
Total params: 636102 (2.43 MB)
Trainable params: 636102 (2.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# plot_model(model, show_layer_activations=True, show_shapes=True, show_layer_names=True)

# Compile the model

In [69]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train the model

In [70]:
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x16d04d68ad0>

## Predict on new data

In [131]:
new_comment = "You look ugly af."

In [132]:
# Tokenization and padding
sequences = tokenizer.texts_to_sequences(clean_txt(new_comment))
padded_sequences = pad_sequences(sequences, maxlen=200)

In [133]:
y_pred=model.predict(padded_sequences)[0]



In [134]:
y_pred

array([0.09550145, 0.01180274, 0.11751567, 0.00179639, 0.03816461,
       0.00698591], dtype=float32)

In [135]:
for toxicity, prob in zip(toxicities, y_pred):
    print(f'{toxicity}: {prob:.2f}')

toxic: 0.10
severe_toxic: 0.01
obscene: 0.12
threat: 0.00
insult: 0.04
identity_hate: 0.01
