## Importing Dependencies and Reading Data

In [14]:
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import TextVectorization,Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv("data.csv") 
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [4]:
df.iloc[7]['comment_text']

"Your vandalism to the Matt Shirvington article has been reverted.  Please don't do it again, or you will be banned."

In [7]:
df[df['toxic']==1].tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159494,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
159514,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
159541,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
159546,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0
159554,ffbdbb0483ed0841,and i'm going to keep posting the stuff u dele...,1,0,1,0,1,0


## Pre-Processing the Data

In [8]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [9]:
x = df['comment_text']
y = df[df.columns[2::]].values

In [10]:
x

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [11]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [12]:
MAX_FEATURES = 200000 # number of words in the vocab

In [15]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=2000, output_mode='int')

In [32]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [33]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [34]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)


In [35]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(len(LABELS), activation="sigmoid"))



In [36]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train_pad, y_train, batch_size=128, epochs=2, validation_data=(X_val_pad, y_val))

Epoch 1/2
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m556s[0m 555ms/step - accuracy: 0.7561 - loss: 0.1398 - val_accuracy: 0.9939 - val_loss: 0.0536
Epoch 2/2
[1m 49/998[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:24[0m 405ms/step - accuracy: 0.9614 - loss: 0.0522

KeyboardInterrupt: 

In [18]:
model.save("toxicity_model.h5")
print("Model saved as toxicity_model.h5")



Model saved as toxicity_model.h5


In [19]:
def predict_comment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=MAX_LEN)
    pred = model.predict(pad)[0]
    labels = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
    result = {labels[i]: float(pred[i]) for i in range(len(labels))}
    return result

In [20]:
print("\n--- Custom Predictions ---")
examples = [
    "You are so stupid and ugly!",
    "I hope you have a wonderful day my friend.",
    "I will kill you."
]
for ex in examples:
    print(f"\nText: {ex}")
    print(predict_comment(ex))


--- Custom Predictions ---

Text: You are so stupid and ugly!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
{'toxic': 0.983506977558136, 'severe_toxic': 0.09915750473737717, 'obscene': 0.8129136562347412, 'threat': 0.04168225824832916, 'insult': 0.6993879079818726, 'identity_hate': 0.14983303844928741}

Text: I hope you have a wonderful day my friend.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
{'toxic': 0.005008913576602936, 'severe_toxic': 7.306159233166909e-08, 'obscene': 0.00016121988301165402, 'threat': 2.8233057491888758e-06, 'insult': 0.00017098865646403283, 'identity_hate': 2.4709754143259488e-05}

Text: I will kill you.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
{'toxic': 0.8999328017234802, 'severe_toxic': 0.055000390857458115, 'obscene': 0.5272425413131714, 'threat': 0.059062659740448, 'insult': 0.5225563049316406, 'identity_hate': 0.15435922145843506}
