In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

  if not hasattr(np, "object"):


In [5]:
df = pd.read_csv("train_cleaned.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww match background colour im seemingly stuc...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really trying edit war guy constant...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggestion improvement wondered...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page thats


In [6]:
X = df["clean_comment"]

toxic_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate"
]

y = df[toxic_labels]

In [9]:
X.isnull().sum()

np.int64(46)

In [11]:
mask = X.str.strip().astype(bool)
X = X[mask]
y = y.loc[X.index]

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2)
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [16]:
baseline_model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000)
)

baseline_model.fit(X_train_tfidf, y_train)

In [17]:
y_pred_baseline = baseline_model.predict(X_val_tfidf)

print(classification_report(y_val, y_pred_baseline, target_names=toxic_labels))

               precision    recall  f1-score   support

        toxic       0.91      0.62      0.74      3068
 severe_toxic       0.50      0.18      0.27       316
      obscene       0.92      0.63      0.75      1722
       threat       0.29      0.07      0.11        91
       insult       0.82      0.52      0.64      1604
identity_hate       0.75      0.20      0.31       300

    micro avg       0.87      0.55      0.68      7101
    macro avg       0.70      0.37      0.47      7101
 weighted avg       0.86      0.55      0.67      7101
  samples avg       0.06      0.05      0.05      7101



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
f1_micro = f1_score(y_val, y_pred_baseline, average="micro")
f1_macro = f1_score(y_val, y_pred_baseline, average="macro")

print("Baseline F1-micro:", f1_micro)
print("Baseline F1-macro:", f1_macro)

Baseline F1-micro: 0.6785560437666925
Baseline F1-macro: 0.4676627116232887


In [19]:
max_words = 20000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

In [21]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(6, activation="sigmoid")
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [22]:
history = model.fit(
    X_train_pad,
    y_train,
    epochs=3,
    batch_size=128,
    validation_data=(X_val_pad, y_val)
)

Epoch 1/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 137ms/step - accuracy: 0.7416 - loss: 0.0888 - val_accuracy: 0.9941 - val_loss: 0.0518
Epoch 2/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 139ms/step - accuracy: 0.9267 - loss: 0.0502 - val_accuracy: 0.9940 - val_loss: 0.0494
Epoch 3/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 136ms/step - accuracy: 0.9247 - loss: 0.0440 - val_accuracy: 0.9939 - val_loss: 0.0495


In [23]:
y_pred_dl = model.predict(X_val_pad)
y_pred_dl = (y_pred_dl > 0.5).astype(int)

print(classification_report(y_val, y_pred_dl, target_names=toxic_labels))

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step
               precision    recall  f1-score   support

        toxic       0.84      0.76      0.80      3068
 severe_toxic       0.45      0.34      0.39       316
      obscene       0.82      0.82      0.82      1722
       threat       0.00      0.00      0.00        91
       insult       0.71      0.69      0.70      1604
identity_hate       0.47      0.02      0.04       300

    micro avg       0.78      0.70      0.74      7101
    macro avg       0.55      0.44      0.46      7101
 weighted avg       0.76      0.70      0.72      7101
  samples avg       0.07      0.06      0.06      7101



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
f1_micro_dl = f1_score(y_val, y_pred_dl, average="micro")
f1_macro_dl = f1_score(y_val, y_pred_dl, average="macro")

print("LSTM F1-micro:", f1_micro_dl)
print("LSTM F1-macro:", f1_macro_dl)


LSTM F1-micro: 0.7377611940298507
LSTM F1-macro: 0.4567795490471145
