In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [27]:
# Load the dataset
df = pd.read_csv('data/youmultihatred.csv')

In [28]:
df.columns

Index(['Text', 'IsToxic', 'IsAbusive', 'IsProvocative', 'IsObscene',
       'IsHatespeech', 'IsRacist'],
      dtype='object')

In [29]:
import re

# Convert text to lowercase, remove unnecessary blank spaces, URLs, and special characters
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    return text

df['Text'] = df['Text'].apply(preprocess_text)

In [30]:

# Preprocess the data
texts = df['Text'].values
labels = df[['IsToxic', 'IsAbusive', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist']].values

# Tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Pad the sequences
data = pad_sequences(sequences, maxlen=100)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(6, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict the probabilities
predictions = model.predict(X_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
# Evaluate the model on the training data
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
print(f'Training Loss: {train_loss:.4f}')
print(f'Training Accuracy: {train_accuracy:.4f}')

# Evaluate the model on the testing data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Testing Loss: {test_loss:.4f}')
print(f'Testing Accuracy: {test_accuracy:.4f}')

overfitting_percentage = ((train_accuracy - test_accuracy) / train_accuracy) * 100

print(f'Overfitting Percentage: {overfitting_percentage:.2f}%')

# Check for overfitting
if train_accuracy > test_accuracy:
    print("The model is overfitting.")
else:
    print("The model is not overfitting.")

Training Loss: 0.2162
Training Accuracy: 0.8813
Testing Loss: 0.7724
Testing Accuracy: 0.9250
Overfitting Percentage: -4.96%
The model is not overfitting.


In [33]:
# Sample text
input_text = 'You freaking scumbag! I hate you.'

# Preprocess text
# Convert to sequence
sequence = tokenizer.texts_to_sequences([input_text])

# Pad sequence
padded_sequence = pad_sequences(sequence, maxlen=100)

# Predict
probabilities = model.predict(padded_sequence)

# Print results
labels = ['IsToxic', 'IsAbusive', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist']
print("Prediction probabilities:")
for label, prob in zip(labels, probabilities[0]):
    print(f"{label}: {prob:.2f}")


Prediction probabilities:
IsToxic: 0.98
IsAbusive: 0.90
IsProvocative: 0.11
IsObscene: 0.06
IsHatespeech: 0.12
IsRacist: 0.08


In [None]:
model.save('model_runtimes/text_classification_model.h5')

In [None]:
import tf2onnx
import onnx

# Convert the Keras model to ONNX format
onnx_model, _ = tf2onnx.convert.from_keras(model)

# Save the ONNX model to a file
onnx.save(onnx_model, 'model_runtimes/text_classification_model.onnx')

In [None]:
import pickle

# Save tokenizer
with open('model_runtimes/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)