# Import Stage

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.utils import shuffle
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.utils import to_categorical

# Loading and Preprocessing

In [2]:
# Read the English training spreadsheet into a DataFrame.
# The path is relative to the notebook's working directory.
TRAIN_DATA_PATH = 'datasets/English_train_data.xlsx'
train_data = pd.read_excel(TRAIN_DATA_PATH)

## Oversampling mode

In [3]:
# Rebalance the training set by randomly oversampling the toxic class.
#
# Both random steps below (the draw with replacement and the final shuffle)
# are seeded, so a fresh "Restart Kernel -> Run All" rebuilds exactly the
# same training frame and the downstream metrics become comparable
# between runs.
RANDOM_SEED = 42

# Split data into toxic (label 1) and non-toxic (label 0) rows.
# Rows whose toxic_label is neither 0 nor 1 end up in neither subset.
toxic_data = train_data[train_data['toxic_label'] == 1]
non_toxic_data = train_data[train_data['toxic_label'] == 0]

# Determine oversampling ratio
oversampling_ratio = 5  # 5x toxic samples

# Calculate number of toxic samples to draw
num_toxic_samples = len(toxic_data) * oversampling_ratio

# Draw toxic rows with replacement. Because the draw is random, individual
# toxic rows may appear more or fewer than `oversampling_ratio` times.
oversampled_toxic_data = toxic_data.sample(
    n=num_toxic_samples,
    replace=True,
    random_state=RANDOM_SEED,
)

# Concatenate oversampled toxic data with the untouched non-toxic data
oversampled_data = pd.concat([non_toxic_data, oversampled_toxic_data])

# Shuffle so the two classes are interleaved; seeded for reproducibility
oversampled_data = shuffle(oversampled_data, random_state=RANDOM_SEED)

# Extract features and labels
train_x = oversampled_data['statement']
train_toxicity_level = oversampled_data['toxic_level']
train_toxicity_label = oversampled_data['toxic_label']

# Aliases used by the following cells
X_train = train_x
y = train_toxicity_label
y2 = train_toxicity_level

# Encodings

In [4]:
# Turn the toxic labels into integer class ids, then one-hot encode them
# so they match the softmax / categorical_crossentropy setup of the model.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)  # y is now an array of encoded class ids
y_train = to_categorical(y)

# Tokenization

In [5]:
# Vocabulary and sequence-length limits shared by training and evaluation
max_words = 1000  # Number of unique words to consider
max_sequence_length = 200  # Maximum length of a sequence

# Build the word index from the training statements only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Convert each statement to a list of word ids, then bring every sequence
# to exactly max_sequence_length entries (pad / truncate at the front,
# which is the Keras default spelled out explicitly).
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=max_sequence_length,
    padding='pre',
    truncating='pre',
)

# CNN Training

In [6]:
embedding_dim = 50  # Size of the word embeddings

# 1D-CNN text classifier: embedding -> convolution -> global max pool ->
# dense head with dropout -> softmax over the encoded label classes.
model = Sequential([
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One output unit per class seen by the label encoder
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# One-hot targets, hence categorical cross-entropy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 50)           50000     
                                                                 
 conv1d (Conv1D)             (None, 196, 128)          32128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                        

In [7]:
# Fit the CNN on the padded, oversampled training sequences.
# No validation data is passed, so only training metrics are reported.
batch_size = 64
epochs = 25
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x18434f5e910>

# Evaluation

In [13]:
# Evaluate the trained CNN on the test spreadsheet named "balanced".
file_path = 'datasets/English_test_balanced_data.xlsx'
data = pd.read_excel(file_path)

# Statements are the model input; toxic_label is the reference label
X_test, y_test = data['statement'], data['toxic_label']

# Encode with the tokenizer fitted on the training statements (not refitted)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Per-class softmax probabilities and the most likely class per row
y_pred_prob = model.predict(X_test_padded)
y_pred = y_pred_prob.argmax(axis=1)

# Flag a statement as toxic when the probability in column 1 reaches the
# threshold; the metrics below are computed from this thresholded prediction.
threshold = 0.5  # You can adjust this threshold if needed
predicted_toxic = (y_pred_prob[:, 1] >= threshold).astype(int)

# Accuracy, precision and recall of the thresholded predictions
cnn_accuracy, cnn_precision, cnn_recall = (
    metric(y_test, predicted_toxic)
    for metric in (accuracy_score, precision_score, recall_score)
)

# Display CNN metrics
for _caption, _score in (
    ("CNN Accuracy:", cnn_accuracy),
    ("CNN Precision:", cnn_precision),
    ("CNN Recall:", cnn_recall),
):
    print(_caption, _score)

CNN Accuracy: 0.8654353562005277
CNN Precision: 0.9928571428571429
CNN Recall: 0.7354497354497355


In [14]:
# Evaluate the trained CNN on the second English test spreadsheet.
file_path = 'datasets/English_test_data.xlsx'
data = pd.read_excel(file_path)

# Model input (statements) and reference labels
X_test = data['statement']
y_test = data['toxic_label']

# Same tokenizer and sequence length as in training, so word ids line up
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Softmax output per statement, plus the arg-max class for reference
y_pred_prob = model.predict(X_test_padded)
y_pred = y_pred_prob.argmax(axis=1)

# A statement counts as toxic when its column-1 probability is at least
# `threshold`; these thresholded labels feed the metrics below.
threshold = 0.5  # You can adjust this threshold if needed
toxic_probability = y_pred_prob[:, 1]
predicted_toxic = (toxic_probability >= threshold).astype(int)

# Calculate metrics for CNN
cnn_accuracy = accuracy_score(y_test, predicted_toxic)
cnn_precision = precision_score(y_test, predicted_toxic)
cnn_recall = recall_score(y_test, predicted_toxic)

# Display CNN metrics
for _caption, _score in (
    ("CNN Accuracy:", cnn_accuracy),
    ("CNN Precision:", cnn_precision),
    ("CNN Recall:", cnn_recall),
):
    print(_caption, _score)

CNN Accuracy: 0.934
CNN Precision: 0.896774193548387
CNN Recall: 0.7354497354497355
