In [1]:
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.calibration import CalibratedClassifierCV
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.utils import to_categorical
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler

# Loading

In [15]:
# Read the labelled training sheet into a DataFrame; later cells use its
# 'statement' and 'toxic_label' columns.
training_file = 'datasets/proper_Training_Data.xlsx'
train_data = pd.read_excel(training_file)

## Oversampling Mode

In [16]:
# Rebalance the training set by oversampling the toxic class, then expose the
# text column (X_train) and label column (y) used by the following cells.

# Fixed seed so the resampled and shuffled training set is identical on every
# Restart & Run All (the original cell drew a different sample each run).
RANDOM_SEED = 42

# Split data into toxic and non-toxic
toxic_data = train_data[train_data['toxic_label'] == 1]
non_toxic_data = train_data[train_data['toxic_label'] == 0]

# Determine oversampling ratio
oversampling_ratio = 3  # x toxic samples

# Calculate number of toxic samples to oversample
num_toxic_samples = len(toxic_data) * oversampling_ratio

# Randomly oversample toxic samples. Rows are drawn with replacement, so this
# is a random draw of the toxic rows, not each row repeated exactly
# `oversampling_ratio` times.
oversampled_toxic_data = toxic_data.sample(
    n=num_toxic_samples,
    replace=True,
    random_state=RANDOM_SEED,
)

# Concatenate oversampled toxic data with non-toxic data
oversampled_data = pd.concat([non_toxic_data, oversampled_toxic_data])

# Shuffle the oversampled data so the two classes are interleaved
oversampled_data = shuffle(oversampled_data, random_state=RANDOM_SEED)

# Extract features and labels
train_x = oversampled_data['statement']
train_toxicity_label = oversampled_data['toxic_label']

X_train = train_x
y = train_toxicity_label

In [17]:
# One-hot encode the class labels for the CNN's softmax output.
# The fitted encoder is kept around: the model-building cell sizes its output
# layer from label_encoder.classes_.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)  # integer class ids
y_train = to_categorical(y)  # one-hot matrix fed to model.fit

In [18]:
# Turn the raw statements into fixed-length integer sequences for the CNN.
max_words = 2000  # vocabulary cap passed to the Tokenizer as num_words
max_sequence_length = 200  # length passed to pad_sequences as maxlen

# The tokenizer is fitted on the training text only; the test cells reuse it.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=max_sequence_length,
)

# Training Phase

## CNN Training

In [19]:
# Define the text CNN: embedding -> 1D convolution -> global max pooling ->
# dense hidden layer with dropout -> softmax over the encoded classes.
embedding_dim = 50  # Size of the word embeddings
num_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    # This 128-unit layer (followed by dropout) is what the later cells use as
    # the feature vector for the SVM, by dropping only the final softmax layer.
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 50)           100000    
                                                                 
 conv1d_1 (Conv1D)           (None, 196, 128)          32128     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 2)                 258       
                                                      

In [20]:
# Fit the CNN on the padded training sequences and one-hot labels.
# No validation data is passed, so the History only tracks training metrics.
batch_size = 64
epochs = 20
model.fit(
    X_train_padded,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1f8d17f04f0>

In [24]:
# Reuse the trained layers, minus the final softmax layer, as a feature
# extractor, and compute the feature matrix the SVM is trained on.
feature_layers = model.layers[:-1]  # everything except the last layer
cnn_features_model = Sequential(feature_layers)
X_train_cnn_features = cnn_features_model.predict(X_train_padded)



## SVM Training

In [25]:
# Fit a linear support-vector *regressor* on the CNN features.
# The targets are the integer class ids recovered from the one-hot labels, so
# predict() yields a continuous score that must be thresholded to get a class.
svm_targets = np.argmax(y_train, axis=1)

base_svm_model = SVR(C=1.0, kernel='linear')
svm_model = base_svm_model  # same object; both names are kept
svm_model.fit(X_train_cnn_features, svm_targets)

# Testing

In [None]:
# Evaluate the in-memory CNN + SVM pipeline on English_test_balanced_data.xlsx.

# Load the Excel file
file_path = 'datasets/English_test_balanced_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

# Tokenize and pad sequences using the same tokenizer used during training
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Extract features from the CNN model
cnn_features_model = Sequential(model.layers[:-1])  # Remove the last layer
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# svm_model is an SVR, so predict() returns a continuous score per statement,
# not a class label. The classification metrics below need hard 0/1 labels
# (they reject a mix of binary and continuous targets), so threshold first.
svm_scores = svm_model.predict(X_test_cnn_features)
threshold = 0.5
y_pred = (svm_scores > threshold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

In [None]:
# Evaluate the in-memory CNN + SVM pipeline on English_test_data.xlsx.

# Load the Excel file
file_path = 'datasets/English_test_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

# Tokenize and pad sequences using the same tokenizer used during training
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Extract features from the CNN model
cnn_features_model = Sequential(model.layers[:-1])  # Remove the last layer
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# svm_model is an SVR, so predict() returns a continuous score per statement,
# not a class label. The classification metrics below need hard 0/1 labels
# (they reject a mix of binary and continuous targets), so threshold first.
svm_scores = svm_model.predict(X_test_cnn_features)
threshold = 0.5
y_pred = (svm_scores > threshold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

# Saving

In [23]:
# Persist the three artefacts needed at inference time (the "2_set" variant).
# Note: the Keras object written here is cnn_features_model, i.e. the CNN with
# its final layer already removed, not the full `model`.
cnn_model_path = 'output_model/cnn_model2_set.h5'
tokenizer_path = 'output_model/tokenizer2_set.pkl'
svm_model_path = 'output_model/svm_model2_set.pkl'

cnn_features_model.save(cnn_model_path)  # CNN feature extractor
joblib.dump(tokenizer, tokenizer_path)  # fitted tokenizer
joblib.dump(svm_model, svm_model_path)  # fitted SVR



['output_model/svm_model2_set.pkl']

In [10]:
# Persist the three artefacts needed at inference time (the "proper" variant).
# Note: the Keras object written here is cnn_features_model, i.e. the CNN with
# its final layer already removed, not the full `model`.
cnn_model_path = 'output_model/cnn_model_proper.h5'
tokenizer_path = 'output_model/tokenizer_proper.pkl'
svm_model_path = 'output_model/svm_model_proper.pkl'

cnn_features_model.save(cnn_model_path)  # CNN feature extractor
joblib.dump(tokenizer, tokenizer_path)  # fitted tokenizer
joblib.dump(svm_model, svm_model_path)  # fitted SVR



['output_model/svm_model_proper.pkl']

# Testing of Saved Model

In [None]:
# Evaluate the saved "_set" artefacts (tokenizer, CNN, SVM) on
# English_test_data.xlsx, loading everything from disk.
# NOTE(review): the "_set" file names loaded below are not written by any save
# cell visible in this notebook (those write "2_set" and "_proper") - confirm
# the files exist and are the intended version.

# Load the Excel file
file_path = 'datasets/English_test_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

max_words = 2000  # Number of unique words to consider
max_sequence_length = 200  # Maximum length of a sequence

tokenizer = joblib.load('output_model/tokenizer_set.pkl')

# Tokenize and pad sequences using the same tokenizer used during training
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Load the CNN model
loaded_cnn_model = load_model('output_model/cnn_model_set.h5')

# Extract features from the loaded CNN model (drops its last layer)
cnn_features_model = Sequential(loaded_cnn_model.layers[:-1])
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# Load the SVM model
loaded_svm_model = joblib.load('output_model/svm_model_set.pkl')

# The SVM trained in this notebook is an SVR, whose predict() returns a
# continuous score rather than a class label. The classification metrics
# below need hard 0/1 labels, so threshold the scores first.
svm_scores = loaded_svm_model.predict(X_test_cnn_features)
threshold = 0.5
y_pred = (svm_scores > threshold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

In [3]:
# Evaluate the saved "_proper" artefacts (tokenizer, CNN, SVM) on
# proper_Test_Data.xlsx, loading everything from disk.

# Load the Excel file
file_path = 'datasets/proper_Test_Data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

max_words = 2000  # Number of unique words to consider
max_sequence_length = 200  # Maximum length of a sequence

tokenizer = joblib.load('output_model/tokenizer_proper.pkl')

# Tokenize and pad sequences using the same tokenizer used during training
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Load the CNN model
loaded_cnn_model = load_model('output_model/cnn_model_proper.h5')

# Extract features from the loaded CNN model (drops its last layer).
# NOTE(review): the save cell writes cnn_features_model, which already lacks
# the softmax layer, so this presumably strips only the trailing Dropout -
# confirm against the file on disk.
cnn_features_model = Sequential(loaded_cnn_model.layers[:-1])
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# Load the SVM model. It is an SVR, so predict() returns a continuous
# regression score; the former `probability = True` assignment did not turn
# that score into a probability and has been removed.
loaded_svm_model = joblib.load('output_model/svm_model_proper.pkl')

# Raw continuous scores from the regressor
svm_scores = loaded_svm_model.predict(X_test_cnn_features)

print(svm_scores)

# Scores strictly above the threshold are labelled toxic (1), the rest 0
threshold = 0.5
y_pred = (svm_scores > threshold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

[0.00247557 0.00799345 0.92676744 ... 0.98028461 0.94102751 0.96274065]
CNN-SVM Accuracy: 0.8806193806193806
CNN-SVM Precision: 0.8434579439252337
CNN-SVM Recall: 0.6772983114446529
