In [1]:
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.calibration import CalibratedClassifierCV
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.utils import to_categorical
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

# Loading

In [2]:
# Read the labelled training spreadsheet (path is relative to the notebook's
# working directory) into a DataFrame.
train_data = pd.read_excel(io='datasets/properTrainingDataPost.xlsx')

## Oversampling Mode

In [3]:
# Rebalance the training set: toxic rows are drawn WITH replacement until the
# toxic class is `oversampling_ratio` times its original size, then appended to
# the untouched non-toxic rows and shuffled.
#
# Both random steps are seeded so that Restart & Run All reproduces the same
# training set (the K-fold split later in the notebook is seeded with 42 too).
sampling_seed = 42

# Split data into toxic and non-toxic.
# Rows whose 'toxic_label' is neither 0 nor 1 are dropped by these filters.
toxic_data = train_data[train_data['toxic_label'] == 1]
non_toxic_data = train_data[train_data['toxic_label'] == 0]

# Target size of the toxic class, expressed as a multiple of its original size
oversampling_ratio = 3  # x toxic samples
num_toxic_samples = len(toxic_data) * oversampling_ratio

# Randomly oversample toxic samples (with replacement, seeded)
oversampled_toxic_data = toxic_data.sample(
    n=num_toxic_samples,
    replace=True,
    random_state=sampling_seed,
)

# Concatenate oversampled toxic data with non-toxic data, then shuffle so the
# two classes are interleaved (seeded for reproducibility)
oversampled_data = pd.concat([non_toxic_data, oversampled_toxic_data])
oversampled_data = shuffle(oversampled_data, random_state=sampling_seed)

# NOTE(review): the duplicated toxic rows are created BEFORE any train /
# validation split, so copies of the same statement can land on both sides of
# a later split.

# Extract features and labels
train_x = oversampled_data['statement']
train_toxicity_label = oversampled_data['toxic_label']

X_train = train_x
y = train_toxicity_label

In [None]:
# Targets for the softmax-classifier variant of the CNN:
# integer-encode the raw labels, then expand them to one column per class.
label_encoder = LabelEncoder()
y = label_encoder.fit(y).transform(y)
y_train = to_categorical(y)

In [4]:
# Targets for the regression variant: integer-encode the labels and train on
# the encoded values directly (no one-hot expansion here).
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)
y_train = y

In [5]:
# Text -> fixed-length integer sequences.
# These two limits are reused by the model definition and by the test cells.
max_words = 2000            # vocabulary cap handed to Tokenizer(num_words=...)
max_sequence_length = 200   # every sequence is padded / truncated to this length

# Learn the word index from the training statements only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Map each statement to word ids, then bring all sequences to a common length
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(sequences=X_train_sequences, maxlen=max_sequence_length)

# Training Phase

## Defining KFold

In [6]:
# Five stratified, shuffled folds; the fixed seed keeps the split repeatable.
kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

# Collects one validation MSE per fold
mse_scores = list()

## KFOLD Training

In [7]:
# Cross-validated CNN -> SVR training.
#
# For every fold:
#   1. train a fresh CNN regressor on the fold's training split,
#   2. reuse that CNN without its output layer as a feature extractor,
#   3. fit a linear-kernel SVR on the extracted features,
#   4. score the SVR on the held-out split (mean squared error).
#
# After the loop, `model`, `cnn_features_model` and `svm_model` refer to the
# LAST fold's models only; those are the objects any later cell will see.
#
# NOTE(review): X_train_padded comes from the oversampled data (toxic rows
# duplicated with replacement), so a validation fold can contain copies of rows
# that are also in its training folds. Treat the reported MSE as optimistic.

# Hyperparameters do not change between folds, so define them once up front.
embedding_dim = 50  # Size of the word embeddings
batch_size = 64
epochs = 35

# Start from an empty list so re-running this cell does not blend a previous
# run's fold scores into the average printed below.
mse_scores = []

model = None
svm_model = None
for train_index, val_index in kfold.split(X_train_padded, y):
    X_train_fold, X_val_fold = X_train_padded[train_index], X_train_padded[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Build the CNN model (re-created per fold so no weights carry over)
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='linear'))  # Use linear activation for regression

    model.compile(optimizer='adam', loss='mean_squared_error')  # Use mean squared error for regression

    # Train the model
    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=0)

    # Extract features from the CNN model: every layer except the final Dense(1)
    cnn_features_model = Sequential(model.layers[:-1])  # last layer removal
    X_train_cnn_features = cnn_features_model.predict(X_train_fold)

    # Train the SVR model on the CNN features
    base_svm_model = SVR(kernel='linear', C=1.0)
    svm_model = base_svm_model
    svm_model.fit(X_train_cnn_features, y_train_fold)

    # Evaluate on the validation set
    X_val_cnn_features = cnn_features_model.predict(X_val_fold)
    y_val_pred = svm_model.predict(X_val_cnn_features)

    # Calculate mean squared error and append to the list
    mse = mean_squared_error(y_val_fold, y_val_pred)
    mse_scores.append(mse)

# Print the average mean squared error across all folds
print("Average Mean Squared Error:", np.mean(mse_scores))

Average Mean Squared Error: 0.030697352101213122


## CNN Training

In [None]:
# Classifier variant of the CNN: embedding -> 1D convolution -> global max
# pooling -> dense -> dropout -> softmax with one unit per encoded class.
embedding_dim = 50  # width of each word-embedding vector

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Categorical cross-entropy, so the targets passed to fit() must be one-hot rows
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Fit the CNN on the whole padded training set.
batch_size, epochs = 64, 20
model.fit(x=X_train_padded, y=y_train, epochs=epochs, batch_size=batch_size)

In [None]:
# Feature extractor: the trained network with its final layer sliced off, so
# predict() returns the activations that previously fed the output layer.
cnn_features_model = Sequential(layers=model.layers[:-1])
X_train_cnn_features = cnn_features_model.predict(x=X_train_padded)

## SVM Train

In [None]:
# Linear-kernel support vector regressor fitted on the CNN features.
# The targets are the class indices recovered from the one-hot rows of y_train.
svm_model = base_svm_model = SVR(C=1.0, kernel='linear')
svm_model.fit(X_train_cnn_features, y_train.argmax(axis=1))

# Testing

In [None]:
# Evaluate the in-memory CNN + SVM pair on the balanced English test set.

# Read the spreadsheet and pull out the text column and the label column
file_path = 'datasets/English_test_balanced_data.xlsx'
data = pd.read_excel(file_path)
X_test, y_test = data['statement'], data['toxic_label']

# Encode the text with the tokenizer fitted on the training data and pad to
# the training sequence length
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# CNN without its last layer acts as the feature extractor
cnn_features_model = Sequential(model.layers[:-1])
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# The SVM returns a continuous score; scores above the threshold count as
# class 1, everything else as class 0
threshhold = 0.5
y_pred = (svm_model.predict(X_test_cnn_features) > threshhold).astype(int)

# Classification metrics on the thresholded predictions
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

In [None]:
# Evaluate the in-memory CNN + SVM pair on the (unbalanced) English test set.

# Load the Excel file
file_path = 'datasets/English_test_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

# Tokenize and pad sequences using the same tokenizer used during training
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Extract features from the CNN model
cnn_features_model = Sequential(model.layers[:-1])  # Remove the last layer
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# Predict using the SVM model
y_pred = svm_model.predict(X_test_cnn_features)

# svm_model is an SVR, so y_pred is a continuous score. The classification
# metrics below reject continuous predictions, so binarise first with the same
# 0.5 cut-off the other evaluation cells use.
threshhold = 0.5
y_pred = (y_pred > threshhold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

# SAVING

In [None]:
# Persist the three inference artifacts under the "2_set" file names.

# Keras feature extractor (the CNN with its last layer already sliced off),
# written to an .h5 file
cnn_features_model.save(filepath='output_model/cnn_model2_set.h5')

# Fitted tokenizer, serialised with joblib
joblib.dump(value=tokenizer, filename='output_model/tokenizer2_set.pkl')

# Fitted SVM model, serialised with joblib
joblib.dump(value=svm_model, filename='output_model/svm_model2_set.pkl')

In [None]:
# Persist the three inference artifacts under the "proper" file names.

# Keras feature extractor (the CNN with its last layer already sliced off),
# written to an .h5 file
cnn_features_model.save(filepath='output_model/cnn_model_proper.h5')

# Fitted tokenizer, serialised with joblib
joblib.dump(value=tokenizer, filename='output_model/tokenizer_proper.pkl')

# Fitted SVM model, serialised with joblib
joblib.dump(value=svm_model, filename='output_model/svm_model_proper.pkl')

In [8]:
# Persist the three inference artifacts under the "proper_KFOLD" file names.

# Keras feature extractor (the CNN with its last layer already sliced off),
# written to an .h5 file
cnn_features_model.save(filepath='output_model/cnn_model_proper_KFOLD.h5')

# Fitted tokenizer, serialised with joblib
joblib.dump(value=tokenizer, filename='output_model/tokenizer_proper_KFOLD.pkl')

# Fitted SVM model, serialised with joblib
joblib.dump(value=svm_model, filename='output_model/svm_model_proper_KFOLD.pkl')



['output_model/svm_model_proper_KFOLD.pkl']

# Testing of Saved Model

In [9]:
# Round-trip check: reload the "proper_KFOLD" artifacts from disk and score
# them on the held-out test spreadsheet.

# Read the spreadsheet and pull out the text column and the label column
file_path = 'datasets/properTestDataPost.xlsx'
data = pd.read_excel(file_path)
X_test, y_test = data['statement'], data['toxic_label']

# Same limits as used when the training sequences were built
max_words = 2000            # vocabulary cap used for the tokenizer
max_sequence_length = 200   # padded / truncated sequence length

# Restore the fitted tokenizer and encode the test statements with it
tokenizer = joblib.load('output_model/tokenizer_proper_KFOLD.pkl')
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Restore the saved Keras model
loaded_cnn_model = load_model('output_model/cnn_model_proper_KFOLD.h5')

# NOTE(review): the save cell wrote `cnn_features_model`, which already lacks
# the output layer, so this slice removes one more trailing layer (presumably
# the Dropout layer, which is inactive during predict; confirm).
cnn_features_model = Sequential(loaded_cnn_model.layers[:-1])
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# Restore the fitted SVM model and score the extracted features
loaded_svm_model = joblib.load('output_model/svm_model_proper_KFOLD.pkl')

# Continuous scores above the threshold count as class 1, the rest as class 0
threshhold = 0.5
y_pred = (loaded_svm_model.predict(X_test_cnn_features) > threshhold).astype(int)

# Classification metrics on the thresholded predictions
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)

CNN-SVM Accuracy: 0.8901098901098901
CNN-SVM Precision: 0.832271762208068
CNN-SVM Recall: 0.7354596622889306


In [None]:
# Evaluate the "*_set" artifacts (tokenizer_set / cnn_model_set / svm_model_set)
# on the proper_Test_Data spreadsheet.
# NOTE(review): no save cell visible in this notebook writes files with these
# exact names; presumably they come from an earlier run. Confirm they exist.

# Load the Excel file
file_path = 'datasets/proper_Test_Data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target
X_test = data['statement']
y_test = data['toxic_label']

max_words = 2000  # Number of unique words to consider
max_sequence_length = 200  # Maximum length of a sequence

# Replaces the in-memory tokenizer with the one stored on disk
tokenizer = joblib.load('output_model/tokenizer_set.pkl')

# Tokenize and pad sequences using the tokenizer loaded above
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Load the CNN model
loaded_cnn_model = load_model('output_model/cnn_model_set.h5')

# Extract features from the loaded CNN model (all layers except the last one)
cnn_features_model = Sequential(loaded_cnn_model.layers[:-1])
X_test_cnn_features = cnn_features_model.predict(X_test_padded)

# Load the SVM model
loaded_svm_model = joblib.load('output_model/svm_model_set.pkl')
# NOTE(review): this sets `probability` on an already-fitted, unpickled
# estimator; the code below still only calls predict() and thresholds the
# result. Confirm whether this assignment has any effect or can be dropped.
loaded_svm_model.probability = True

# Predict using the SVM model
y_pred = loaded_svm_model.predict(X_test_cnn_features)

# Debug output: prints the full array of raw predictions before thresholding
print(y_pred)

# Binarise the raw predictions: values above 0.5 become 1, the rest 0
threshhold = 0.5
y_pred = (y_pred > threshhold).astype(int)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)

# Display SVM metrics
print("CNN-SVM Accuracy:", svm_accuracy)
print("CNN-SVM Precision:", svm_precision)
print("CNN-SVM Recall:", svm_recall)