In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, SVR
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the training spreadsheet. The cells below read its 'statement',
# 'toxic_label' and 'toxic_level' columns.
train_data = pd.read_excel('datasets/English_train_data.xlsx')

# Oversampling mode

In [3]:
# Build the training set by oversampling the toxic class.
#
# Both the bootstrap draw and the shuffle are seeded so that
# Restart Kernel -> Run All yields the same training set, the same fitted
# model and therefore the same reported metrics on every run.
RANDOM_SEED = 42

# Split data into toxic and non-toxic
toxic_data = train_data[train_data['toxic_label'] == 1]
non_toxic_data = train_data[train_data['toxic_label'] == 0]

# Determine oversampling ratio
oversampling_ratio = 5  # 5x toxic samples

# Calculate number of toxic samples to draw
num_toxic_samples = len(toxic_data) * oversampling_ratio

# Randomly oversample toxic rows (with replacement, so rows repeat)
oversampled_toxic_data = toxic_data.sample(
    n=num_toxic_samples,
    replace=True,
    random_state=RANDOM_SEED,
)

# Concatenate oversampled toxic data with the untouched non-toxic data
oversampled_data = pd.concat([non_toxic_data, oversampled_toxic_data])

# Shuffle so the two classes are interleaved
oversampled_data = shuffle(oversampled_data, random_state=RANDOM_SEED)

# Extract features and labels
train_x = oversampled_data['statement']
train_toxicity_level = oversampled_data['toxic_level']
train_toxicity_label = oversampled_data['toxic_label']

# Short aliases consumed by the training cell
X = train_x
y = train_toxicity_label
y2 = train_toxicity_level

In [4]:
# Vectorise the training statements: fit a TF-IDF vocabulary capped at
# 2000 terms on X and transform X with it in one pass.
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X)

# Linear-kernel support vector classifier, fitted on the TF-IDF matrix
# against the binary toxic labels in y.
svm_model = SVC(C=10, kernel='linear')
svm_model.fit(X_train_tfidf, y)

# Model Testing

In [5]:
# Evaluate the trained SVM on the balanced English test spreadsheet.
file_path = 'datasets/English_test_balanced_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target.
# NOTE(review): X and y are rebound from the training data to the test data
# here, so re-running the training cell after this one would fit on the test
# set. Re-run the notebook top to bottom (Restart & Run All) instead.
X = data['statement']
y = data['toxic_label']

# Transform with the vectorizer fitted during training (no re-fitting)
X_test_tfidf = tfidf_vectorizer.transform(X)

# Hard 0/1 label predictions; these drive the metrics below
y_pred = svm_model.predict(X_test_tfidf)

# Thresholded predictions. The threshold is applied to the signed
# decision-function scores rather than to y_pred: y_pred only holds the
# labels 0 and 1, so `y_pred >= 0` would flag every sample as toxic.
# A score >= threshold maps to the toxic class (label 1).
threshold = 0
decision_scores = svm_model.decision_function(X_test_tfidf)
predicted_toxic = np.where(decision_scores >= threshold, 1, 0)
predicted_toxic = predicted_toxic.ravel()

# Calculate metrics for SVM (from the model's own label predictions)
svm_accuracy = accuracy_score(y, y_pred)
svm_precision = precision_score(y, y_pred)
svm_recall = recall_score(y, y_pred)

# Display SVM metrics
print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)

SVM Accuracy: 0.841688654353562
SVM Precision: 0.9574468085106383
SVM Recall: 0.7142857142857143


In [6]:
# Evaluate the trained SVM on the full English test spreadsheet.
file_path = 'datasets/English_test_data.xlsx'
data = pd.read_excel(file_path)

# Extract features and target.
# NOTE(review): X and y are rebound to this test set here, so re-running the
# training cell after this one would fit on test data. Re-run the notebook
# top to bottom (Restart & Run All) instead.
X = data['statement']
y = data['toxic_label']

# Transform with the vectorizer fitted during training (no re-fitting)
X_test_tfidf = tfidf_vectorizer.transform(X)

# Hard 0/1 label predictions; these drive the metrics below
y_pred = svm_model.predict(X_test_tfidf)

# Thresholded predictions. The threshold is applied to the signed
# decision-function scores rather than to y_pred: y_pred only holds the
# labels 0 and 1, so `y_pred >= 0` would flag every sample as toxic.
# A score >= threshold maps to the toxic class (label 1).
threshold = 0
decision_scores = svm_model.decision_function(X_test_tfidf)
predicted_toxic = np.where(decision_scores >= threshold, 1, 0)
predicted_toxic = predicted_toxic.ravel()

# Calculate metrics for SVM (from the model's own label predictions)
svm_accuracy = accuracy_score(y, y_pred)
svm_precision = precision_score(y, y_pred)
svm_recall = recall_score(y, y_pred)

# Display SVM metrics
print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)

SVM Accuracy: 0.916
SVM Precision: 0.8181818181818182
SVM Recall: 0.7142857142857143
