In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
import pandas as pd
import numpy as np

In [2]:
# Read the labelled training spreadsheet (the path is relative to the working directory)
data = pd.read_excel('datasets/English_train_data.xlsx')

# Model inputs: the raw 'statement' text plus the 'toxic_level' column
# NOTE(review): confirm 'toxic_level' is not derived from 'toxic_label'; if it is, using it as a feature leaks the target
X = data[['statement', 'toxic_level']]

# Target: 'toxic_label' mapped to integer class ids (the fitted encoder is reused for the external test labels)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['toxic_label'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training statements only, then apply it unchanged to the hold-out statements
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['statement'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['statement'])

# Densify the TF-IDF matrices and append 'toxic_level' as the last column
# NOTE(review): .toarray() materialises a dense (rows x vocabulary) array; watch memory on larger corpora
X_train_combined = np.concatenate(
    [X_train_tfidf.toarray(), X_train['toxic_level'].values[:, np.newaxis]],
    axis=1,
)
X_test_combined = np.concatenate(
    [X_test_tfidf.toarray(), X_test['toxic_level'].values[:, np.newaxis]],
    axis=1,
)

# Fit the scaler on the training matrix only and reuse its statistics for the hold-out matrix
scaler = StandardScaler().fit(X_train_combined)
X_train_scaled = scaler.transform(X_train_combined)
X_test_scaled = scaler.transform(X_test_combined)

In [3]:
# Train a logistic regression classifier on the standardized training matrix.
# max_iter is set to 1000 (explicitly, alongside the 'lbfgs' solver) to give the optimizer more iterations to converge.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train_scaled, y_train)

# Predicted class ids for the current contents of X_test_scaled
y_pred = model.predict(X_test_scaled)


In [4]:
# Load the external test spreadsheet (a separate file from the training spreadsheet)
test_data = pd.read_excel('datasets/English_test_data.xlsx')  # path is relative to the working directory

In [5]:
# Featurise the external test set with the objects already fitted on the training data
# (TF-IDF vocabulary, scaler statistics and label mapping are reused, not refitted).
# NOTE: this rebinds X_test_tfidf, X_test_combined, X_test_scaled and y_test, which until now held the hold-out split.

# TF-IDF features for the test statements
X_test_tfidf = tfidf_vectorizer.transform(test_data['statement'])

# Dense TF-IDF matrix with 'toxic_level' appended as the last column (same layout as in training)
X_test_combined = np.concatenate(
    [X_test_tfidf.toarray(), test_data['toxic_level'].values[:, np.newaxis]],
    axis=1,
)

# Apply the training-set scaling statistics
X_test_scaled = scaler.transform(X_test_combined)

# Encode the test labels with the mapping learned from the training labels
y_test = label_encoder.transform(test_data['toxic_label'])


In [6]:
# Predict class ids for the scaled test features with the fitted logistic regression model
y_test_pred = model.predict(X_test_scaled)

In [7]:
# Score the test predictions against the encoded test labels.
# precision_score / recall_score are called with their defaults, so they describe the class encoded as 1.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)

# Per-class breakdown, labelled with the original class names from the encoder
test_class_report = classification_report(
    y_test,
    y_test_pred,
    target_names=label_encoder.classes_,
)

# Headline numbers (two decimals, one per line), followed by the full report
print(
    f"Test Accuracy: {test_accuracy:.2f}",
    f"Test Precision: {test_precision:.2f}",
    f"Test Recall: {test_recall:.2f}",
    sep="\n",
)
print("Test Classification Report:\n", test_class_report)


Test Accuracy: 0.76
Test Precision: 0.81
Test Recall: 0.73
Test Classification Report:
               precision    recall  f1-score   support

   non-toxic       0.70      0.79      0.74       443
       toxic       0.81      0.73      0.77       558

    accuracy                           0.76      1001
   macro avg       0.76      0.76      0.76      1001
weighted avg       0.76      0.76      0.76      1001

