In [10]:
import nltk
import joblib
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
from nltk.corpus import stopwords
#Load the dataset
df = pd.read_csv('ru_toxic.csv')

#Data preprocessing
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['comment'] = df['comment'].apply(clean_text)

russian_stop_words = stopwords.words('russian')

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(stop_words=russian_stop_words, max_features=500)
X = vectorizer.fit_transform(df['comment'])

# Train-test split 20% test
X_train, X_test, y_train, y_test = train_test_split(X, df['toxic'], test_size=0.2, random_state=42)

# Model training using Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))

print("Classification Report on Training Data:\n", classification_report(y_train, y_pred_train))

print("Classification Report on Testing Data:\n", classification_report(y_test, y_pred_test))

# Save the model and vectorizer for later use
joblib.dump(model, 'russian_toxic_classifier_rf_model.pkl')
joblib.dump(vectorizer, 'russian_tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilyas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training Accuracy: 0.8796079451817157
Testing Accuracy: 0.7568505029483177
Classification Report on Training Data:
               precision    recall  f1-score   support

         0.0       0.94      0.87      0.91      7642
         1.0       0.78      0.89      0.83      3887

    accuracy                           0.88     11529
   macro avg       0.86      0.88      0.87     11529
weighted avg       0.89      0.88      0.88     11529

Classification Report on Testing Data:
               precision    recall  f1-score   support

         0.0       0.82      0.82      0.82      1944
         1.0       0.63      0.63      0.63       939

    accuracy                           0.76      2883
   macro avg       0.72      0.72      0.72      2883
weighted avg       0.76      0.76      0.76      2883

Model and vectorizer saved successfully.


In [12]:
print("Model and vectorizer saved successfully.")
import joblib

# Load the model and vectorizer
model = joblib.load('russian_toxic_classifier_rf_model.pkl')
vectorizer = joblib.load('russian_tfidf_vectorizer.pkl')


def predict_spam(email_text):
    email_features = vectorizer.transform([email_text])
    prediction = model.predict(email_features)
    if prediction[0] == 1:
        return "Toxic"
    else:
        return "Not Toxic"


input_text = """Ты хороший"""

result = predict_spam(input_text)
print(f"The email is: {result}")

Model and vectorizer saved successfully.
The email is: Toxic
