In [1]:
"""
Tamil Abusive Text Detection
Model: TF-IDF (Char) + SentenceBERT + RF + ExtraTrees
Optimized for CPU
"""

import pandas as pd
import numpy as np
import re
import os
import warnings

warnings.filterwarnings("ignore")

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack

def clean_tamil_text(text):
    """Normalise a raw comment for feature extraction.

    Non-string input (for example NaN or None) yields an empty string.
    Otherwise the text is lower-cased and then stripped, in this order, of:

    1. URL-like tokens beginning with ``http`` or ``www``,
    2. ``@mention`` tokens and bare ``#`` characters,
    3. every character that is not in the Tamil Unicode block
       (U+0B80-U+0BFF), an ASCII letter, an ASCII digit or whitespace.

    Finally, runs of whitespace are collapsed to a single space and the
    result is trimmed at both ends.
    """
    if isinstance(text, str):
        cleaned = text.lower()
        # Order matters: URLs and mentions must go before the punctuation
        # filter, which would otherwise break them into ordinary words.
        removal_patterns = (
            r"http\S+|www\S+|https\S+",
            r"@\w+|#",
            r"[^\u0B80-\u0BFFa-zA-Z0-9\s]",
        )
        for pattern in removal_patterns:
            cleaned = re.sub(pattern, "", cleaned)
        collapsed = re.sub(r"\s+", " ", cleaned)
        return collapsed.strip()
    return ""

# Location of the labelled training CSV (machine-specific absolute path).
DATASET_PATH = r"C:\Users\roahi\OneDrive\Desktop\train2.csv"

# Only the raw text and its label are read; rows missing either are dropped.
df = pd.read_csv(DATASET_PATH, usecols=["Text", "Class"])
df = df.dropna()

# Clean every comment once and keep the result alongside the raw text.
texts = [clean_tamil_text(raw_text) for raw_text in df["Text"]]
df["clean_text"] = texts

labels = df["Class"].astype(np.int8).values


# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_options = {
    "test_size": 0.2,
    "stratify": labels,
    "random_state": 42,
}
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, **split_options
)

# Character-level TF-IDF over 3- to 5-grams; n-grams seen in fewer than
# five training documents are ignored and the vocabulary is capped at 50,000.
tfidf_settings = {
    "analyzer": "char",
    "ngram_range": (3, 5),
    "min_df": 5,
    "max_features": 50000,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_tfidf = tfidf.fit_transform(X_train_texts)
X_test_tfidf = tfidf.transform(X_test_texts)
# hashlib is imported next to its only use so this section is self-contained.
import hashlib

EMB_DIR = "cached_embeddings"
SBERT_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
os.makedirs(EMB_DIR, exist_ok=True)

train_emb_path = os.path.join(EMB_DIR, "X_train_sbert.npy")
test_emb_path = os.path.join(EMB_DIR, "X_test_sbert.npy")


def _embedding_fingerprint(split_texts):
    """Return a SHA-256 hex digest identifying the model and the exact texts.

    The digest covers the encoder name and every text in order, so it changes
    when the dataset, the cleaning step, the split (seed or size) or the
    model changes.
    """
    digest = hashlib.sha256(SBERT_MODEL_NAME.encode("utf-8"))
    for item in split_texts:
        # NUL separator keeps ["ab", "c"] distinct from ["a", "bc"].
        digest.update(b"\x00")
        digest.update(item.encode("utf-8", errors="replace"))
    return digest.hexdigest()


def _fingerprint_path(emb_path):
    """Return the sidecar file that stores the fingerprint for *emb_path*."""
    return emb_path + ".sha256"


def _cache_is_valid(emb_path, fingerprint):
    """Return True when *emb_path* exists and was written for *fingerprint*."""
    tag_path = _fingerprint_path(emb_path)
    if not (os.path.exists(emb_path) and os.path.exists(tag_path)):
        return False
    with open(tag_path, encoding="utf-8") as handle:
        return handle.read().strip() == fingerprint


def _save_embeddings(emb_path, embeddings, fingerprint):
    """Write *embeddings* to *emb_path* and record the matching fingerprint."""
    np.save(emb_path, embeddings)
    # The fingerprint is written last: an interrupted run leaves no valid tag,
    # so a half-written array is never trusted on the next run.
    with open(_fingerprint_path(emb_path), "w", encoding="utf-8") as handle:
        handle.write(fingerprint)


train_fingerprint = _embedding_fingerprint(X_train_texts)
test_fingerprint = _embedding_fingerprint(X_test_texts)

# The cache is reused only when BOTH splits match the current texts.
# Previously any existing .npy file was loaded, so changing the CSV, the
# cleaning function or the split seed silently paired stale embeddings with
# the wrong rows and labels. Cache files written before fingerprints existed
# have no sidecar and are therefore re-encoded once.
if _cache_is_valid(train_emb_path, train_fingerprint) and _cache_is_valid(
    test_emb_path, test_fingerprint
):
    print("[*] Loading cached SentenceBERT embeddings...")
    X_train_emb = np.load(train_emb_path)
    X_test_emb = np.load(test_emb_path)
else:
    print("[*] Encoding text using SentenceBERT...")
    encoder = SentenceTransformer(SBERT_MODEL_NAME, device="cpu")

    X_train_emb = encoder.encode(
        X_train_texts,
        batch_size=128,
        convert_to_numpy=True,
        show_progress_bar=True
    ).astype(np.float32)

    X_test_emb = encoder.encode(
        X_test_texts,
        batch_size=128,
        convert_to_numpy=True,
        show_progress_bar=True
    ).astype(np.float32)

    _save_embeddings(train_emb_path, X_train_emb, train_fingerprint)
    _save_embeddings(test_emb_path, X_test_emb, test_fingerprint)
# Place the sparse TF-IDF columns and the dense sentence embeddings side by
# side so each row carries both feature families.
X_train = hstack([X_train_tfidf, X_train_emb])
X_test = hstack([X_test_tfidf, X_test_emb])

# Random Forest and Extra Trees are configured identically, so the settings
# live in one place. Class 1 is weighted 1.5x relative to class 0.
shared_tree_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "class_weight": {0: 1, 1: 1.5},
    "n_jobs": -1,
    "random_state": 42,
}

rf = RandomForestClassifier(**shared_tree_params)
et = ExtraTreesClassifier(**shared_tree_params)

# Soft voting combines the two models through their predicted probabilities.
ensemble = VotingClassifier(
    estimators=[("rf", rf), ("et", et)],
    voting="soft",
    n_jobs=-1,
)
# Fit both tree ensembles on the combined feature matrix.
print("\n[*] Training ensemble model...")
ensemble.fit(X_train, y_train)


# Score the held-out split.
y_pred = ensemble.predict(X_test)

banner = "=" * 60
print("\n" + banner)
print("ENSEMBLE MODEL PERFORMANCE")
print(banner)

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy_pct:.2f}%\n")

# Display names are given in label order: 0 first, then 1.
class_names = ["Non-Abusive", "Abusive"]
print(classification_report(y_test, y_pred, target_names=class_names))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(banner)

[*] Loading cached SentenceBERT embeddings...

[*] Training ensemble model...

ENSEMBLE MODEL PERFORMANCE
Accuracy: 80.85%

              precision    recall  f1-score   support

 Non-Abusive       0.83      0.79      0.81      2673
     Abusive       0.79      0.83      0.81      2517

    accuracy                           0.81      5190
   macro avg       0.81      0.81      0.81      5190
weighted avg       0.81      0.81      0.81      5190

Confusion Matrix:
[[2119  554]
 [ 440 2077]]
