In [2]:
import pandas as pd

# Adjust the filename to match the actual CSV
csv_path = r"D:\work\june 2025\hate spech detection\sinhala-hate-speech-detect-machine-lerning\sinhala-hate-speech-dataset.csv"

# Load the CSV
df = pd.read_csv(csv_path)

# Preview
print(df.head())
print(df.columns)


   id                                           comment  label
0   1                   `‡∑Ñ‡∑î‡∂Ø‡∑ô‡∂ö‡∂Ω‡∑è¬¥ ‡∂ë‡∂ö‡∑ö `‡∂Ø¬¥ ‡∂á‡∑Ñ‡∑ô‡∂±‡∑Ä‡∂Ø ‡∂ö‡∑è‡∂ß‡∑Ñ‡∂ª‡∑í      0
1   2                             18‡∂ö ‡∂¥‡∑ô‡∂±‡∑î‡∂∏‡∂ö‡∑ä ‡∂≠‡∑í‡∂∫‡∑ô‡∂±‡∑ä‡∂±‡∑ö       0
2   3  2020 ‡∂ë‡∂±‡∑Ä‡∑è ‡∂ö‡∑í‡∑Ä‡∑ä‡∂±‡∑è‡∂ß ‡∂∏‡∑ô‡∑Ñ‡∑ô‡∂∏ ‡∂ë‡∂∫‡∑í ‡∂ö‡∑í‡∂∫‡∂Ω‡∑è ‡∑Ñ‡∑í‡∂≠‡∑î‡∑Ä‡∑ö ‡∂±‡∑ë ‡∂¥‡∂ß‡∑ä‡∂ß      0
3   4                                 25 ‡∂ö‡∑ê‡∂Ω‡∑ä‡∂Ω..‡∑Ñ‡∂∏‡∑ä‡∂∏‡∑ù..      0
4   5                 25 ‡∂ö‡∑ô‡∂Ω‡∑ä‡∂Ω ‡∂Ö‡∂∫‡∑í‡∂∫‡∂Ω‡∂ú‡∑ô ‡∂ö‡∑è‡∂Ω‡∂Ø ‡∂ö‡∑ú‡∑Ñ‡∑ô‡∂Ø ‡∂â‡∂±‡∑ä‡∂±‡∑ô      1
Index(['id', 'comment', 'label'], dtype='object')


In [6]:
import nltk
import re
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Sinhala stopwords (expand this list as needed)
sinhala_stopwords = set([
    '‡∂î‡∑Ñ‡∑î', '‡∂á‡∂∫', '‡∂î‡∂∂', '‡∂Ö‡∂¥‡∑í', '‡∂∏‡∂∏', '‡∂ë‡∂∫', '‡∂∏‡∑ô‡∂∫', '‡∂í‡∂ö', '‡∂Ø‡∑ô‡∑Ä‡∑í‡∂∫‡∂±‡∑ä', '‡∂±‡∑ê‡∑Ñ‡∑ê', '‡∂î‡∑Ä‡∑ä'
])

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\u0D80-\u0DFF\s]', '', text)  # Keep only Sinhala characters and spaces
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in sinhala_stopwords]
    return " ".join(tokens)


df['clean_text'] = df['comment'].apply(preprocess_text)


[nltk_data] Downloading package punkt to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])  # e.g., 'hate' -> 1, 'normal' -> 0
print(le.classes_)  # See mapping


[0 1]


In [14]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


LogisticRegression()

In [19]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test_tfidf)

print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
print("üîç Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_.astype(str)))


‚úÖ Accuracy: 0.7588652482269503
üîç Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.71      0.73       578
           1       0.77      0.80      0.78       691

    accuracy                           0.76      1269
   macro avg       0.76      0.76      0.76      1269
weighted avg       0.76      0.76      0.76      1269



In [20]:
import joblib

joblib.dump(model, 'sinhala_hate_model.pkl')
joblib.dump(vectorizer, 'sinhala_vectorizer.pkl')
joblib.dump(le, 'sinhala_label_encoder.pkl')


['sinhala_label_encoder.pkl']