In [2]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import tkinter as tk
from tkinter import messagebox
import nltk


# Kiswahili hate speech dataset
data = pd.DataFrame({
    'text': [
        "Nakuambia usiende mbali, wewe ni mchafu",
        "Habari yako, natumai uko salama",
        "Ukiwa huna haja ya kuendesha kama hiyo",
        "Leo ni siku nzuri kwa kila mtu",
        "Utakuwa duni zaidi ikiwa utaendelea hivyo",
        "Mimi naahidi kukusaidia kukamilisha kazi yako",
        "Wewe ni kipofu, hauna akili",
        "Tafadhali njoo kwenye mkutano kesho",
        "Umechafuka kabisa, hakuna muhimu yoyote",
        "Tunashukuru kwa msaada wako mkubwa",
        "Unapenda kuwa mpumbavu kila wakati",
        "Hakuna tatizo, tuko pamoja katika hili",
        "Wewe ni batili, usirudi tena",
        "Nitakuombea afya njema na furaha",
        "Sina imani na wewe, wewe ni dududu",
        "Tunapenda kushirikiana na wewe",
        "Wewe ni mpimbaji wa jamii yetu",
        "Asante kwa kuchukua muda kuzungumza nasi",
        "Hao ni watu wakuu wa utovu",
        "Hongera kwa mafanikio yako leo",
        "Wewe ni dhuluma, utakuwa duni sana",
        "Natumai una siku njema",
        "Umeonyesha hofu, usirudi tena",
        "Tuko pamoja katika hatua zote",
        "Wewe ni mfupi sana akili",
        "Nakupenda na ninathamini urafiki wetu",
        "Umevunjika moyo, usijaribu tena",
        "Tuna furaha kukuona leo",
        "Wewe ni kipumbavu, hakuna hatima",
        "Karibu sana, tunakutegemea",
        "Umejizoea kukosea, usirudi tena",
        "Tunathamini mchango wako kwa jamii",
        "Wewe ni mjinga, usifanye hivyo tena",
        "Hongera kwa mafanikio yako",
        "Utakuwa hatia ikiwa utaendelea",
        "Tunapenda msaada wako",
        "Wewe ni hatari kwa jamii yetu",
        "Asante kwa msaada wako",
        "Usifanya kazi hiyo, wewe ni mjinga",
        "Tunashukuru kwa kujitolea kwako",
        "Wewe ni mchafu, usiende mbele",
        "Tunathamini ushirikiano wako",
        "Wewe ni duni, hatia yako",
        "Asante kwa kukusaidia",
        "Wewe ni kipofu, usibale tena",
        "Tunapenda kushirikiana na wewe",
        "Wewe ni kipumbavu, usifanye hivyo",
        "Tunathamini mchango wako",
    ],
    'label': [
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
        "Hate",
        "Non-Hate",
    ]
})

# 2. Data Preprocessing

# Downloading NLTK stopwords 
nltk.download('stopwords')

from nltk.corpus import stopwords

def clean_text(text):
    """
    Perform basic text cleaning: lowercase, remove punctuation, and numbers.
    """
    text = text.lower()  # Converting to lowercase
    text = re.sub(r'\d+', '', text)  # Removing numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removing punctuation
    text = text.strip()  # Removing leading whitespace
    return text

def remove_stopwords_func(text):
    """
    Remove Kiswahili stopwords from the text.
    """
    # Define a comprehensive Kiswahili stopword list
    kiswahili_stopwords = set([
        'ni', 'na', 'kwa', 'ya', 'yao', 'yake', 'yako', 'yako', 'sawa',
        'hakuna', 'kama', 'kufanya', 'hivi', 'hivyo', 'kwenye', 'na', 'kwa',
        'tuko', 'tuna', 'tunapenda', 'tunathamini', 'hakuna', 'sana',
        'natumai', 'tunafuraha', 'nakupenda', 'na', 'mimi', 'wewe',
        'usirudi', 'uwe', 'unapenda', 'uweo', 'utakuwa', 'tuko',
        'tuchukulie', 'usijaribu', 'asi', 'alipo', 'mwenye', 'iliyokuwa',
        'aliko', 'yuko', 'tu', 'tuwe', 'tuja', 'tuko', 'twende', 'wa',
        'wa', 'wako', 'wetu', 'wake', 'wao', 'wala', 'wanangu', 'wake',
        'wao', 'wao', 'wa', 'wao', 'wa', 'tu', 'tu', 'wako'
    ])
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in kiswahili_stopwords]
    return ' '.join(filtered_tokens)

# Applying cleaning
data['cleaned_text'] = data['text'].apply(clean_text)

# Removing stopwords
data['cleaned_text'] = data['cleaned_text'].apply(remove_stopwords_func)

# 3. Feature Extraction with N-grams

X = data['cleaned_text']
y = data['label']

# Splitting the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initializing TF-IDF Vectorizer with bigrams
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Model Training with SVM and Hyperparameter Tuning

# Initializing the classifier
svm = SVC(kernel='linear', probability=True)

# Defining hyperparameters for Grid Search
parameters = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel types
    'gamma': ['scale', 'auto']  # Kernel coefficient
}

# Initializing GridSearchCV
grid_search = GridSearchCV(svm, parameters, cv=5, scoring='accuracy')

# Training the model with Grid Search
grid_search.fit(X_train_tfidf, y_train)

# Best estimator after Grid Search
best_svm = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Predicting on the test set
y_pred = best_svm.predict(X_test_tfidf)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))


# 5. GUI Interface

def classify_text():
    """
    Classify the input text and display the result.
    """
    input_text = entry.get()
    if not input_text.strip():
        messagebox.showwarning("Input Error", "Please enter some text.")
        return
    
    # Preprocessing the input text
    cleaned_input = clean_text(input_text)
    cleaned_input = remove_stopwords_func(cleaned_input)
    
    # Vectorizing the input text
    input_vector = vectorizer.transform([cleaned_input])
    
    # Predicting the label
    prediction = best_svm.predict(input_vector)[0]
    
    # Displaying the result
    if prediction == 'Hate':
        messagebox.showinfo("Result", "The text is identified as HATE SPEECH.")
    else:
        messagebox.showinfo("Result", "The text is identified as NON-HATE SPEECH.")

# Initializing Tkinter window
root = tk.Tk()
root.title("Kiswahili Hate Speech Detector")
root.geometry("500x200")
root.resizable(False, False)

# Label
label = tk.Label(root, text="Enter Kiswahili text:", font=("Helvetica", 14))
label.pack(pady=10)

# Text Entry (Single-line)
entry = tk.Entry(root, width=60, font=("Helvetica", 12))
entry.pack(pady=5)

# Classifying Button
button = tk.Button(root, text="Classify", command=classify_text, font=("Helvetica", 12), bg="#4CAF50", fg="white")
button.pack(pady=20)

# Running the application
root.mainloop()

# This was all done in a jupyter notebook!

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Model Accuracy: 90.00%

Classification Report:
              precision    recall  f1-score   support

        Hate       1.00      0.80      0.89         5
    Non-Hate       0.83      1.00      0.91         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10

