Importing Libraries

In [1]:
import keras
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from termcolor import colored
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

Read data CSV

In [2]:
df_path = 'combined_preprocessed/combined_preprocessed.csv'

In [3]:
df = pd.read_csv(df_path)

In [4]:
df.head()

Unnamed: 0,PhraseNo,Phrase,IsHateSpeech
0,1,බැල්ලි කොටින් ග කනකො අපහසුවක දැනුන දන ඔ මිනිහ ...,YES
1,2,මන ඊ මුස්ලිම ඩෑල එකක සෙල්ෆියක ගත්ත සහජීවන රැකග...,YES
2,3,සංහිදියාව අවුලක වෙ,NO
3,4,කටින පුරසාරම දොඩ අපි සිංහලය විදිය ලැජ්ජ වි යුතු,YES
4,5,මචන මගු නවත්ත කොන්දක තියෙනවානම පුලුවන්නම කර පෙ...,YES


Missing values checking

In [5]:
df.isnull().sum()

PhraseNo        0
Phrase          0
IsHateSpeech    0
dtype: int64

In [6]:
df = df.drop(columns=["PhraseNo"])
df.head()

Unnamed: 0,Phrase,IsHateSpeech
0,බැල්ලි කොටින් ග කනකො අපහසුවක දැනුන දන ඔ මිනිහ ...,YES
1,මන ඊ මුස්ලිම ඩෑල එකක සෙල්ෆියක ගත්ත සහජීවන රැකග...,YES
2,සංහිදියාව අවුලක වෙ,NO
3,කටින පුරසාරම දොඩ අපි සිංහලය විදිය ලැජ්ජ වි යුතු,YES
4,මචන මගු නවත්ත කොන්දක තියෙනවානම පුලුවන්නම කර පෙ...,YES


In [7]:
df.shape

(2500, 2)

In [8]:
X = df['Phrase']
y = df['IsHateSpeech']

In [9]:
df['IsHateSpeech'].value_counts()

IsHateSpeech
NO     1397
YES    1103
Name: count, dtype: int64

Splitting dataset into train and test (80:20 ratio)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
print(y_train)

286     YES
1495    YES
40      YES
1106     NO
1379    YES
       ... 
1033     NO
1731    YES
763      NO
835      NO
1653    YES
Name: IsHateSpeech, Length: 2000, dtype: object


In [12]:
print(y_train)

286     YES
1495    YES
40      YES
1106     NO
1379    YES
       ... 
1033     NO
1731    YES
763      NO
835      NO
1653    YES
Name: IsHateSpeech, Length: 2000, dtype: object


In [13]:
print("Training dataset: ", X_train.shape[0])
print("Test dataset: ", X_test.shape[0])

Training dataset:  2000
Test dataset:  500


 Vectorize the text data

In [14]:
# count_vector = CountVectorizer()

# train_data_features = count_vector.fit_transform(X_train)
# train_data_features = train_data_features.toarray()
# testing_data = count_vector.transform(X_test)
# testing_data = testing_data.toarray()
# vocab = count_vector.get_feature_names_out()

# joblib.dump(count_vector, 'count_vectorizor/count_vectorizer.pkl')

In [15]:
vectorizer = TfidfVectorizer()
train_data_features = vectorizer.fit_transform(X_train)
train_data_features = train_data_features.toarray()
testing_data = vectorizer.transform(X_test)
testing_data = testing_data.toarray()

In [16]:
train_data_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Metrics

In [17]:
def get_metrics(true_labels, predicted_labels):
    correctly_identified_y = predicted_labels == true_labels

    print("Confusion metrix: ")
    cm = confusion_matrix(true_labels, predicted_labels)
    print(cm)
    
    print()

    accuracy = np.mean(correctly_identified_y) * 100
    print ('Accuracy = %.0f%%' %accuracy)

    precision = (cm[0][0]/(cm[0][0]+cm[1][0]))* 100
    print ('precision = %.1f%%' %precision)

    recall = (cm[0][0]/(cm[0][0]+cm[0][1]))* 100
    print ('recall = %.1f%%' %recall)

    F1_Score = 2*((precision*recall)/(precision+recall))
    print ('F1 Score = %.1f%%' %F1_Score)

Naive Bayes

In [18]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(train_data_features, y_train)

In [19]:
naive_bayes_model_predictions = naive_bayes_model.predict(testing_data)

In [20]:
get_metrics(y_test, naive_bayes_model_predictions)

Confusion metrix: 
[[244  32]
 [112 112]]

Accuracy = 71%
precision = 68.5%
recall = 88.4%
F1 Score = 77.2%


Logistic Regression

In [21]:
logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(train_data_features, y_train)

In [22]:
logistic_regression_model_predictions = logistic_regression_model.predict(testing_data)

In [23]:
get_metrics(y_test, logistic_regression_model_predictions)

Confusion metrix: 
[[242  34]
 [ 99 125]]

Accuracy = 73%
precision = 71.0%
recall = 87.7%
F1 Score = 78.4%


Random Forest

In [24]:
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest_model.fit(train_data_features, y_train)

In [25]:
random_forest_model_predictions = random_forest_model.predict(testing_data)

In [26]:
get_metrics(y_test, random_forest_model_predictions)

Confusion metrix: 
[[236  40]
 [ 95 129]]

Accuracy = 73%
precision = 71.3%
recall = 85.5%
F1 Score = 77.8%


SVM

In [27]:
svm_model = SVC(kernel='linear')

svm_model.fit(train_data_features, y_train)

In [28]:
svm_model_predictions = svm_model.predict(testing_data)

In [29]:
get_metrics(y_test, svm_model_predictions)

Confusion metrix: 
[[219  57]
 [ 81 143]]

Accuracy = 72%
precision = 73.0%
recall = 79.3%
F1 Score = 76.0%


KNN

In [30]:
# List of k values to try
k_values = range(1, 31)

# Empty list to store scores
cross_val_scores = []

# Perform 10-fold cross-validation with each value of k
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, train_data_features, y_train, cv=10, scoring='accuracy')
    cross_val_scores.append(scores.mean())

# Find the optimal k value
best_k = k_values[cross_val_scores.index(max(cross_val_scores))]
print(f"The best value of k is {best_k}")

The best value of k is 28


In [31]:
knn_model = KNeighborsClassifier(n_neighbors=best_k)

knn_model.fit(train_data_features, y_train)

In [32]:
knn_model_predictions = knn_model.predict(testing_data)

In [33]:
get_metrics(y_test, knn_model_predictions)

Confusion metrix: 
[[251  25]
 [133  91]]

Accuracy = 68%
precision = 65.4%
recall = 90.9%
F1 Score = 76.1%
