In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from string import punctuation

In [None]:
# Download the NLTK resources used later in the notebook:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer models for word_tokenize.
#     Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt'
#     models, so both are fetched to keep the tokenization cell working
#     regardless of the installed NLTK version.
#   - 'stopwords': the English stopword list.
#   - 'wordnet': the lexical database behind WordNetLemmatizer.
# nltk.download skips packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the raw dataset from disk into a DataFrame.
# The path is absolute and points at /content, so the file has to be present
# at exactly this location before the cell is run.
filepath = '/content/HateSpeech_Kenya.csv'
data = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
# Name of the DataFrame column that holds the raw text; every preprocessing
# step below reads from / writes to this column.
text_column = "Tweet"
# Bare expression so the notebook echoes the chosen column name.
text_column

'Tweet'

In [None]:
# Show the first rows of the untouched data as a baseline for the
# preprocessing steps that follow (heading and frame on separate lines).
print("Original data head:", data.head(), sep="\n")

Original data head:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  ['The political elite are in desperation. Ordi...  
1  ["Am just curious the only people who are call...  
2  ['USERNAME_3 the area politicians are the one ...  
3  ['War expected in Nakuru if something is not d...  
4  ['USERNAME_4 tells kikuyus activists that they...  


In [None]:
# Replace missing text with an empty string so the string operations in the
# following cells can be applied to every row.
data[text_column] = data[text_column].fillna(value='')

In [None]:
# Normalise case: lowercase every entry of the text column, one row at a time.
data[text_column] = data[text_column].map(str.lower)
# Blank line, heading, then the first rows of the updated frame.
print("\nData after lowercasing:", data.head(), sep="\n")


Data after lowercasing:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  ['the political elite are in desperation. ordi...  
1  ["am just curious the only people who are call...  
2  ['username_3 the area politicians are the one ...  
3  ['war expected in nakuru if something is not d...  
4  ['username_4 tells kikuyus activists that they...  


In [None]:
# Strip punctuation: delete every character listed in string.punctuation from
# each text entry. Characters are removed, not replaced by a space, so text on
# either side of a punctuation mark is joined together.
_strip_punct = str.maketrans('', '', punctuation)
data[text_column] = data[text_column].map(lambda text: text.translate(_strip_punct))
print("\nData after removing punctuations:", data.head(), sep="\n")


Data after removing punctuations:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  the political elite are in desperation ordinar...  
1  am just curious the only people who are callin...  
2  username3 the area politicians are the one to ...  
3  war expected in nakuru if something is not don...  
4  username4 tells kikuyus activists that they ar...  


In [None]:
# Split each cleaned text entry into a list of word tokens with NLTK's
# word_tokenize and keep the result in a new 'tokens' column.
data['tokens'] = data[text_column].map(word_tokenize)
print("\nData after tokenization:", data.head(), sep="\n")


Data after tokenization:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  \
0  the political elite are in desperation ordinar...   
1  am just curious the only people who are callin...   
2  username3 the area politicians are the one to ...   
3  war expected in nakuru if something is not don...   
4  username4 tells kikuyus activists that they ar...   

                                              tokens  
0  [the, political, elite, are, in, desperation, ...  
1  [am, just, curious, the, only, people, who, ar...  
2  [username3, the, area, politicians, are, the, ...  
3  [war, expected, in, nakuru, if, something, is,...  
4  [username4, tells, kikuyus, activi

In [None]:
# Filter English stopwords out of every token list.
# NOTE(review): punctuation was stripped from the text before tokenization, so
# stopwords that NLTK spells with an apostrophe will presumably not match
# their punctuation-free tokens here — confirm whether that matters.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens of one row that are not in `stop_words`, order kept."""
    return [token for token in tokens if token not in stop_words]


data['tokens'] = data['tokens'].map(_without_stopwords)
print("\nData after removing stopwords:", data.head(), sep="\n")


Data after removing stopwords:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  \
0  the political elite are in desperation ordinar...   
1  am just curious the only people who are callin...   
2  username3 the area politicians are the one to ...   
3  war expected in nakuru if something is not don...   
4  username4 tells kikuyus activists that they ar...   

                                              tokens  
0  [political, elite, desperation, ordinary, kale...  
1  [curious, people, calling, old, mad, kikuyus, ...  
2  [username3, area, politicians, one, blame, coz...  
3  [war, expected, nakuru, something, done, luos,...  
4  [username4, tells, kikuyus, 

In [None]:
# Reduce every token to its Porter stem and store the per-row lists in a new
# 'stemmed' column; the 'tokens' column itself is left unchanged.
stemmer = PorterStemmer()
data['stemmed'] = data['tokens'].map(
    lambda row_tokens: list(map(stemmer.stem, row_tokens))
)
print("\nData after stemming:", data.head(), sep="\n")


Data after stemming:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  \
0  the political elite are in desperation ordinar...   
1  am just curious the only people who are callin...   
2  username3 the area politicians are the one to ...   
3  war expected in nakuru if something is not don...   
4  username4 tells kikuyus activists that they ar...   

                                              tokens  \
0  [political, elite, desperation, ordinary, kale...   
1  [curious, people, calling, old, mad, kikuyus, ...   
2  [username3, area, politicians, one, blame, coz...   
3  [war, expected, nakuru, something, done, luos,...   
4  [username4, tells, kikuyus, activ

In [None]:
# Lemmatize every token with WordNet and store the per-row lists in a new
# 'lemmatized' column. lemmatize() is called with the word only, i.e. with its
# default part-of-speech setting.
lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokens'].map(
    lambda row_tokens: list(map(lemmatizer.lemmatize, row_tokens))
)
print("\nData after lemmatization:", data.head(), sep="\n")


Data after lemmatization:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  \
0  the political elite are in desperation ordinar...   
1  am just curious the only people who are callin...   
2  username3 the area politicians are the one to ...   
3  war expected in nakuru if something is not don...   
4  username4 tells kikuyus activists that they ar...   

                                              tokens  \
0  [political, elite, desperation, ordinary, kale...   
1  [curious, people, calling, old, mad, kikuyus, ...   
2  [username3, area, politicians, one, blame, coz...   
3  [war, expected, nakuru, something, done, luos,...   
4  [username4, tells, kikuyus, 

In [None]:
# Re-join each lemmatized token list into one space-separated string, then
# drop rows whose joined text repeats an earlier row (the first occurrence is
# kept). The frame is modified in place, so re-running earlier display cells
# afterwards shows the reduced data.
data['cleaned_text'] = data['lemmatized'].map(' '.join)
data.drop_duplicates(subset=['cleaned_text'], inplace=True)
print("\nData after removing duplicates:", data.head(), sep="\n")


Data after removing duplicates:
   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  \
0  the political elite are in desperation ordinar...   
1  am just curious the only people who are callin...   
2  username3 the area politicians are the one to ...   
3  war expected in nakuru if something is not don...   
4  username4 tells kikuyus activists that they ar...   

                                              tokens  \
0  [political, elite, desperation, ordinary, kale...   
1  [curious, people, calling, old, mad, kikuyus, ...   
2  [username3, area, politicians, one, blame, coz...   
3  [war, expected, nakuru, something, done, luos,...   
4  [username4, tells, kik

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Build a single integer label per row from the three count columns: the
# column holding the largest value wins. DataFrame.idxmax returns the first
# matching column on ties, so ties resolve in the listed order
# (neither, offensive_language, hate_speech).
data['label'] = data[['neither', 'offensive_language', 'hate_speech']].idxmax(axis=1)
data['label'] = data['label'].map({'neither': 0, 'offensive_language': 1, 'hate_speech': 2})

# Split the dataset into features (X) and labels (y)
# NOTE(review): the model is trained on the lowercased, punctuation-stripped
# 'Tweet' text; the 'cleaned_text' column (stopwords removed, lemmatized) is
# not used here — confirm that is intended.
X = data['Tweet']
y = data['label']
# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop training rows whose text is NaN and keep y_train aligned with the
# surviving rows by index.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]

# Convert text data to numerical features using TF-IDF vectorization.
# The vocabulary is fitted on the training split only and reused for the test
# split and for the user-supplied text below.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Define and train the classification model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Ask the user for a piece of text to classify (blocks until input is given).
speech_text = input('enter speech: ')

# Apply the same normalisation the 'Tweet' column received before training:
# lowercase, then delete every string.punctuation character. Without this the
# vectorizer would see tokens split at punctuation that never occur in the
# fitted vocabulary built from punctuation-stripped text.
speech_clean = ''.join(char for char in speech_text.lower() if char not in punctuation)

# Convert the preprocessed speech text into numerical features using the trained vectorizer
X_speech_vectorized = vectorizer.transform([speech_clean])

# Predict the label for the speech (array with one entry, one per input text)
predicted_label = model.predict(X_speech_vectorized)

# Interpret the predicted label; index the single prediction explicitly rather
# than relying on the truth value of a one-element array comparison.
speech_label = predicted_label[0]
if speech_label == 0:
    print("The speech is not offensive.")
elif speech_label == 1:
    print("The speech contains offensive language.")
else:
    print("The speech is classified as hate speech.")


Accuracy: 0.7650456884781011
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      7227
           1       0.45      0.13      0.20      1670
           2       0.41      0.13      0.20       624

    accuracy                           0.77      9521
   macro avg       0.55      0.41      0.42      9521
weighted avg       0.71      0.77      0.71      9521

Confusion Matrix:
[[6987  170   70]
 [1405  215   50]
 [ 453   89   82]]
enter speech: John is good
The speech is not offensive.
