In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Train and evaluate a TF-IDF + Multinomial Naive Bayes tweet classifier.
#
# Pipeline: load CSV -> split raw text into train/test -> fit TF-IDF on the
# training text only -> train MultinomialNB -> report accuracy and the
# per-class precision/recall/F1 on the held-out split.

# Load the dataset
data = pd.read_csv('/content/HateSpeech_Kenya.csv')

# Display the first few rows as a quick sanity check of the loaded columns
print(data.head())

# Step 1: Select features and target
X = data['Tweet']  # Features (raw tweet text)
y = data['Class']  # Target class label for each tweet

# Step 2: Train-Test Split
# Split the RAW text before any vectorization. Fitting the vectorizer on the
# full corpus would let the held-out tweets influence the vocabulary, the
# max_features cut-off and the IDF weights (train/test leakage), which makes
# the reported scores optimistic.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider passing stratify=y so both splits keep the same class mix.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Text Preprocessing
# TF-IDF converts text to a sparse numerical matrix. English stop words are
# dropped and the vocabulary is capped at the 5000 most frequent terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn vocabulary and IDF weights from the training text only ...
X_train = vectorizer.fit_transform(X_train_text)
# ... and merely apply that fitted transformation to the held-out text.
X_test = vectorizer.transform(X_test_text)

# Whole-corpus matrix, kept under its original name for any later cell that
# refers to it. It is produced with the train-fitted vectorizer, so it does
# not reintroduce leakage into the model trained below.
X_tfidf = vectorizer.transform(X)

# Step 4: Model Training
# Multinomial Naive Bayes works directly on the non-negative TF-IDF features
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 5: Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



   hate_speech  offensive_language  neither  Class  \
0            0                   0        3      0   
1            0                   0        3      0   
2            0                   0        3      0   
3            0                   0        3      0   
4            0                   0        3      0   

                                               Tweet  
0  ['The political elite are in desperation. Ordi...  
1  ["Am just curious the only people who are call...  
2  ['USERNAME_3 the area politicians are the one ...  
3  ['War expected in Nakuru if something is not d...  
4  ['USERNAME_4 tells kikuyus activists that they...  
Accuracy: 0.7476

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.99      0.86      7166
           1       0.47      0.04      0.07      1806
           2       0.57      0.01      0.02       644

    accuracy                           0.75      9616
   macro avg       0.60      0.