In [None]:
# Streamlit Cloud-Based Hate Speech Detection System

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.corpus import stopwords
import re
import string
from joblib import dump

In [2]:
# Load dataset
data = pd.read_csv("https://raw.githubusercontent.com/fenago/datasets/main/twitter.csv")

# Preprocessing
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})
data = data[["tweet", "labels"]]

nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Vectorization
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Model Training
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Saving the model and vectorizer
dump(clf, 'hate_speech_model.pkl')
dump(cv, 'count_vectorizer.pkl')

['count_vectorizer.pkl']

In [4]:
# Ensure you have already loaded your model and vectorizer
from joblib import load
clf = load('hate_speech_model.pkl')
cv = load('count_vectorizer.pkl')

# Function to preprocess and predict
def predict_hate_speech(text):
    # Preprocess the text (ensure this is consistent with your training preprocessing)
    text = clean(text)  # Assuming 'clean' is your preprocessing function
    text_vectorized = cv.transform([text]).toarray()
    prediction = clf.predict(text_vectorized)
    return prediction[0]

# Test with some sample texts
sample_texts = ["Your example tweet here", "Another example tweet"]
for text in sample_texts:
    prediction = predict_hate_speech(text)
    print(f"Tweet: {text}\nPrediction: {prediction}\n")

Tweet: Your example tweet here
Prediction: No Hate and Offensive

Tweet: Another example tweet
Prediction: No Hate and Offensive



In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TfidfVectorizer with bi-grams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(x)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Try a Random Forest Classifier
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],#['auto']
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

In [12]:
from sklearn.metrics import classification_report, accuracy_score

y_pred_before = clf.predict(X_test)
print("Before Improvements:")
print(classification_report(y_test, y_pred_before))
print(f"Accuracy: {accuracy_score(y_test, y_pred_before)}\n")

Before Improvements:
                       precision    recall  f1-score   support

          Hate Speech       0.37      0.33      0.35       465
No Hate and Offensive       0.82      0.83      0.82      1379
   Offensive Language       0.92      0.93      0.93      6335

             accuracy                           0.88      8179
            macro avg       0.70      0.70      0.70      8179
         weighted avg       0.87      0.88      0.88      8179

Accuracy: 0.8777356645066634



In [None]:
# Assuming 'clf_rf' is the new classifier after hyperparameter tuning
y_pred_after = CV_rfc.best_estimator_.predict(X_test)
print("After Improvements:")
print(classification_report(y_test, y_pred_after))
print(f"Accuracy: {accuracy_score(y_test, y_pred_after)}")