# In [3]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset and attach a human-readable label for each numeric class.
# Class values other than 0, 1 or 2 have no entry in the mapping and become NaN.
dataset = pd.read_csv("HATESPEECHDETECTION.csv")
dataset["Labels"] = dataset["Class"].map({0: "Hate Speech", 1: "Offensive Speech", 2: "Neither Hate nor Offensive"})
# Take an independent copy: `data` is assigned into later, and writing into a
# column subset of `dataset` is what triggers pandas' SettingWithCopyWarning.
data = dataset[["Tweet", "Labels"]].copy()

# Text-normalisation resources shared by clean_data
stemmer = nltk.SnowballStemmer("english")
# NLTK's English stop-word list plus 'rt' (presumably the retweet marker)
stopwords = {*nltk.corpus.stopwords.words("english"), 'rt'}

def clean_data(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; strip URLs, square-bracketed spans and
    angle-bracketed tags; delete punctuation; turn line breaks into spaces;
    drop tokens containing a digit; remove stop words; stem what remains.

    Args:
        text: Raw tweet. Non-string values are coerced with ``str()``.

    Returns:
        The cleaned text with tokens joined by single spaces (may be empty).
    """
    text = str(text).lower()
    # `https?` makes the "s" optional so both http and https links are removed;
    # the former `http?` made the "p" optional and never matched https URLs.
    # Raw strings keep `\S`, `\[`, `\w`, `\d` from being read as (invalid)
    # string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space; deleting them outright would fuse the
    # last word of one line with the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    # Stop words are filtered before stemming, on the punctuation-free tokens.
    return " ".join(stemmer.stem(word) for word in text.split() if word not in stopwords)

# Clean every tweet. Warnings are silenced only for this one assignment
# instead of globally, so diagnostics raised by later steps stay visible
# for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    data["Tweet"] = data["Tweet"].apply(clean_data)

# Turn the cleaned tweets into bag-of-words counts, then hold out 33% for testing
cvec = CountVectorizer()
y = np.array(data["Labels"])
X = cvec.fit_transform(np.array(data["Tweet"]))
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Train a Decision Tree Classifier.
# The estimator is seeded (same seed as the train/test split) so that repeated
# runs build the same tree and report the same accuracy.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = dtree.predict(X_test)
# Pin the row/column order to the classifier's own class order so the heatmap
# axes can be labelled unambiguously.
class_names = dtree.classes_
confum = confusion_matrix(y_test, y_pred, labels=class_names)
accuracy = accuracy_score(y_test, y_pred)

# Display the confusion matrix and accuracy
ax = sns.heatmap(
    confum,
    annot=True,
    fmt='d',  # cells are integer counts, so do not render them as floats
    cmap='tab20c',
    xticklabels=class_names,
    yticklabels=class_names,
)
# confusion_matrix puts true labels on rows and predicted labels on columns
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()
print(f"Accuracy: {accuracy:.2f}")

# Sample Predictions
def predict_sample(sample_text):
    """Classify one raw text with the trained decision tree.

    The text goes through ``clean_data`` and the already-fitted ``cvec``
    vocabulary before being handed to ``dtree``.

    Args:
        sample_text: Raw, uncleaned text to classify.

    Returns:
        The result of ``dtree.predict`` for the single-row input.
    """
    cleaned = clean_data(sample_text)
    features = cvec.transform([cleaned])
    return dtree.predict(features.toarray())

# Hand-written probes for the classifier, with the label each is expected to get:
#   sample1 -> Hate Speech
#   sample2 -> Hate Speech
#   sample3 -> Neither
sample1 = "Let's unite and kill all the people who are protesting against the government."
sample2 = "Transgender individuals are just confused and mentally ill."
sample3 = "I think we should consider implementing environmentally friendly practices to protect our planet."

for sample in (sample1, sample2, sample3):
    print(predict_sample(sample))

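# Captured cell output (apparently source lines echoed alongside warnings;
# the warning header lines were not preserved):
#   text = re.sub('http?://\S+|www\.\S+', '', text)
#   text = re.sub('\[.*?\]', '', text)
#   text = re.sub('\w*\d\w*', '', text)
#   text = re.sub('http?://\S+|www\.\S+', '', text)
#   text = re.sub('\[.*?\]', '', text)
#   text = re.sub('\w*\d\w*', '', text)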


# Captured cell output:
# FileNotFoundError: [Errno 2] No such file or directory: 'HATESPEECHDETECTION.csv'