In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources this script relies on (stop-word list, WordNet
# data for the lemmatizer, Punkt data for tokenization).
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Read the raw CSV into a DataFrame, reporting progress around the load.
file_path = r'../data/HateSpeechDataset.csv'
print("Loading dataset...")
data = pd.read_csv(file_path)
print("Dataset loaded successfully.")

# Replace each distinct 'Label' value with its integer category code so the
# target column is numeric. NOTE(review): missing labels would be encoded as
# -1 by the categorical codes -- confirm the dataset has none.
data['Label'] = data['Label'].astype('category').cat.codes

# Shared helpers for text preprocessing: a lemmatizer instance and the English
# stop words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic are dropped, the rest are lower-cased, English stop
    words are removed, and each remaining word is lemmatized.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell) is treated as an
            empty document instead of raising inside the tokenizer.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives filtering or when ``text`` is not a string.
    """
    # Guard against missing / non-text cells: word_tokenize only accepts str.
    if not isinstance(text, str):
        return ''

    # Lower-case each alphabetic token once and reuse it for both the
    # stop-word test and the lemmatizer.
    lowered = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return ' '.join(lemmatizer.lemmatize(word) for word in lowered if word not in stop_words)

# Clean every document in place before any features are derived from it.
data['Content'] = data['Content'].apply(preprocess_text)

# Features are the cleaned text; the target is the encoded label.
X, y = data['Content'], data['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split completed.")

# Build TF-IDF features capped at 10,000 terms. The vectorizer is fitted on
# the training text only and then reused, unchanged, to transform the test
# text.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a decision tree on the TF-IDF training matrix. fit() returns the
# estimator itself, so construction and training are chained.
print("Training Decision Tree model...")
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)
print("Model trained successfully.")

# 5-fold cross-validated accuracy on the training portion only.
# NOTE(review): the TF-IDF vocabulary/IDF weights were fitted on the whole
# training set, so each validation fold has influenced the features.
print("Performing cross-validation...")
cv_scores = cross_val_score(
    estimator=dt_model,
    X=X_train_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")

# Score the held-out test set with the fitted tree.
print("Evaluating the model...")
y_pred = dt_model.predict(X_test_tfidf)

# Confusion matrix and overall accuracy.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 all use support-weighted averaging across classes.
# With this averaging, recall works out to the same number as accuracy, so
# these figures do not show how well any single class is detected.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the results.
print("Evaluation completed.")
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save the trained model together with the fitted TF-IDF vectorizer.
# The classifier only understands feature vectors produced by this exact
# vectorizer (same vocabulary and IDF weights), so persisting the tree alone
# would leave no way to score new text later.
# NOTE: despite the '.h5' suffix, the model file is a joblib pickle, not HDF5;
# the name is kept so existing consumers of 'dt_model.h5' keep working. Only
# load these files from trusted sources -- unpickling can execute code.
print("Saving the trained model...")
joblib.dump(dt_model, 'dt_model.h5')
joblib.dump(vectorizer, 'dt_tfidf_vectorizer.joblib')
print("Model saved successfully.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mrehm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mrehm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mrehm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading dataset...
Dataset loaded successfully.
Splitting data into training and testing sets...
Data split completed.
Training Decision Tree model...
Model trained successfully.
Performing cross-validation...
Cross-validation scores: [0.850606   0.8478985  0.85067687 0.85183925 0.84793888]
Mean CV Accuracy: 0.8498
Evaluating the model...
Evaluation completed.
Confusion Matrix:
[[66252  6009]
 [ 6962  8959]]
Accuracy: 0.8529064888526003
Precision: 0.8495955391492152
Recall: 0.8529064888526003
F1 Score: 0.8511191958115427
Saving the trained model...
Model saved successfully.
