In [27]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [28]:
# Read the cyberbullying CSV into a DataFrame. Later cells use its
# 'text' column as the model input and its 'anti_lgbt' column as the label.
df = pd.read_csv(filepath_or_buffer='anti-lgbt-cyberbullying.csv')

In [29]:
# Clean data
#
# Steps: report/drop missing values, strip emojis, lowercase, strip every
# remaining non-ASCII character. The cell can be re-run safely: applying the
# same cleaning to already-clean text leaves it unchanged.

# Check for NA values. Rows without a 'text' or 'anti_lgbt' value cannot be
# cleaned, vectorized, or used as labels, so they are dropped here instead of
# letting the string operations below fail on NaN.
na_values = df.isna().sum()
if na_values.any():
    print('Dataset contains NA values. Please handle or remove them before proceeding.')
    print(na_values)
    rows_before = len(df)
    # .copy() so the column assignment below writes to an independent frame.
    df = df.dropna(subset=['text', 'anti_lgbt']).copy()
    print(f"Dropped {rows_before - len(df)} row(s) with a missing 'text' or 'anti_lgbt' value.")
else:
    print('No NA values found in the dataset.')

# Emoji character class.
# NOTE: the final range (U+24C2 - U+1F251) is a broad catch-all that covers far
# more than emoji (for example CJK ideographs). That is harmless in this
# pipeline only because every non-ASCII character is removed a few lines below.
emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F700-\U0001F77F"  # alchemical symbols
                           "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           "\U0001FA00-\U0001FA6F"  # Chess Symbols
                           "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           "\U00002702-\U000027B0"  # Dingbats
                           "\U000024C2-\U0001F251"  # broad catch-all (see note above)
                           "]+")

# Vectorized string methods replace the per-row `apply(lambda ...)` calls.
# Order matters and is kept: emojis first, then lowercase, then non-ASCII.
df['text'] = (
    df['text']
    .astype(str)  # guard against non-string cells (e.g. purely numeric text)
    .str.replace(emoji_pattern, '', regex=True)
    .str.lower()
    # Remove every non-ASCII character (anything outside \x00-\x7F).
    .str.replace(r'[^\x00-\x7F]+', '', regex=True)
)

# Save the cleaned dataset (optional)
# df.to_csv('cleaned_dataset.csv', index=False)


No NA values found in the dataset.


In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(df['text'], df['anti_lgbt'], test_size=0.2, random_state=42)
train_data, test_data, train_labels, test_labels = split_parts

In [31]:
# Turn the raw text into token-count feature vectors.
# The tuple is evaluated left to right, so the vocabulary is learned from the
# training text before the test text is encoded with it.
vectorizer = CountVectorizer()
train_vectors, test_vectors = (
    vectorizer.fit_transform(train_data),  # learn vocabulary + encode training text
    vectorizer.transform(test_data),       # encode test text with the same vocabulary
)

In [32]:
# Fit a Multinomial Naive Bayes model on the training count vectors.
classifier = MultinomialNB()
classifier.fit(X=train_vectors, y=train_labels)

In [33]:
# Persist the fitted classifier together with the vectorizer that produced
# its input features, each to its own joblib file.
joblib.dump(value=classifier, filename='naive_bayes_model.pkl')
joblib.dump(value=vectorizer, filename='count_vectorizer.pkl')

['count_vectorizer.pkl']

In [34]:
# Predict a label for every held-out test document.
predictions = classifier.predict(X=test_vectors)

In [35]:
# Fraction of test documents whose predicted label matches the true label.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.91


In [36]:
# Per-class precision, recall and f1-score for the test-set predictions.
print(
    '\nClassification Report:',
    classification_report(test_labels, predictions),
    sep='\n',
)


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       590
           1       0.91      0.78      0.84       270

    accuracy                           0.91       860
   macro avg       0.91      0.87      0.89       860
weighted avg       0.91      0.91      0.90       860



In [22]:
# Compare different models
# All models below are fitted on the same training count vectors.

# Baseline: Multinomial Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X=train_vectors, y=train_labels)

In [23]:
# Random Forest with a fixed seed so the fitted model is reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X=train_vectors, y=train_labels)

In [24]:
# K Nearest Neighbors with scikit-learn's default settings.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X=train_vectors, y=train_labels)

In [25]:
# Score the same held-out test vectors with each fitted model
# (order: Naive Bayes, Random Forest, K Nearest Neighbors).
nb_predictions, rf_predictions, knn_predictions = (
    fitted_model.predict(test_vectors)
    for fitted_model in (nb_classifier, rf_classifier, knn_classifier)
)

In [26]:
# Evaluate and compare the classifiers
#
# For each model: accuracy, sensitivity, specificity and the confusion matrix,
# all computed against the same held-out test labels.
#
# The loop variables are named `clf_predictions` / `clf_accuracy` so that they
# do not overwrite the notebook-level `predictions` and `accuracy` produced by
# the Naive Bayes cells; otherwise re-running those cells after this one would
# silently report the last model in the list instead.
classifiers = [('Naive Bayes', nb_predictions),
               ('Random Forest', rf_predictions),
               ('K Nearest Neighbors', knn_predictions)]

for clf_name, clf_predictions in classifiers:
    clf_accuracy = accuracy_score(test_labels, clf_predictions)
    confusion_mat = confusion_matrix(test_labels, clf_predictions)

    # Binary confusion matrix: rows are true classes, columns are predicted
    # classes, i.e. [[TN, FP], [FN, TP]] with label 1 treated as the positive
    # class. Unpacking fails loudly if the matrix is not 2x2.
    tn, fp, fn, tp = confusion_mat.ravel()
    sensitivity = tp / (tp + fn)  # true positive rate
    specificity = tn / (tn + fp)  # true negative rate

    print(f'\n{clf_name} Classifier:')
    print(f'Accuracy: {clf_accuracy:.2f}')
    print(f'Sensitivity (True Positive Rate): {sensitivity:.2f}')
    print(f'Specificity (True Negative Rate): {specificity:.2f}')
    print('Confusion Matrix:')
    print(confusion_mat)
    print('-' * 40)

# Note: Sensitivity is the true positive rate, and specificity is the true negative rate.


Naive Bayes Classifier:
Accuracy: 0.91
Sensitivity (True Positive Rate): 0.78
Specificity (True Negative Rate): 0.96
Confusion Matrix:
[[568  22]
 [ 59 211]]
----------------------------------------

Random Forest Classifier:
Accuracy: 0.87
Sensitivity (True Positive Rate): 0.63
Specificity (True Negative Rate): 0.98
Confusion Matrix:
[[578  12]
 [ 99 171]]
----------------------------------------

K Nearest Neighbors Classifier:
Accuracy: 0.75
Sensitivity (True Positive Rate): 0.78
Specificity (True Negative Rate): 0.73
Confusion Matrix:
[[432 158]
 [ 60 210]]
----------------------------------------
