In [1]:
# import necessary packages
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import zipfile
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression



In [2]:
# Unpack the zipped dataset into a local folder next to the notebook.
archive_path = "CyberBullying Comments Dataset.zip"
extract_dir = "CyberBullying_dataset"

with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [3]:
# Load the extracted CSV into a DataFrame and display it.
csv_path = 'CyberBullying_dataset/CyberBullying Comments Dataset.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df


Unnamed: 0,Text,CB_Label
0,damn there is someones nana up here at beach w...,0
1,no kidding! dick clark was a corpse mechanical...,0
2,i read an article on jobros and thought damn w...,0
3,I got one fucking day of sprinkles and now it'...,0
4,I was already listening to Elliott smith and ...,0
...,...,...
11095,"""Don't worry you little empty head over it ......",1
11096,"""Some of Ya'll are dumb as fuck.... These are ...",1
11097,"""Lana, you're so full of shit your eyes are br...",1
11098,"""You ain't lying let the @dbeeio61:disqus\xa0\...",1


In [4]:

# Column dtypes and non-null counts (printed by info()) ...
df.info()
# ... followed by the number of missing values per column (cell output).
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11100 entries, 0 to 11099
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      11100 non-null  object
 1   CB_Label  11100 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 173.6+ KB


Text        0
CB_Label    0
dtype: int64

In [5]:
# Class balance: how many rows carry each CB_Label value.
df.loc[:, 'CB_Label'].value_counts()

CB_Label
0    5550
1    5550
Name: count, dtype: int64

In [6]:

# Shared normalisers, created once and reused for every token.
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()


def lemmatize_stem(token):
    """Normalise a single token: lemmatize it as a verb, then stem the lemma.

    Parameters
    ----------
    token : str
        One word token.

    Returns
    -------
    The result of ``stemmer.stem`` applied to the verb lemma of ``token``.
    """
    verb_lemma = lemmatizer.lemmatize(token, pos='v')
    return stemmer.stem(verb_lemma)


In [7]:
# NLTK's English stopwords, extended with punctuation marks and tokenizer
# artefacts that the NLTK list does not contain.
stopwords_list = stopwords.words('english')
stopwords_list.extend([
    '’', "...", "--", "''", "``", "&", '.', ',', '?', '!', 'u',
    ';', ':', '..', "'s", '(', ')',
])

# Immutable snapshot used for the actual filtering: membership tests on a set
# are O(1), whereas `w not in stopwords_list` scans the whole list for every
# token of every document.
# NOTE: re-run this cell after changing `stopwords_list`, otherwise the
# snapshot will not see the change.
_STOPWORD_SET = frozenset(stopwords_list)


def data_preprocess(text):
    """Turn one raw comment into a list of normalised tokens.

    Steps: lowercase the text, tokenize it with ``word_tokenize``, drop every
    token found in the stopword/punctuation set, and pass the remaining tokens
    through ``lemmatize_stem``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list
        The surviving tokens, in their original order, after
        lemmatization + stemming.
    """
    tokens = word_tokenize(text.lower())
    return [lemmatize_stem(w) for w in tokens if w not in _STOPWORD_SET]


# Run the preprocessing pipeline over every comment in the 'Text' column.
processed_docs = df['Text'].map(data_preprocess)

# Sanity check: look at the first nine tokenised comments.
print(processed_docs.head(9))

0    [damn, someon, nana, beach, one, dont, think, ...
1    [kid, dick, clark, corps, mechan, oper, advert...
2    [read, articl, jobro, think, damn, cash, jobro...
3    [get, one, fuck, day, sprinkl, back, sunshin, ...
4    [alreadi, listen, elliott, smith, fuck, hate, ...
5       [tell, derek, go, fuck, devyn, tell, us, calm]
6           ['m, watch, new, smosh, video, laugh, ass]
7    [mom, n't, like, catholic, idea, sinner, birth...
8    [ya, know, lol, big, thunder, mountain, break,...
Name: Text, dtype: object


### Bag of Words

In [8]:
# Build the token <-> integer id mapping from the tokenised comments.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [9]:
# Number of distinct tokens in the dictionary before any filtering.
print(len(dictionary))

15466


In [10]:
# Peek at the first 11 (id, token) pairs of the dictionary.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 beach
1 damn
2 dont
3 get
4 ic
5 nana
6 one
7 quick
8 someon
9 steal
10 think


In [11]:
# Prune the vocabulary in place using gensim's document-frequency thresholds
# (no_below / no_above) and the keep_n size cap.
dictionary.filter_extremes(
    no_below=10,
    no_above=0.5,
    keep_n=100000,
)

# How many tokens remain after filtering.
print(len(dictionary))

1367


In [12]:
# Current vocabulary size, i.e. the number of bag-of-words features.
vocab_size = len(dictionary)
print(vocab_size)

1367


In [13]:
# Vectorise every tokenised document with dictionary.doc2bow: each entry of
# 'bow_corpus' records how many times each token (by ID) appears in a document.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Arbitrary document used as a spot check.
docnr = 4000

print(bow_corpus[docnr])
print('length of corpus: ', len(bow_corpus))

[(0, 1), (7, 1), (47, 1), (53, 1), (156, 1), (341, 1), (359, 1)]
length of corpus:  11100


In [14]:
# Show the sparse bag-of-words vectors of the first five documents.
first_five_docs = bow_corpus[0:5]
print(first_five_docs)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1)], [(0, 1), (7, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(2, 1), (3, 1), (16, 1), (17, 1), (18, 1)], [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]]


In [15]:
from gensim.matutils import corpus2dense

# Convert the sparse BoW corpus to a dense matrix.
# corpus2dense returns [terms x docs]; transpose to get [docs x terms].
# Supplying num_docs lets gensim preallocate the result instead of building a
# list of per-document vectors and stacking them; the values are the same.
dense_matrix = corpus2dense(
    bow_corpus,
    num_terms=len(dictionary),
    num_docs=len(bow_corpus),
).T


In [16]:
# Cast the dense counts to integers and wrap them in a DataFrame
# (one row per document, one column per token id).
dense_counts = dense_matrix.astype(int)
bow_df = pd.DataFrame(dense_counts)

In [17]:
# Attach the target column positionally: take the raw label array and cut it
# to the number of rows in the feature table.
label_values = df['CB_Label'].to_numpy()
bow_df['CB_Label'] = label_values[:len(bow_df)]


In [18]:
# Quick sanity check of the feature table: first rows, shape, class balance.
for summary in (bow_df.head(), bow_df.shape, bow_df['CB_Label'].value_counts()):
    print(summary)

   0  1  2  3  4  5  6  7  8  9  ...  1358  1359  1360  1361  1362  1363  \
0  1  1  1  1  1  1  1  1  1  0  ...     0     0     0     0     0     0   
1  0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
2  1  0  0  0  0  0  0  1  0  0  ...     0     0     0     0     0     0   
3  0  0  1  1  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

   1364  1365  1366  CB_Label  
0     0     0     0         0  
1     0     0     0         0  
2     0     0     0         0  
3     0     0     0         0  
4     0     0     0         0  

[5 rows x 1368 columns]
(11100, 1368)
CB_Label
0    5550
1    5550
Name: count, dtype: int64


In [19]:
# Target vector and feature matrix (everything except the label column).
y = bow_df['CB_Label']
X = bow_df.drop('CB_Label', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


### Naive Bayes Classification 

In [20]:
# MultinomialNB is already imported in the notebook's first cell.
# Create the multinomial Naive Bayes classifier and fit it on the training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)


In [21]:
# Token string for every feature column, in column (token id) order.
feature_names = [dictionary[token_id] for token_id in range(len(dictionary))]

# Per-class log probabilities of each feature, as learned by the NB model.
feature_log_probs = nb_model.feature_log_prob_
# Row 0 is treated as CB_Label=0 and row 1 as CB_Label=1
# (assumes nb_model.classes_ is [0, 1] -- TODO confirm).
log_probs_cb0 = feature_log_probs[0]
log_probs_cb1 = feature_log_probs[1]

# The ten features with the largest (class 1 - class 0) log-probability gap;
# argsort is ascending, so they are the last ten entries.
top_features = np.argsort(log_probs_cb1 - log_probs_cb0)[-10:]

print("Top 10 Informative Features (Most indicative of cyberbullying):")
# Walk the selection backwards so the largest gap is printed first.
for idx in top_features[::-1]:
    print(f"Word: {feature_names[idx]} | CB_Label=1 log_prob: {log_probs_cb1[idx]:.2f} | CB_Label=0 log_prob: {log_probs_cb0[idx]:.2f}")


Top 10 Informative Features (Most indicative of cyberbullying):
Word: \xa0 | CB_Label=1 log_prob: -6.00 | CB_Label=0 log_prob: -10.29
Word: moron | CB_Label=1 log_prob: -5.55 | CB_Label=0 log_prob: -9.60
Word: pathet | CB_Label=1 log_prob: -6.15 | CB_Label=0 log_prob: -9.60
Word: \xa0you | CB_Label=1 log_prob: -6.98 | CB_Label=0 log_prob: -10.29
Word: idiot | CB_Label=1 log_prob: -4.94 | CB_Label=0 log_prob: -8.21
Word: > | CB_Label=1 log_prob: -5.92 | CB_Label=0 log_prob: -9.19
Word: \n | CB_Label=1 log_prob: -7.36 | CB_Label=0 log_prob: -10.29
Word: coward | CB_Label=1 log_prob: -7.40 | CB_Label=0 log_prob: -10.29
Word: intellig | CB_Label=1 log_prob: -7.40 | CB_Label=0 log_prob: -10.29
Word: racist | CB_Label=1 log_prob: -6.35 | CB_Label=0 log_prob: -9.19


In [22]:
# The other metric helpers are imported in the notebook's first cell;
# only confusion_matrix still needs to be brought in here.
from sklearn.metrics import confusion_matrix

# Predictions of the Naive Bayes model on the held-out test split.
y_pred = nb_model.predict(X_test)

# Overall accuracy, rounded for display.
accuracy = accuracy_score(y_test, y_pred)
print(round(accuracy, 2))

# Per-class precision, recall and F1: each metric is evaluated once with
# class 0 and once with class 1 as the positive label.
pre_0, pre_1 = (precision_score(y_test, y_pred, pos_label=label) for label in (0, 1))
r_0, r_1 = (recall_score(y_test, y_pred, pos_label=label) for label in (0, 1))
f_0, f_1 = (f1_score(y_test, y_pred, pos_label=label) for label in (0, 1))

# Collect the per-class scores in a table (one row per class).
eval_dict = {
    "Class": [0, 1],
    "Precision": [pre_0, pre_1],
    "Recall": [r_0, r_1],
    "F1-score": [f_0, f_1],
}
Evaluation = pd.DataFrame(eval_dict)

# Confusion matrix (as returned by sklearn's confusion_matrix).
print ("---Confusion_matrix---")
print(confusion_matrix(y_test, y_pred))
print("\n")

# Full text report with readable class names.
print("\t \t ----Model Performance----")
class_names = ["Not Cyberbullying", "Cyberbullying"]
print(classification_report(y_test, y_pred, target_names=class_names))

0.71
---Confusion_matrix---
[[869 259]
 [374 718]]


	 	 ----Model Performance----
                   precision    recall  f1-score   support

Not Cyberbullying       0.70      0.77      0.73      1128
    Cyberbullying       0.73      0.66      0.69      1092

         accuracy                           0.71      2220
        macro avg       0.72      0.71      0.71      2220
     weighted avg       0.72      0.71      0.71      2220



### Logistic Regression

In [23]:
# Fit a logistic regression classifier on the training split,
# with the solver's iteration cap set explicitly to 1000.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=X_train, y=y_train)

In [24]:
# Logistic regression predictions for the held-out test split.
y_hat = lr_model.predict(X=X_test)

In [25]:
# Overall accuracy of the logistic regression model, rounded for display.
accuracy_lr = accuracy_score(y_test, y_hat)
print(round(accuracy_lr, 2))

# Per-class precision, recall and F1: each metric is evaluated once with
# class 0 and once with class 1 as the positive label.
pre_0_lr, pre_1_lr = (precision_score(y_test, y_hat, pos_label=label) for label in (0, 1))
r_0_lr, r_1_lr = (recall_score(y_test, y_hat, pos_label=label) for label in (0, 1))
f_0_lr, f_1_lr = (f1_score(y_test, y_hat, pos_label=label) for label in (0, 1))

# Collect the per-class scores in a table (one row per class).
eval_dict_lr = {
    "Class": [0, 1],
    "Precision": [pre_0_lr, pre_1_lr],
    "Recall": [r_0_lr, r_1_lr],
    "F1-score": [f_0_lr, f_1_lr],
}
Evaluation_lr = pd.DataFrame(eval_dict_lr)

# Confusion matrix (as returned by sklearn's confusion_matrix).
print ("---Confusion_matrix---")
print(confusion_matrix(y_test, y_hat))
print("\n")

# Full text report with readable class names.
print("\t \t ----Model Performance----")
class_names_lr = ["Not Cyberbullying", "Cyberbullying"]
print(classification_report(y_test, y_hat, target_names=class_names_lr))

0.71
---Confusion_matrix---
[[870 258]
 [382 710]]


	 	 ----Model Performance----
                   precision    recall  f1-score   support

Not Cyberbullying       0.69      0.77      0.73      1128
    Cyberbullying       0.73      0.65      0.69      1092

         accuracy                           0.71      2220
        macro avg       0.71      0.71      0.71      2220
     weighted avg       0.71      0.71      0.71      2220

