<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/Cyberbullying_detection_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install opendatasets library
!pip install opendatasets

In [None]:
# Import necessary libraries and functions
import pandas as pd
import opendatasets as od
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.metrics import Precision
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Bidirectional
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Use the opendatasets library to fetch the cyberbullying-classification dataset from Kaggle.
# NOTE(review): presumably this asks for Kaggle credentials and writes the files under the
# current working directory — confirm before running unattended.
od.download("https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification")

In [14]:
# Read the tweets CSV into a pandas DataFrame.
# The path is relative to the working directory (where the download cell stores the dataset
# folder) instead of the Colab-only absolute "/content/..." location, so the notebook also
# runs outside Colab.
data=pd.read_csv("cyberbullying-classification/cyberbullying_tweets.csv")

In [None]:
# Preview the first rows of the raw data
data.head()

In [None]:
# Number of rows and columns in the data
data.shape

In [None]:
# Count the rows per cyberbullying_type label to inspect the class balance
data['cyberbullying_type'].value_counts()

In [None]:
# Count the missing values in each column
data.isnull().sum()

In [18]:
# Encode the cyberbullying_type column as integer labels.
# The original column is overwritten in place, so re-running this cell after it has already
# run fits the encoder on the integer codes instead of the original label values.
# NOTE(review): the 0-5 class mapping hard-coded in cyberbullying_detector presumably relies on
# the order the encoder assigns — confirm against encoder.classes_.
encoder=LabelEncoder()
data['cyberbullying_type']=encoder.fit_transform(data['cyberbullying_type'])

In [None]:
# Display the first rows again, now with the encoded cyberbullying_type column
data.head()

In [None]:
# Count the rows per encoded label (same distribution as before, keyed by integer code)
data['cyberbullying_type'].value_counts()

In [22]:
# Define the shared objects used by the cells below
precision_metric = Precision()  # extra metric reported next to accuracy during training
stemmer=PorterStemmer()  # reused by text_cleaner for every token
# Keep the stop words in a set: text_cleaner tests membership once per token, and a set lookup
# is O(1) whereas the list returned by NLTK is scanned linearly each time.
English_stopwords=set(stopwords.words('english'))

In [23]:
# Define a function that cleans a raw text
def text_cleaner(text):
  """Normalise a raw text into a space-separated string of stemmed tokens.

  Pipeline: lower-case -> replace every character that is not a letter or a digit
  with a space -> tokenise -> drop English stop words -> Porter-stem what is left.

  Stop words are removed *before* stemming. The stemmer rewrites many stop words
  into forms that are no longer in the stop-word list (e.g. "this" -> "thi",
  "was" -> "wa", "very" -> "veri"), so filtering after stemming lets them through.

  Args:
    text: the raw text to clean (a str).

  Returns:
    The cleaned text as a single string; empty if nothing survives the filtering.
  """
  text=text.lower()  # Convert to lower case
  # Keep only ASCII letters and digits; everything else becomes a separator
  text_with_no_punctuations = re.sub(r'[^a-zA-Z0-9]', ' ', text)
  tokens=word_tokenize(text_with_no_punctuations)  # Tokenize into words
  # Filter on the un-stemmed tokens so they can still match the stop-word list
  text_with_no_stopwords=[word for word in tokens if word not in English_stopwords]
  stemmed_text=[stemmer.stem(word) for word in text_with_no_stopwords]  # Apply stemming
  final_cleaned_text=' '.join(stemmed_text)
  return final_cleaned_text


In [24]:
# Apply text_cleaner to every entry of the tweet_text column and store the result
# in a new cleaned_tweet column (the raw text column is left untouched)
data['cleaned_tweet']=data['tweet_text'].apply(text_cleaner)

In [25]:
# Define a helper that counts the words of a text
def count_words(text):
  """Return how many whitespace-separated words ``text`` contains."""
  words = text.split()
  return len(words)

In [26]:
# Apply count_words on the cleaned_tweet column to get the number of words per cleaned tweet
data['cleaned_tweet_length']=data['cleaned_tweet'].apply(count_words)

In [None]:
# Get summary statistics for the cleaned_tweet_length column
# (useful for judging the fixed sequence length used when padding)
data['cleaned_tweet_length'].describe()

In [29]:
# Initialize the tokenizer object and build its word index from the cleaned tweets
tok=Tokenizer()
tok.fit_on_texts(data['cleaned_tweet'])

In [None]:
# Find the vocab_size: number of distinct words in the tokenizer's word index, plus one.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras Tokenizer never
# assigns to a word and which is used as the padding value — confirm.
vocab_size=len(tok.word_index)+1
print(vocab_size)

In [33]:
# Define the features and the labels (in array formats)
x=tok.texts_to_sequences(data['cleaned_tweet'])  # each cleaned tweet -> list of word indices
y=to_categorical(data['cyberbullying_type'])  # integer labels -> one-hot rows
# Pad / truncate every sequence at its end to exactly 40 tokens; the same length is
# passed to the Embedding layer and used again in cyberbullying_detector.
x_padded=pad_sequences(x,maxlen=40,padding='post',truncating='post')

In [41]:
# Create the model: embedding -> bidirectional LSTM -> two dense layers -> softmax
model=Sequential()  # (a duplicated, redundant second instantiation was removed)
# input_length=40 matches the maxlen used when padding the sequences
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=40))
# return_sequences=False: only the final LSTM output is passed on to the dense layers
model.add(Bidirectional(LSTM(120,return_sequences=False)))
model.add(Dense(200,activation='relu'))
model.add(Dense(45,activation='relu'))
# 6 output units: one per class index (0-5) handled in cyberbullying_detector
model.add(Dense(6,activation='softmax'))

In [42]:
# Compile the model and fit it with a shuffled, stratified validation set
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["acc",precision_metric])
# Keras' `validation_split` always takes the *last* fraction of the arrays, before any
# shuffling. If the rows of the CSV are grouped by class, that hold-out contains only some
# of the classes and its metrics are meaningless. Splitting explicitly gives every class
# its share in both parts, and the fixed seed makes the split reproducible.
x_train,x_val,y_train,y_val=train_test_split(
    x_padded,y,
    test_size=0.1,
    stratify=y.argmax(axis=1),  # stratify on the integer class recovered from the one-hot rows
    random_state=42,
)
model.fit(x_train,y_train,validation_data=(x_val,y_val),batch_size=64,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79f882d7d300>

In [51]:
# Create a function for demo
def cyberbullying_detector(text):
    """Classify ``text`` with the trained model and print the detected category.

    The text goes through the same preprocessing as the training data
    (``text_cleaner``, the fitted tokenizer ``tok``, padding/truncation to
    40 tokens) before it is passed to ``model.predict``.

    Args:
        text: the raw text to classify (a str).

    Returns:
        None. The verdict is printed.
    """
    # One message per output index of the 6-unit softmax layer
    class_messages = (
        "Age-cyberbullying detected.",
        "Ethnicity-cyberbullying detected.",
        'Gender-cyberbullying detected.',
        "No cyberbullying detected.",
        "Cyberbullying detected.",
        "Religion-cyberbullying detected.",  # typo "cyvberbullying" fixed
    )

    # Clean the input text
    cleaned_text = text_cleaner(text)
    # Convert the cleaned text to a sequence of integers
    text_array = tok.texts_to_sequences([cleaned_text])
    # Pad the sequence exactly like the training data
    padded_array = pad_sequences(text_array, maxlen=40, padding='post', truncating='post')
    # Use the trained model to generate the class probabilities
    prediction = model.predict(padded_array)

    # Find the predicted class and report it
    predicted_class = int(np.argmax(prediction))
    print(class_messages[predicted_class])



In [52]:
# Disclaimer: I am sorry for, and do not endorse, the following statement; it only mimics
# real-world examples in order to test the model.
text=" Females are very stupid beings."
cyberbullying_detector(text)

Gender-cyberbullying detected.


In [61]:
# A harmless greeting, used to test the model on a neutral input
text="Hello Reader, How is your day going so far ?."
cyberbullying_detector(text)

No cyberbullying detected.


In [None]:
# Save the model architecture and weights to a single HDF5 file.
# NOTE(review): the fitted tokenizer `tok` is not saved in the visible cells (pickle is
# imported but unused); presumably it must be persisted too for the saved model to be
# usable on new text — confirm.
model.save("model_cyberbullying.h5")