Labelling the data as 0 (not cyberbullying) or 1 (cyberbullying)

In [None]:
import pandas as pd

# Read the raw tweet dataset
file_path = '/content/cyberbullying_tweets.csv'
data = pd.read_csv(file_path)

# Binary target: 0 when the category is 'not_cyberbullying', 1 for every other category
data['label'] = (data['cyberbullying_type'] != 'not_cyberbullying').astype(int)

# Write the labelled frame to a new CSV (the row index is not written)
output_path = '/content/labeled_cyberbullying_tweets.csv'
data.to_csv(output_path, index=False)

# Preview the first rows of the labelled frame
data.head()

Unnamed: 0,tweet_text,cyberbullying_type,label
0,"In other words #katandandre, your food was cra...",not_cyberbullying,0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0


Preprocessing the labeled data

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download the NLTK data used below (only needs to be run once).
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the word_tokenize models from that package; older releases that do not know
# the package are expected to just report it as not found -- TODO confirm on
# the NLTK version in use.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Load the labelled dataset written by the labelling step
file_path = '/content/labeled_cyberbullying_tweets.csv'
data = pd.read_csv(file_path)

# Initialize NLTK components: English stopword set and WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function: clean, tokenize, remove stopwords, lemmatize
def preprocess_text(text):
    """Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, strip URLs, replace runs of non-word
    characters with a space, delete digits, tokenize with NLTK, drop
    English stopwords, lemmatize each remaining token.

    Args:
        text: The raw tweet. Missing values (None or float NaN) yield an
            empty list; any other non-string value is converted with str().

    Returns:
        list[str]: The processed tokens (possibly empty).
    """
    # A missing cell reaches us as None or float NaN; it carries no text and
    # would otherwise crash on .lower().
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []

    # Convert to lowercase (coercing any other non-string value first)
    text = str(text).lower()
    # Remove URLs ('http\S+' already covers 'https...' links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove special characters and numbers.
    # Note: \W does not match '_', so underscores are kept.
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize what is left
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Clean every entry of the 'tweet_text' column and join its tokens back into a
# single space-separated string (the Keras Tokenizer is fitted on text input)
data['processed_tweet'] = data['tweet_text'].apply(
    lambda raw_tweet: ' '.join(preprocess_text(raw_tweet))
)

# Vocabulary cap and padded sequence width for the Keras vectorization.
# NOTE(review): max_sequence_length equals max_vocab_size; every row of X is
# padded to 4500 integers, which looks far wider than a tweet needs and makes
# X large. Left unchanged because later steps may depend on this shape --
# confirm before reducing.
max_vocab_size = 4500
max_sequence_length = 4500

# Build the word index from the cleaned tweets
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(data['processed_tweet'])

# Encode each tweet as a sequence of integers, then pad to a uniform width
sequences = tokenizer.texts_to_sequences(data['processed_tweet'])
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels as a NumPy array
y = data['label'].to_numpy()

# Save the frame (now including the 'processed_tweet' text column) to a new CSV;
# the integer sequences themselves are stored in the .npy file below
processed_output_path = '/content/processed_cyberbullying_tweets.csv'
data.to_csv(processed_output_path, index=False)

# Save the feature array (X) and label array (y) as NumPy files
np.save('/content/X_features.npy', X)
np.save('/content/y_labels.npy', y)

# All three files are written under '/content/'.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
!pip install scikit-fuzzy

Collecting scikit-fuzzy
  Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0


In [None]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz

# Load the processed tweets and the bad-word lexicon
tweets_df = pd.read_csv('/content/processed_cyberbullying_tweets.csv')
bad_words_df = pd.read_csv('/content/bad_words.csv')

# Convert bad words to a set for faster lookup.
# Entries are stripped and lowercased so they can match the tweet tokens,
# which were lowercased during preprocessing; missing and empty entries are
# dropped so they never enter the set.
bad_words_set = set(
    bad_words_df['bad_words'].dropna().astype(str).str.strip().str.lower()
)
bad_words_set.discard('')

# Fuzzy sets over the bad-word count, built once instead of on every call
# (they do not depend on the tweet being classified).
_BAD_WORD_UNIVERSE = np.arange(0, 11, 1)
_BAD_WORD_LOW = fuzz.trimf(_BAD_WORD_UNIVERSE, [0, 0, 3])
_BAD_WORD_MEDIUM = fuzz.trimf(_BAD_WORD_UNIVERSE, [2, 4, 4])


# Function to classify tweets based on the number of bad words
def classify_tweet(processed_tweet, label):
    """Grade a tweet as 'Neither', 'Low', 'Medium' or 'High'.

    Args:
        processed_tweet: Space-separated processed tweet text. A float NaN
            (an empty cell read back from CSV) is treated as an empty string;
            any other value is converted with str().
        label: Binary label; 0 short-circuits to 'Neither'.

    Returns:
        str: 'Neither' when label == 0. Otherwise 'Low' if the bad-word count
        has non-zero membership in the low set, else 'Medium' if it has
        non-zero membership in the medium set, else 'High'.
        NOTE(review): with these triangle parameters this appears to mean
        0-2 -> Low, 3-4 -> Medium, 5+ -> High -- confirm against skfuzzy.
    """
    if label == 0:
        return 'Neither'

    # Ensure the tweet is a string and handle missing values
    if isinstance(processed_tweet, float) and np.isnan(processed_tweet):
        processed_tweet = ""
    else:
        processed_tweet = str(processed_tweet)

    # Count the whitespace-separated tokens found in the module-level bad_words_set
    bad_word_count = sum(word in bad_words_set for word in processed_tweet.split())

    # First set with non-zero membership wins; 'High' is the fallback, so no
    # separate high membership function is needed.
    if fuzz.interp_membership(_BAD_WORD_UNIVERSE, _BAD_WORD_LOW, bad_word_count) > 0:
        return 'Low'
    if fuzz.interp_membership(_BAD_WORD_UNIVERSE, _BAD_WORD_MEDIUM, bad_word_count) > 0:
        return 'Medium'
    return 'High'

# Grade every row from its processed text and binary label
classifications = tweets_df.apply(
    lambda tweet_row: classify_tweet(tweet_row['processed_tweet'], tweet_row['label']),
    axis=1,
)
tweets_df['classification'] = classifications

# Write the graded tweets to a new CSV file (the row index is not written)
tweets_df.to_csv('/content/classified_tweet.csv', index=False)

print("Classification completed and saved to 'classified_tweet.csv'")

Classification completed and saved to 'classified_tweet.csv'
