In [1]:
import sys
import os
import pandas as pd

In [2]:
# Project root directory. Set the CYBERBULLYING_BASE_PATH environment variable to
# run the notebook on another machine; when it is unset, the original local path
# is used so existing behaviour is unchanged.
BASE_PATH = os.environ.get(
    "CYBERBULLYING_BASE_PATH",
    "/Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying",
)

# Make the project's `scripts` package importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

# Run the rest of the notebook from the project root.
# NOTE(review): presumably the paths in `config` are resolved relative to this
# directory — confirm against scripts/config.py.
os.chdir(BASE_PATH)

# Imports

In [3]:
from scripts import config
from scripts.data_loader import DataLoader
from scripts.data_understanding import DataUnderstanding
from scripts.language_detection import LanguageDetector
from scripts.data_cleaning import DataCleaner
from scripts.text_preprocessing import TextPreprocessor
from scripts.data_builder import DatasetBuilder
from scripts.data_splitting import DataSplitter
from scripts.data_saver import DataSaver

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelemessere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data loading

In [4]:
# Read the raw dataset from the location configured in config.RAW_DATA_PATH.
# load_dataset() also reports the shape of the loaded frame (see the cell output).
loader = DataLoader(config.RAW_DATA_PATH)
df = loader.load_dataset()

Dataset loaded with shape: (47692, 2)


# Data understanding

In [5]:
# Exploratory helper bound to the raw frame; it is told which column holds the
# tweet text and which one holds the class label.
inspector = DataUnderstanding(df, text_column=config.TEXT_COLUMN, class_column=config.LABEL_COLUMN)

In [6]:
# Run every exploratory check in order, printing a separator line after each
# one except the last so the individual reports stay visually distinct.
separator = "--------------------------"

checks_followed_by_separator = (
    inspector.class_distribution,
    inspector.check_imbalance,
    inspector.check_missing_values,
    inspector.check_empty_strings,
    inspector.check_duplicates,
    inspector.inspect_duplicates,
    inspector.hashtag_analysis,
)

for run_check in checks_followed_by_separator:
    run_check()
    print()
    print(separator)
    print()

# The final report is kept as the cell's last expression, with no trailing separator.
inspector.emoji_analysis()


Class Distribution:
cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.02
--------------------------

Missing Values:
tweet_text            0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
cyberbullying_type    0
dtype: int64
--------------------------

Number of duplicated tweet_text: 1675
--------------------------

Total duplicated texts (same text, any label): 3350 rows

Label counts among all duplicates:
other_cyberbullying: 1580
not_cyberbullying: 1525
gender: 226
ethnicity: 11
religion: 8

Perfect duplicates (same text and same label): 72 rows
Imperfect duplicates (same text, different labels): 3278 rows
--------------------------


  .applymap(lambda x: isinstance(x, str) and x.strip() == '')



Average Hashtags per Tweet: 0.24
--------------------------

Average Emojis per Tweet: 0.02


# Language filtering

In [7]:
# Keep only the tweets detected as the target language configured in
# config.LANGUAGE (English, per the original intent of this cell).
lang_detector = LanguageDetector(target_lang=config.LANGUAGE)

# Build the mask as a standalone Series instead of writing a temporary column
# into `df`: the same frame object was handed to `inspector` earlier, so adding
# a helper column to it would silently change what that object sees.
is_target_language = df[config.TEXT_COLUMN].astype(str).apply(lang_detector.is_target_language)

# astype(bool) guards against an object-dtype result (e.g. a None for an
# undetectable text is treated as "not the target language").
df = df[is_target_language.astype(bool)]

# Preprocessing: class filtering and duplicate removal

In [8]:
# Drop the 'other_cyberbullying' class and renumber the rows from 0.
# The label column is taken from config (as in the data-understanding cells)
# rather than being hard-coded here.
df = df[df[config.LABEL_COLUMN] != 'other_cyberbullying'].reset_index(drop=True)

In [9]:
# Sanity check: list the classes that remain after the filter above.
# Uses the configured label column for consistency with the rest of the notebook.
print(df[config.LABEL_COLUMN].unique())

['not_cyberbullying' 'gender' 'religion' 'age' 'ethnicity']


In [10]:
# Duplicate-cleaning helper bound to the filtered frame. The text column comes
# from config so it stays in sync with the other pipeline steps.
cleaner = DataCleaner(df, text_column=config.TEXT_COLUMN)

In [11]:
# --- Duplicate statistics before cleaning ---
# Two counts are reported: rows repeating an earlier tweet text, and rows
# repeating an earlier (text, label) pair.
print("----- Before removing -----")

rows_before = len(df)
print(f"Total rows numbers: {rows_before}")

text_dupes_before = df.duplicated(subset=['tweet_text']).sum()
print(f"Only text duplicates (tweet_text): {text_dupes_before}")

text_label_dupes_before = df.duplicated(subset=['tweet_text', 'cyberbullying_type']).sum()
print(f"Text duplicates + label (tweet_text + cyberbullying_type): {text_label_dupes_before}")

----- Before removing -----
Total rows numbers: 37751
Only text duplicates (tweet_text): 91
Text duplicates + label (tweet_text + cyberbullying_type): 35


In [12]:
# Remove duplicated tweets. According to the report printed by the cleaner,
# texts that appear with conflicting labels ("imperfect" duplicates) are removed,
# while exact text+label repeats ("perfect" duplicates) are reduced to one row.
cleaned_df = cleaner.clean_text_duplicates()



--- CLEANING DUPLICATES COLUMN BY COLUMN: ['tweet_text'] ---

Processing column: 'tweet_text'
 - Removed 112 imperfect duplicates (conflicting labels)
 - Removed 35 perfect duplicates (keeping one)

Total rows removed: 147
--- DUPLICATE CLEANING COMPLETED ---


  duplicates_imperfect = duplicates_all[


In [13]:
# --- Duplicate statistics after cleaning ---
# Same two counts as before the cleaning step, computed on the cleaned frame;
# both are expected to be zero.
print("\n----- After removing -----")

rows_after = len(cleaned_df)
print(f"Total rows numbers controll: {rows_after}")

text_dupes_after = cleaned_df.duplicated(subset=['tweet_text']).sum()
print(f"Only text duplicates controll (tweet_text): {text_dupes_after}")

text_label_dupes_after = cleaned_df.duplicated(subset=['tweet_text', 'cyberbullying_type']).sum()
print(f"Duplicati su testo + label controll (tweet_text + cyberbullying_type): {text_label_dupes_after}")


----- After removing -----
Total rows numbers controll: 37604
Only text duplicates controll (tweet_text): 0
Duplicati su testo + label controll (tweet_text + cyberbullying_type): 0


# Text preprocessing

In [14]:
# Derive two cleaned variants of every tweet and store them as new columns.
# NOTE(review): "soft" vs "full" presumably differ in how aggressively the text
# is normalised — confirm against scripts/text_preprocessing.py.
preprocessor = TextPreprocessor()

# Cast the text column to str once and reuse it for both variants.
tweets_as_str = cleaned_df[config.TEXT_COLUMN].astype(str)

cleaning_variants = {
    "tweet_soft": preprocessor.clean_text_soft,
    "tweet_full": preprocessor.clean_text_full,
}
for target_column, clean_fn in cleaning_variants.items():
    cleaned_df[target_column] = tweets_as_str.apply(clean_fn)

# Preprocessed dataset building

In [16]:
# Build the full preprocessed dataset (kept for inspection and later work):
# the builder adds the binary and the multiclass label, and the resulting
# frame is read back from `builder.df`.
builder = DatasetBuilder(cleaned_df)
builder.add_binary_label()
builder.add_multiclass_label()
cleaned_df = builder.df

# Persist the preprocessed dataset under the configured processed-data folder.
saver = DataSaver()
preprocessed_csv_path = os.path.join(config.PROCESSED_DATA_PATH, "dataset_preprocessed.csv")
saver.save_dataframe(cleaned_df, preprocessed_csv_path)

Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/processed_data/dataset_preprocessed.csv.


# Data splitting

In [17]:
# Split the preprocessed dataset into train / validation / test partitions.
# The binary label column and the configured seed are passed to the splitter.
splitter = DataSplitter(cleaned_df, label_column=config.BINARY_LABEL_COLUMN, random_state=config.SEED)
train_df, val_df, test_df = splitter.split()

# Optionally write each partition to the interim-data folder (train, val, test order).
if config.SAVE_SPLITS:
    saver = DataSaver()
    partitions_by_file = (
        ("train.csv", train_df),
        ("val.csv", val_df),
        ("test.csv", test_df),
    )
    for file_name, partition in partitions_by_file:
        saver.save_dataframe(partition, os.path.join(config.INTERIM_DATA_PATH, file_name))

Train set: 26698 samples
Validation set: 7145 samples
Test set: 3761 samples
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/train.csv.
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/val.csv.
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/test.csv.
