In [1]:
import sys
import os
import pandas as pd

In [2]:
# Project root: `scripts.*` imports and the working directory are resolved against it.
# Set the CYBERBULLYING_BASE_PATH environment variable to run the notebook from another
# checkout/machine; the original local path stays as the fallback so existing runs
# behave exactly as before.
BASE_PATH = os.environ.get(
    "CYBERBULLYING_BASE_PATH",
    "/Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying",
)
# Re-running this cell in the same kernel must not pile up duplicate sys.path entries.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)
# Make the project root the working directory for the rest of the notebook.
os.chdir(BASE_PATH)

# Imports

In [None]:
from scripts import config
from scripts.data_loader import DataLoader
from scripts.data_understanding import DataUnderstanding
from scripts.language_detection import LanguageDetector
from scripts.data_cleaning import DataCleaner
from scripts.text_preprocessing import TextPreprocessor
from scripts.data_builder import DatasetBuilder
from scripts.data_splitting import DataSplitter
from scripts.data_saver import DataSaver

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelemessere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data loading

In [None]:
# Load the raw dataset from the location configured in `config.RAW_DATA_PATH`.
# `df` is the frame every later cell starts from.
loader = DataLoader(config.RAW_DATA_PATH)
df = loader.load_dataset()

Dataset loaded with shape: (47692, 2)


# Data understanding

In [None]:
# Build the inspection helper over the freshly loaded frame; the text and class
# column names are taken from `config` rather than hard-coded.
print("\n--- DATA UNDERSTANDING ---")
inspector = DataUnderstanding(df, text_column=config.TEXT_COLUMN, class_column=config.LABEL_COLUMN)


--- DATA UNDERSTANDING ---


In [6]:
# Print the number of rows per class.
inspector.class_distribution()


Class Distribution:
cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64


In [7]:
# Print the class imbalance ratio (largest class size / smallest class size).
inspector.check_imbalance()


Class Imbalance Ratio (max/min): 1.02


In [8]:
# Print the count of missing values per column.
inspector.check_missing_values()


Missing Values:
tweet_text            0
cyberbullying_type    0
dtype: int64


In [9]:
# Print, per column, how many cells are empty or whitespace-only strings.
# NOTE(review): the captured output ends with a stray `.applymap(...)` line, which looks
# like the tail of a pandas warning raised inside the helper — presumably the
# `DataFrame.applymap` deprecation; confirm in scripts/data_understanding.py.
inspector.check_empty_strings()


Empty or whitespace-only strings per column:
tweet_text            0
cyberbullying_type    0
dtype: int64


  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


In [10]:
# Print the number of rows whose text is a duplicate of another row's text.
inspector.check_duplicates()


Number of duplicated tweet_text: 1675


In [11]:
# Break the duplicated texts down: label counts among them, plus how many rows are
# perfect duplicates (same text and label) versus the same text with different labels.
inspector.inspect_duplicates()


Total duplicated texts (same text, any label): 3350 rows

Label counts among all duplicates:
other_cyberbullying: 1580
not_cyberbullying: 1525
gender: 226
ethnicity: 11
religion: 8

Perfect duplicates (same text and same label): 72 rows
Imperfect duplicates (same text, different labels): 3278 rows


In [12]:
# Print the average number of hashtags per tweet.
inspector.hashtag_analysis()


Average Hashtags per Tweet: 0.24


In [13]:
# Print the average number of emojis per tweet.
inspector.emoji_analysis()


Average Emojis per Tweet: 0.02


# Language filtering

In [None]:
# Keep only the rows whose text is detected as the target language (`config.LANGUAGE`).
print("\n--- LANGUAGE FILTERING ---")
lang_detector = LanguageDetector(target_lang=config.LANGUAGE)

# Build the keep/drop mask as a standalone Series instead of a temporary "is_en" column:
# writing the column mutated the frame that had already been handed to `DataUnderstanding`,
# and would overwrite (then drop) any real column that happened to be named "is_en".
is_target_lang = df[config.TEXT_COLUMN].astype(str).apply(lang_detector.is_target_language)

# `.eq(True)` keeps the original `== True` semantics: any value that is not True is dropped.
# `.copy()` yields an independent frame, as the original `.drop(columns=...)` did.
df = df.loc[is_target_lang.eq(True)].copy()


--- LANGUAGE FILTERING ---


# Preprocessing: duplicate removal

In [15]:
# Wrap the language-filtered frame in the cleaning helper used for duplicate removal.
cleaner = DataCleaner(df)

In [16]:
# --- Duplicate analysis before cleaning ---
# Column names are read from `config` (as in the other cells) rather than hard-coded,
# so this report keeps working if the configured column names change.
text_col = config.TEXT_COLUMN
label_col = config.LABEL_COLUMN

# `duplicated` flags every repeat after the first occurrence within the given subset.
n_text_dups = df.duplicated(subset=[text_col]).sum()
n_text_label_dups = df.duplicated(subset=[text_col, label_col]).sum()

print("----- Before removing -----")
print(f"Total rows numbers: {df.shape[0]}")
print(f"Only text duplicates ({text_col}): {n_text_dups}")
print(f"Text duplicates + label ({text_col} + {label_col}): {n_text_label_dups}")

----- Before removing -----
Total rows numbers: 44668
Only text duplicates (tweet_text): 1501
Text duplicates + label (tweet_text + cyberbullying_type): 35


In [17]:
# Remove duplicates. The helper's printed log reports three passes: perfect duplicates,
# rows whose text appears with conflicting labels, and a final text-only pass.
cleaned_df = cleaner.clean_all_duplicates()



--- CLEANING DUPLICATES BASED ON TEXT COLUMN: tweet_text ---

[1/3] Removing perfect duplicates...
Removed 35 perfect duplicates.

[2/3] Removing conflicting label duplicates...
Removed 2932 conflicting label rows.

[3/3] Forcing final text-only duplicate removal...
Removed 0 pure text duplicates.

DUPLICATE CLEANING COMPLETED.


In [18]:
# --- Duplicate check after cleaning ---
# Column names are read from `config` (as in the other cells) rather than hard-coded,
# so this check keeps working if the configured column names change.
text_col = config.TEXT_COLUMN
label_col = config.LABEL_COLUMN

# Both counts are expected to be 0 once duplicate cleaning has run.
remaining_text_dups = cleaned_df.duplicated(subset=[text_col]).sum()
remaining_text_label_dups = cleaned_df.duplicated(subset=[text_col, label_col]).sum()

print("\n----- After removing -----")
print(f"Total rows numbers controll: {cleaned_df.shape[0]}")
print(f"Only text duplicates controll ({text_col}): {remaining_text_dups}")
print(f"Duplicati su testo + label controll ({text_col} + {label_col}): {remaining_text_label_dups}")


----- After removing -----
Total rows numbers controll: 41701
Only text duplicates controll (tweet_text): 0
Duplicati su testo + label controll (tweet_text + cyberbullying_type): 0


# Text preprocessing

In [None]:
# Add two cleaned variants of each tweet as new columns on `cleaned_df`:
# "tweet_soft" (via clean_text_soft) and "tweet_full" (via clean_text_full).
print("\n--- TEXT PREPROCESSING ---")
preprocessor = TextPreprocessor()

# Both variants start from the same string-cast text column, so cast it once.
source_text = cleaned_df[config.TEXT_COLUMN].astype(str)
cleaning_steps = {
    "tweet_soft": preprocessor.clean_text_soft,
    "tweet_full": preprocessor.clean_text_full,
}
for column_name, clean_fn in cleaning_steps.items():
    cleaned_df[column_name] = source_text.apply(clean_fn)


--- TEXT PREPROCESSING ---


# Preprocessed dataset building

In [20]:
# Build the full preprocessed dataset (kept on disk for inspection and later work):
# run the builder's label steps, write the label mapping, then save the resulting frame.
print("\n--- PREPROCESSED DATASET BUILDING ---")
builder = DatasetBuilder(cleaned_df)
builder.add_binary_label()
builder.add_multiclass_label()
label_mapping_path = os.path.join(config.PROCESSED_DATA_PATH, "label_mapping.json")
builder.save_label_mapping(label_mapping_path)
# Continue with the builder's frame from here on.
cleaned_df = builder.df

print("\n--- SAVING PREPROCESSED DATASET ---")
saver = DataSaver()
preprocessed_csv_path = os.path.join(config.PROCESSED_DATA_PATH, "dataset_preprocessed.csv")
saver.save_dataframe(cleaned_df, preprocessed_csv_path)


--- PREPROCESSED DATASET BUILDING ---
Label mapping saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/processed_data/label_mapping.json.

--- SAVING PREPROCESSED DATASET ---
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/processed_data/dataset_preprocessed.csv.


# Data splitting

In [None]:
# Split the preprocessed frame into train / validation / test sets. The splitter is
# given the binary label column and the configured seed (`config.SEED`).
print("\n--- SPLITTING DATASET ---")
splitter = DataSplitter(cleaned_df, label_column=config.BINARY_LABEL_COLUMN, random_state=config.SEED)
train_df, val_df, test_df = splitter.split()

if config.SAVE_SPLITS:
    # Announce the save step only when it actually runs; previously the banner was
    # printed even with SAVE_SPLITS disabled, suggesting files had been written.
    print("\n--- SAVING SPLITTED DATASET ---")
    saver = DataSaver()
    # One CSV per split, written to the interim data directory in train/val/test order.
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        saver.save_dataframe(split_df, os.path.join(config.INTERIM_DATA_PATH, f"{split_name}.csv"))


--- SPLITTING DATASET ---
Train set: 29607 samples
Validation set: 7923 samples
Test set: 4171 samples

--- SAVING SPLITTED DATASET ---
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/train.csv.
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/val.csv.
Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/interim/test.csv.
