In [1]:
import sys
import os
import pandas as pd

In [2]:
BASE_PATH = "/Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying"
sys.path.append(BASE_PATH)
os.chdir(BASE_PATH)

# Binary part

# Imports

In [3]:
from scripts.config import VAL_SET_PATH, TEXT_COLUMN, LABEL_COLUMN, PROCESSED_DATA_PATH
from scripts.data_loader import DataLoader
from scripts.data_understanding import DataUnderstanding
from scripts.data_cleaning import DataCleaner
from scripts.data_saver import DataSaver

# Training data loading

In [4]:
loader = DataLoader(file_path=VAL_SET_PATH)
val_df = loader.load_dataset()

Dataset loaded with shape: (7145, 6)


# Training data understanding

## Raw column

In [5]:
du_raw = DataUnderstanding(
    dataset=val_df,
    text_column='tweet_text',          
    class_column=LABEL_COLUMN        
)

In [6]:
du_raw.class_distribution()

print("--------------------------")

du_raw.check_imbalance()

print("--------------------------")

du_raw.check_missing_values()

print("--------------------------")

du_raw.check_empty_strings()

print("--------------------------")

du_raw.check_duplicates()

print("--------------------------")

du_raw.inspect_duplicates(text_column='tweet_text', label_column=LABEL_COLUMN)

print("--------------------------")

du_raw.average_tweet_length()

print("--------------------------")

du_raw.binary_class_distribution()

print("--------------------------")



Class Distribution:
cyberbullying_type
age                  1557
religion             1506
gender               1456
ethnicity            1373
not_cyberbullying    1253
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text             0
cyberbullying_type     0
tweet_soft             3
tweet_full             6
is_cyberbullying       0
cyberbullying_label    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text             0
cyberbullying_type     0
tweet_soft             0
tweet_full             0
is_cyberbullying       0
cyberbullying_label    0
dtype: int64
--------------------------

Number of duplicated tweet_text: 0
--------------------------

Total duplicated texts (same text, any label): 0 rows

Label counts among all duplicates:

Perfect duplicates (same text and same label): 0 rows
Imperfect duplicates (same text, different labels): 0 rows


  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


## Soft column

In [7]:
du_soft = DataUnderstanding(
    dataset=val_df,
    text_column='tweet_soft',          
    class_column=LABEL_COLUMN        
)

In [8]:
du_soft.class_distribution()

print("--------------------------")

du_soft.check_imbalance()

print("--------------------------")

du_soft.check_missing_values()

print("--------------------------")

du_soft.check_empty_strings()

print("--------------------------")

du_soft.check_duplicates()

print("--------------------------")

du_soft.inspect_duplicates(text_column='tweet_soft', label_column=LABEL_COLUMN)

print("--------------------------")

du_soft.average_tweet_length()

print("--------------------------")

du_soft.binary_class_distribution()

print("--------------------------")



Class Distribution:
cyberbullying_type
age                  1557
religion             1506
gender               1456
ethnicity            1373
not_cyberbullying    1253
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text             0
cyberbullying_type     0
tweet_soft             3
tweet_full             6
is_cyberbullying       0
cyberbullying_label    0
char_length            0
word_length            0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text             0
cyberbullying_type     0
tweet_soft             0
tweet_full             0
is_cyberbullying       0
cyberbullying_label    0
char_length            0
word_length            0
dtype: int64
--------------------------

Number of duplicated tweet_soft: 18
--------------------------

Total duplicated texts (same text, any label): 27 rows

Label counts among all duplicates:
gender: 17
a

  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


## Full column

In [9]:
du_full = DataUnderstanding(
    dataset=val_df,
    text_column='tweet_full',          
    class_column=LABEL_COLUMN        
)

In [10]:
du_full.class_distribution()

print("--------------------------")

du_full.check_imbalance()

print("--------------------------")

du_full.check_missing_values()

print("--------------------------")

du_full.check_empty_strings()

print("--------------------------")

du_full.check_duplicates()

print("--------------------------")

du_full.inspect_duplicates(text_column='tweet_full', label_column=LABEL_COLUMN)

print("--------------------------")

du_full.average_tweet_length()

print("--------------------------")

du_full.binary_class_distribution()

print("--------------------------")



Class Distribution:
cyberbullying_type
age                  1557
religion             1506
gender               1456
ethnicity            1373
not_cyberbullying    1253
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text             0
cyberbullying_type     0
tweet_soft             3
tweet_full             6
is_cyberbullying       0
cyberbullying_label    0
char_length            0
word_length            0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text             0
cyberbullying_type     0
tweet_soft             0
tweet_full             0
is_cyberbullying       0
cyberbullying_label    0
char_length            0
word_length            0
dtype: int64
--------------------------

Number of duplicated tweet_full: 38
--------------------------

Total duplicated texts (same text, any label): 56 rows

Label counts among all duplicates:
gender: 17
a

  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


# Data preprocessing

In [11]:
# Text columns to be cleaned
text_columns = ['tweet_text', 'tweet_soft', 'tweet_full']

cleaner = DataCleaner(val_df, text_column=text_columns, label_column=LABEL_COLUMN)

# 1. Remove global duplicates across all specified text columns
cleaned_df = cleaner.clean_text_duplicates()

# 2. Drop rows with missing values in any of the specified text columns
cleaned_df = cleaner.drop_missing_values(important_columns=text_columns)


--- CLEANING DUPLICATES COLUMN BY COLUMN: ['tweet_text', 'tweet_soft', 'tweet_full'] ---

Processing column: 'tweet_text'
 - Removed 0 imperfect duplicates (conflicting labels)
 - Removed 0 perfect duplicates (keeping one)

Processing column: 'tweet_soft'
 - Removed 0 imperfect duplicates (conflicting labels)
 - Removed 18 perfect duplicates (keeping one)

Processing column: 'tweet_full'
 - Removed 0 imperfect duplicates (conflicting labels)
 - Removed 38 perfect duplicates (keeping one)

Total rows removed: 38
--- DUPLICATE CLEANING COMPLETED ---

--- DROPPING MISSING VALUES IN: ['tweet_text', 'tweet_soft', 'tweet_full'] ---
Removed 1 rows with missing values.

MISSING VALUE CLEANING COMPLETED.


  duplicates_imperfect = duplicates_all[
  duplicates_imperfect = duplicates_all[
  duplicates_imperfect = duplicates_all[


In [12]:
# Rinomina la label e seleziona le colonne desiderate
cleaned_df = cleaned_df.rename(columns={'cyberbullying_label': 'label'})

# Seleziona le colonne da includere nel file finale
columns_to_keep = ['tweet_text', 'tweet_soft', 'tweet_full', 'label', 'cyberbullying_type']
cleaned_df = cleaned_df[columns_to_keep]

# Salvataggio
saver = DataSaver()
output_path = os.path.join(PROCESSED_DATA_PATH, "validation_multiclass_preprocessed.csv")
saver.save_full_dataset(cleaned_df, output_path)

Dataset saved to /Users/manuelemessere/Documents/Università /a) corsi/Human Language Technologies/HLT24_25/hlt_projct/cyberbullying/data/processed_data/validation_multiclass_preprocessed.csv.


In [13]:
# Check
print(f"Remaining rows: {cleaned_df.shape[0]}")


Remaining rows: 7106


# Preprocessed training data understanding

## Raw column

In [14]:
cleaned_du = DataUnderstanding(
    dataset=cleaned_df,
    text_column='tweet_text',          
    class_column=LABEL_COLUMN        
)

In [15]:
cleaned_du.class_distribution()

print("--------------------------")

cleaned_du.check_imbalance()

print("--------------------------")

cleaned_du.check_missing_values()

print("--------------------------")

cleaned_du.check_empty_strings()

print("--------------------------")

cleaned_du.check_empty_strings()

print("--------------------------")

cleaned_du.check_duplicates()

print("--------------------------")

cleaned_du.inspect_duplicates(text_column='tweet_text', label_column=LABEL_COLUMN)

print("--------------------------")

cleaned_du.binary_class_distribution()


Class Distribution:
cyberbullying_type
age                  1546
religion             1504
gender               1444
ethnicity            1368
not_cyberbullying    1244
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Number of duplicated tweet_text: 0
--------------------------

Total duplicated texts (same text, any label): 0 rows

Lab

  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


## Soft 

In [16]:
cleaned_du_soft = DataUnderstanding(
    dataset=cleaned_df,
    text_column='tweet_soft',          
    class_column=LABEL_COLUMN        
)

In [17]:
cleaned_du_soft.class_distribution()

print("--------------------------")

cleaned_du_soft.check_imbalance()

print("--------------------------")

cleaned_du_soft.check_missing_values()

print("--------------------------")

cleaned_du_soft.check_empty_strings()

print("--------------------------")

cleaned_du_soft.check_empty_strings()

print("--------------------------")

cleaned_du_soft.check_duplicates()

print("--------------------------")

cleaned_du_soft.inspect_duplicates(text_column='tweet_text', label_column=LABEL_COLUMN)

print("--------------------------")

cleaned_du_soft.binary_class_distribution()


Class Distribution:
cyberbullying_type
age                  1546
religion             1504
gender               1444
ethnicity            1368
not_cyberbullying    1244
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Number of duplicated tweet_soft: 0
--------------------------

Total duplicated texts (same text, any label): 0 rows

Lab

  .applymap(lambda x: isinstance(x, str) and x.strip() == '')


## Full

In [18]:
cleaned_du_full = DataUnderstanding(
    dataset=cleaned_df,
    text_column='tweet_full',          
    class_column=LABEL_COLUMN        
)

In [19]:
cleaned_du_full.class_distribution()

print("--------------------------")

cleaned_du_full.check_imbalance()

print("--------------------------")

cleaned_du_full.check_missing_values()

print("--------------------------")

cleaned_du_full.check_empty_strings()

print("--------------------------")

cleaned_du_full.check_empty_strings()

print("--------------------------")

cleaned_du_full.check_duplicates()

print("--------------------------")

cleaned_du_full.inspect_duplicates(text_column='tweet_text', label_column=LABEL_COLUMN)

print("--------------------------")

cleaned_du_full.binary_class_distribution()


Class Distribution:
cyberbullying_type
age                  1546
religion             1504
gender               1444
ethnicity            1368
not_cyberbullying    1244
Name: count, dtype: int64
--------------------------

Class Imbalance Ratio (max/min): 1.24
--------------------------

Missing Values:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Empty or whitespace-only strings per column:
tweet_text            0
tweet_soft            0
tweet_full            0
label                 0
cyberbullying_type    0
dtype: int64
--------------------------

Number of duplicated tweet_full: 0
--------------------------

Total duplicated texts (same text, any label): 0 rows

Lab

  .applymap(lambda x: isinstance(x, str) and x.strip() == '')
