In [1]:
### 

In [6]:
# Jupyter cell: Cyberbullying dataset cleaning pipeline

import logging
import re
import uuid
from difflib import get_close_matches
from pathlib import Path

import pandas as pd
import spacy

# Configure the root logger at INFO level so the pipeline's progress messages
# are displayed in the notebook output.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after the current module (``__main__`` when this
# code runs as a notebook cell, which is the prefix visible in the log lines).
logger = logging.getLogger(__name__)

class DataCleaner:
    """
    Cleaner for the label and type columns of a cyberbullying dataset.

    Raw cell values are normalized (lower-cased, lemmatized when the pipeline
    provides lemmas, reduced to the letters a-z) and then mapped to canonical
    names through exact lookup, simple rules and, as a last resort, fuzzy
    matching with ``difflib.get_close_matches``.

    Instance attributes:
        LABEL_CANONICAL: normalized label text -> canonical label.
        TYPE_CANONICAL: normalized type text (including known misspellings)
            -> canonical type.
        nlp: callable that turns a string into an iterable of tokens exposing
            ``is_space``, ``is_punct``, ``lemma_`` and ``text``.
    """

    # Whole words that turn a "bully..." label into the negative class.
    NEGATION_WORDS = ("not", "no", "non")
    # Minimum difflib similarity ratio accepted by the fuzzy fallbacks.
    LABEL_FUZZY_CUTOFF = 0.6
    TYPE_FUZZY_CUTOFF = 0.55

    # Same logger object as a module-level ``logging.getLogger(__name__)``;
    # held on the class so the class does not depend on a global name.
    _log = logging.getLogger(__name__)

    def __init__(self, spacy_model: str = "en_core_web_sm", nlp=None):
        """
        Build the canonical lookup tables and set up the text pipeline.

        Args:
            spacy_model: Name of the spaCy model to load when ``nlp`` is not
                supplied.
            nlp: Optional ready-made pipeline (any callable returning tokens
                with ``is_space``, ``is_punct``, ``lemma_`` and ``text``).
                When given, no spaCy model is loaded.
        """
        self.LABEL_CANONICAL = {
            "bullying": "Bullying",
            "not bullying": "Not-Bullying",
        }

        self.TYPE_CANONICAL = {
            "ethnicity": "Ethnicity",
            "ethnic": "Ethnicity",
            "ethnically": "Ethnicity",
            "sexual": "Sexual",
            "saxual": "Sexual",
            "sex": "Sexual",
            "religion": "Religion",
            "religious": "Religion",
            "relgion": "Religion",
            "threat": "Threats",
            "threats": "Threats",
            "troll": "Troll",
            "vocational": "Vocational",
            "vocation": "Vocational",
            "political": "Political",
            "racism": "Racism",
        }

        if nlp is not None:
            self.nlp = nlp
            return

        try:
            self.nlp = spacy.load(spacy_model)
            self._log.info(f"Loaded spaCy model: {spacy_model}")
        except OSError:
            # Model could not be loaded: fall back to a blank English pipeline.
            # normalize_with_spacy copes with empty lemmas by using the raw
            # token text instead.
            self.nlp = spacy.blank("en")
            self._log.warning("Using blank spaCy model for language: en")

    def normalize_with_spacy(self, text: str):
        """
        Reduce a raw cell value to a lower-case, space-separated string of a-z words.

        Args:
            text: Raw cell value; anything for which ``pd.isna`` is true counts
                as missing. Other values are converted with ``str``.

        Returns:
            The normalized string, or ``None`` when the value is missing or
            nothing but non-letters remains.
        """
        if pd.isna(text):
            return None

        words = []
        for token in self.nlp(str(text)):
            if token.is_space or token.is_punct:
                continue
            # Pipelines without a lemmatizer leave lemma_ empty.
            word = (token.lemma_ or token.text).lower().strip()
            if word:
                words.append(word)

        # Every character outside a-z (hyphens, digits, underscores, ...) acts
        # as a word separator; runs of whitespace are then collapsed.
        letters_only = re.sub(r"[^a-z\s]", " ", " ".join(words))
        normalized = re.sub(r"\s+", " ", letters_only).strip()
        return normalized or None

    def _has_negation(self, normalized):
        """Return True when a whole word (or a fused prefix such as "nonbullying") negates the label."""
        fused_forms = tuple(word + "bully" for word in self.NEGATION_WORDS)
        return any(
            token in self.NEGATION_WORDS or token.startswith(fused_forms)
            for token in normalized.split()
        )

    def clean_label(self, value):
        """
        Map a raw label to ``"Bullying"`` / ``"Not-Bullying"``.

        Resolution order: exact match on the normalized text, then any text
        containing "bully" (negative when it also contains a negation word),
        then a fuzzy match against the known labels.

        Negation is matched on whole words. A plain substring test would treat
        e.g. "unknown" or "annoying" as containing "no" and flip the label.

        Returns:
            The canonical label, or ``pd.NA`` when the value cannot be resolved.
        """
        normalized = self.normalize_with_spacy(value)
        if normalized is None:
            return pd.NA
        if normalized in self.LABEL_CANONICAL:
            return self.LABEL_CANONICAL[normalized]
        if "bully" in normalized:
            key = "not bullying" if self._has_negation(normalized) else "bullying"
            return self.LABEL_CANONICAL[key]
        match = get_close_matches(
            normalized, self.LABEL_CANONICAL.keys(), n=1, cutoff=self.LABEL_FUZZY_CUTOFF
        )
        if match:
            return self.LABEL_CANONICAL[match[0]]
        return pd.NA

    def clean_type(self, value):
        """
        Map a raw bullying type to its canonical category name.

        Resolution order: exact match, the same text without a trailing "s",
        then a fuzzy match against the known spellings.

        Returns:
            The canonical type; the title-cased normalized text when nothing
            matches; ``pd.NA`` when the value is missing or has no letters.
        """
        normalized = self.normalize_with_spacy(value)
        if normalized is None:
            return pd.NA
        if normalized in self.TYPE_CANONICAL:
            return self.TYPE_CANONICAL[normalized]
        if normalized.endswith("s") and normalized[:-1] in self.TYPE_CANONICAL:
            return self.TYPE_CANONICAL[normalized[:-1]]
        match = get_close_matches(
            normalized, self.TYPE_CANONICAL.keys(), n=1, cutoff=self.TYPE_FUZZY_CUTOFF
        )
        if match:
            return self.TYPE_CANONICAL[match[0]]
        return normalized.title()

    def remove_duplicates(self, df, text_column="Text", keep="first"):
        """
        Drop rows whose ``text_column`` value repeats an earlier row.

        Args:
            df: Input frame (not modified).
            text_column: Column used to detect duplicates.
            keep: Passed to ``DataFrame.drop_duplicates``.

        Returns:
            A new frame without the duplicates and with a fresh 0..n-1 index.
        """
        initial_count = len(df)
        cleaned_df = df.drop_duplicates(subset=[text_column], keep=keep).reset_index(drop=True)
        duplicates_removed = initial_count - len(cleaned_df)
        if duplicates_removed > 0:
            self._log.info(f" Removed {duplicates_removed} duplicate rows (kept='{keep}')")
            self._log.info(f"   Before: {initial_count} rows → After: {len(cleaned_df)} rows")
        else:
            self._log.info("No duplicate rows found")
        return cleaned_df

    @staticmethod
    def _apply_cached(series, func):
        """
        Apply ``func`` to every cell of ``series``, computing each distinct value once.

        The label and type columns hold only a few distinct values, so this
        avoids running the text pipeline once per row. Keys include the value's
        type so that e.g. ``1`` and ``True`` are not merged.
        """
        cache = {}

        def lookup(value):
            key = (type(value), value)
            try:
                return cache[key]
            except KeyError:
                pass
            except TypeError:
                # Unhashable cell: compute without caching.
                return func(value)
            result = cache[key] = func(value)
            return result

        return series.apply(lookup)

    def clean_dataset(self, df, text_column="Text", label_column="Label", type_column="Types", remove_duplicates=True):
        """
        Run the full cleaning pipeline on a copy of ``df``.

        Steps: optional de-duplication on ``text_column``, label normalization,
        type normalization, blanking the type of "Not-Bullying" rows, and
        removal of rows whose label could not be resolved.

        Args:
            df: Raw dataset (not modified).
            text_column: Column holding the post text (only needed when
                ``remove_duplicates`` is true).
            label_column: Column holding the bullying / not-bullying label.
            type_column: Column holding the bullying category.
            remove_duplicates: Whether to drop rows with duplicated text first.

        Returns:
            The cleaned frame with a fresh 0..n-1 index.

        Raises:
            KeyError: If a required column is missing from ``df``.
        """
        required = [label_column, type_column]
        if remove_duplicates:
            required.append(text_column)
        missing = [column for column in required if column not in df.columns]
        if missing:
            raise KeyError(f"Missing column(s) in dataset: {missing}")

        self._log.info("Starting dataset cleaning pipeline...")
        cleaned_df = df.copy()
        if remove_duplicates:
            self._log.info("Removing duplicate rows...")
            cleaned_df = self.remove_duplicates(cleaned_df, text_column=text_column)

        self._log.info("Cleaning label column...")
        cleaned_df[label_column] = self._apply_cached(cleaned_df[label_column], self.clean_label)
        self._log.info("Cleaning type column...")
        cleaned_df[type_column] = self._apply_cached(cleaned_df[type_column], self.clean_type)

        # A non-bullying post has no bullying category.
        mask = cleaned_df[label_column] == "Not-Bullying"
        cleaned_df.loc[mask, type_column] = pd.NA

        # Rows whose label could not be resolved are unusable downstream.
        initial_count = len(cleaned_df)
        cleaned_df = cleaned_df[cleaned_df[label_column].notna()].reset_index(drop=True)
        final_count = len(cleaned_df)
        self._log.info(f"Cleaning complete. Removed {initial_count - final_count} invalid rows.")
        self._log.info(f"Final dataset: {final_count} rows")
        return cleaned_df

# Location of the raw export, relative to the notebook's working directory.
csv_path = "../data/raw/Approach to Social Media Cyberbullying and Harassment Detection Using Advanced Machine Learning.csv"

# Keep the untouched source rows in raw_df so later cells can compare
# before/after counts.
raw_df = pd.read_csv(csv_path)

# Build the cleaner once, then run the whole pipeline with explicit column names.
cleaner = DataCleaner()
cleaned_df = cleaner.clean_dataset(
    raw_df,
    text_column="Text",
    label_column="Label",
    type_column="Types",
)

# Preview the first rows of the cleaned result.
cleaned_df.head()


INFO:__main__:Loaded spaCy model: en_core_web_sm
INFO:__main__:Starting dataset cleaning pipeline...
INFO:__main__:Removing duplicate rows...
INFO:__main__:Loaded spaCy model: en_core_web_sm
INFO:__main__:Starting dataset cleaning pipeline...
INFO:__main__:Removing duplicate rows...
INFO:__main__: Removed 2472 duplicate rows (kept='first')
INFO:__main__:   Before: 8452 rows → After: 5980 rows
INFO:__main__:Cleaning label column...
INFO:__main__: Removed 2472 duplicate rows (kept='first')
INFO:__main__:   Before: 8452 rows → After: 5980 rows
INFO:__main__:Cleaning label column...
INFO:__main__:Cleaning type column...
INFO:__main__:Cleaning type column...
INFO:__main__:Cleaning complete. Removed 2 invalid rows.
INFO:__main__:Final dataset: 5978 rows
INFO:__main__:Cleaning complete. Removed 2 invalid rows.
INFO:__main__:Final dataset: 5978 rows


Unnamed: 0,Text,Label,Types
0,Ten outside soon doctor shake everyone treatme...,Not-Bullying,
1,my life has come to a standstill and at this p...,Not-Bullying,
2,girl this nigga make me sick to my stomach,Bullying,Ethnicity
3,I wanna fuck you,Bullying,Sexual
4,"Oh hey, you should be ashamed of your disgusti...",Not-Bullying,


In [1]:
pwd  # IPython automagic (%pwd): shows the working directory that the relative ../data paths resolve against

'/home/iyedpc1/TEST/FinalFinal/notebooks'

In [5]:
# Persist the cleaned dataset. The target directory is created first because
# DataFrame.to_csv does not create missing parent folders, so this cell would
# otherwise fail on a fresh checkout.
output_path = Path("../data/clean/posts.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
cleaned_df.to_csv(output_path, index=False)

# 📊 Pipeline Statistics

Let's compare the row counts before and after cleaning, then look at the label and bullying-type distributions of the cleaned dataset:

In [7]:
# Row counts before and after the cleaning pipeline.
raw_rows, clean_rows = len(raw_df), len(cleaned_df)
print(f"Original dataset: {raw_rows} rows")
print(f"Cleaned dataset: {clean_rows} rows")
print(f"Rows removed: {raw_rows - clean_rows}")

# Columns that survived cleaning.
print(f"\nColumns: {cleaned_df.columns.tolist()}")

# Frequency of each label, then of each bullying type.
print("\nLabel distribution:")
print(cleaned_df["Label"].value_counts())
print("\nType distribution:")
print(cleaned_df["Types"].value_counts())

Original dataset: 8452 rows
Cleaned dataset: 5978 rows
Rows removed: 2474

Columns: ['Text', 'Label', 'Types']

Label distribution:
Label
Bullying        3282
Not-Bullying    2696
Name: count, dtype: int64

Type distribution:
Types
Troll         825
Sexual        682
Vocational    488
Political     484
Religion      428
Threats       216
Ethnicity     132
Racism          2
Name: count, dtype: int64
