# Data Cleaning Pipeline
Remove hashtags, mentions, and URLs from the dataset text to prevent data leakage and reduce noise.

In [1]:
import pandas as pd
import re
import os

def clean_text(text):
    """Strip hashtags, mentions and URLs from *text* and tidy its whitespace.

    Non-string values (for example ``None`` or a float NaN coming from a
    DataFrame column) are handed back unchanged.

    Args:
        text: The value to clean; only ``str`` instances are modified.

    Returns:
        The cleaned string with runs of whitespace collapsed to one space
        and leading/trailing whitespace removed, or the original value
        when it is not a string.
    """
    if isinstance(text, str):
        # Applied in this order: hashtags, then mentions, then URLs.
        removal_patterns = (
            r'#\w+',
            r'@\w+',
            r'http\S+|www\.\S+',
        )
        cleaned = text
        for pattern in removal_patterns:
            cleaned = re.sub(pattern, '', cleaned)
        # Removing tokens leaves gaps behind; squeeze them into single spaces.
        collapsed = re.sub(r'\s+', ' ', cleaned)
        return collapsed.strip()
    return text

def clean_dataset(input_path, output_path, *, text_column='text'):
    """Clean one text column of a CSV file and write the result to a new CSV.

    The CSV at *input_path* is loaded with pandas, ``clean_text`` is applied
    to every value of *text_column*, rows whose text ends up empty or missing
    are dropped, and the remaining rows are written to *output_path* without
    the index. Progress is reported with ``print``.

    Nothing is written when the input file does not exist or when
    *text_column* is not among the CSV's columns; in both cases a message is
    printed and the function returns early.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the cleaned CSV is written to.
        text_column: Name of the column holding the text to clean. Defaults
            to ``'text'``, which was previously the only supported name.

    Returns:
        None.
    """
    if not os.path.exists(input_path):
        print(f'File {input_path} not found.')
        return

    print(f'Loading {input_path}...')
    df = pd.read_csv(input_path)

    print(f'Original shape: {df.shape}')

    if text_column not in df.columns:
        print(f"Column '{text_column}' not found in the dataset.")
        # List what is there so the caller can pick the right text_column.
        print(f'Available columns: {list(df.columns)}')
        return

    df[text_column] = df[text_column].apply(clean_text)

    # Remove rows where text became empty. astype(str) turns NaN into the
    # string 'nan', so missing values survive this mask and are removed by
    # the dropna that follows.
    df = df[df[text_column].astype(str).str.strip() != '']
    df = df.dropna(subset=[text_column])
    print(f'Cleaned shape: {df.shape}')

    df.to_csv(output_path, index=False)
    print(f'Saved cleaned dataset to {output_path}')

if __name__ == '__main__':
    # First input/output pair.
    input_csv, output_csv = (
        'Data/sarcasm_hinghlish_dataset.csv',
        'Data/sarcasm_hinghlish_dataset_cleaned.csv',
    )
    clean_dataset(input_csv, output_csv)

    # Second input/output pair.
    # NOTE(review): the recorded output of this cell reports that this file
    # has no 'text' column, so no cleaned copy was written for it.
    input_csv2, output_csv2 = (
        'Data/mlt_hinghlish_dataset.csv',
        'Data/mlt_hinghlish_dataset_cleaned.csv',
    )
    clean_dataset(input_csv2, output_csv2)


Loading Data/sarcasm_hinghlish_dataset.csv...
Original shape: (9593, 2)
Cleaned shape: (9593, 2)
Saved cleaned dataset to Data/sarcasm_hinghlish_dataset_cleaned.csv
Loading Data/mlt_hinghlish_dataset.csv...
Original shape: (30000, 2)
Column 'text' not found in the dataset.
