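Clean the HASOC 2019 English dataset: strip Twitter handles, URLs, and special characters from the text column, then save the cleaned train and test splits.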

In [1]:
import pandas as pd
import re
import os

from tqdm import tqdm

# Register tqdm with pandas so that Series.progress_apply() is available;
# the cleaning cell below calls progress_apply(), so this call is required.
tqdm.pandas()

import sys
from pathlib import Path

# Make the project's src/ directory importable. The path is resolved relative
# to the current working directory (its parent / "src"), so this assumes the
# notebook is run from a folder that sits next to src/ -- TODO confirm.
# Note: every re-run of this cell appends the same entry to sys.path again.
sys.path.append(str(Path().resolve().parent / "src"))

# Project-local module (found via the sys.path entry added above) that
# provides the input and output data directories used by this notebook.
from paths import DATA_PROCESSED
from paths import DATA_CLEANED

Load the processed train and test splits

In [2]:
# Read the tab-separated train and test splits from the processed-data folder.
train_path = DATA_PROCESSED / "hasoc_2019_en_train.tsv"
test_path = DATA_PROCESSED / "hasoc_2019_en_test.tsv"

df_train = pd.read_csv(train_path, sep='\t')
df_test = pd.read_csv(test_path, sep='\t')

Helper function that cleans a single text entry

In [3]:
def clean_text(text) -> str:
    """Normalize one raw text entry into lowercase, whitespace-collapsed text.

    The steps run in this order: remove @handles, remove URLs, remove '#'
    characters (the hashtag word itself is kept), remove every character that
    is not a word character, whitespace, or one of ``. , ! ?``, lowercase the
    result, and collapse runs of whitespace into single spaces.

    Args:
        text: The raw value. Non-string values are converted with ``str()``.
            Missing values (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing values would otherwise be stringified by str() into the literal
    # tokens "nan" / "none" and end up in the cleaned corpus as real words.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    text = str(text)

    # 1. Remove Twitter handles (@usernames)
    text = re.sub(r'@\w+', '', text)

    # 2. Remove URLs. The "www" branch is anchored to a word boundary and a
    #    literal dot so that ordinary words containing "www" (e.g. "awwwww")
    #    are not truncated. The "http" branch is intentionally unanchored so
    #    that URLs glued to the preceding word are still removed.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)

    # 3. Remove the '#' sign but keep the hashtag text
    text = re.sub(r'#', '', text)

    # 4. Remove special characters except basic punctuation (. , ! ?)
    text = re.sub(r'[^\w\s.,!?]', '', text)

    # 5. Convert to lowercase
    text = text.lower()

    # 6. Collapse consecutive whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
# Clean the text column of both splits (train first, then test), showing a
# tqdm progress bar for each; the column is overwritten with the cleaned text.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].progress_apply(clean_text)

100%|██████████| 5852/5852 [00:00<00:00, 29064.75it/s]
100%|██████████| 1153/1153 [00:00<00:00, 27108.72it/s]


Save cleaned dataset

In [5]:
# Write the cleaned training split as a tab-separated file without the index,
# creating the destination folder first if it does not exist yet.
output_path = DATA_CLEANED / "hasoc_2019_en_train_cleaned.tsv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df_train.to_csv(output_path, sep='\t', index=False)
print(f"Cleaned file saved to: {output_path}")

Cleaned file saved to: C:\Users\JuliusAdmin\Documents\GitHub\HateSpeechDetection\data\cleaned\hasoc_2019_en_train_cleaned.tsv


In [6]:
# Write the cleaned test split as a tab-separated file without the index,
# creating the destination folder first if it does not exist yet.
output_path = DATA_CLEANED / "hasoc_2019_en_test_cleaned.tsv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df_test.to_csv(output_path, sep='\t', index=False)
print(f"Cleaned file saved to: {output_path}")

Cleaned file saved to: C:\Users\JuliusAdmin\Documents\GitHub\HateSpeechDetection\data\cleaned\hasoc_2019_en_test_cleaned.tsv
