# Data cleaning

## Setup

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os
import pandas as pd
from pathlib import Path
import sys

# Add the project root to the Python path so project-local modules can be imported.
# Path() is the current working directory, so this assumes the notebook is run
# from a folder that sits directly under the project root.
project_root = Path().absolute().parent

# Register the path only once: re-running this cell must not pile up duplicate
# sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## Cleaning

In [8]:
from utils.data_cleaning import clean_na_variations, remove_duplicate_sentences_per_case

# Load the raw depersonalised case export; the path is resolved relative to
# the notebook's working directory.
raw_cases_csv = "../data/raw/BRC-Data/Cases_depersonalised.csv"
brc_referrals_raw = pd.read_csv(raw_cases_csv)

In [11]:
# Clean the raw referral records and write the result to CSV.
# Relies on `brc_referrals_raw`, `clean_na_variations` and
# `remove_duplicate_sentences_per_case` being defined by earlier cells.

# Single source of truth for the free-text column used throughout this cell.
notes_col = 'Referral Notes (depersonalised)'

print("Original data shape:", brc_referrals_raw.shape)
print("Columns:", brc_referrals_raw.columns.tolist())

# 1. Remove missing values for referral notes
print(f"\nOriginal referral notes missing values: {brc_referrals_raw[notes_col].isnull().sum()}")

# .copy() gives an independent frame, so the column assignment below does not
# write into a view of the raw data.
brc_cleaned = brc_referrals_raw.dropna(subset=[notes_col]).copy()
print(f"After removing missing referral notes: {brc_cleaned.shape}")

# 2. Apply NA cleaning to referral notes
# NOTE(review): clean_na_variations presumably maps NA-like placeholder text to
# NaN -- confirm in utils.data_cleaning.
brc_cleaned[notes_col] = brc_cleaned[notes_col].apply(clean_na_variations)

# Remove rows where referral notes became NaN after cleaning
before_na_clean = brc_cleaned.shape[0]
brc_cleaned = brc_cleaned.dropna(subset=[notes_col])
print(f"After removing NA variations: {brc_cleaned.shape} (removed {before_na_clean - brc_cleaned.shape[0]} rows) \n")

# 3. Remove duplicate sentences within multiple notes corresponding to a single Case Reference
# As done in Keloth et al. (2025)
brc_final = remove_duplicate_sentences_per_case(brc_cleaned)
print(f"\nFinal cleaned data shape: {brc_final.shape}")
print(f"Number of unique referral notes: {brc_final[notes_col].nunique()}")

# Save the cleaned data
output_path = Path("../data/processed/BRC_referrals_cleaned.csv")
# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout can run this cell without a manual mkdir.
output_path.parent.mkdir(parents=True, exist_ok=True)
brc_final.to_csv(output_path, index=False)
print(f"\nCleaned data saved to: {output_path}")


Original data shape: (181085, 19)
Columns: ['Area', 'Scheme', 'Case Reference', 'Assessment Result', 'Case Status', 'Referral Date/Time', 'End Date Case', 'Has Disability', 'Has Risk', 'Risk Type', 'Unique Case', 'IMD_decile', 'Country', 'Age', 'Gender', 'Ethnicity', 'Disability', 'Living Arrangements', 'Referral Notes (depersonalised)']

Original referral notes missing values: 30621
After removing missing referral notes: (150464, 19)
After removing NA variations: (150464, 19) (removed 0 rows) 

Removing duplicate sentences within cases...
Original sentences: 340393
Unique sentences: 182168
Sentences removed: 158225

Final cleaned data shape: (99560, 19)
Number of unique referral notes: 51658

Cleaned data saved to: ../data/processed/BRC_referrals_cleaned.csv
