# Data cleaning

## Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes; edits to project helper modules are then
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from pathlib import Path
import sys

# The notebook is expected to run from a sub-directory of the project, so the
# parent of the current working directory is treated as the project root.
# Putting it on sys.path makes the project's own packages importable below.
project_root = Path.cwd().parent
sys.path.append(os.fspath(project_root))

## Cleaning referrals

In [None]:
from utils.data_cleaning_helpers import clean_na_variations, remove_duplicate_sentences_per_case

# Load the raw, depersonalised case export. The path is relative to the
# notebook's working directory.
raw_cases_csv = Path("../data/raw/BRC-Data") / "Cases_depersonalised.csv"
brc_referrals_raw = pd.read_csv(raw_cases_csv)

In [None]:
# Free-text column that every cleaning step below operates on.
notes_col = 'Referral Notes (depersonalised)'

print("Original data shape:", brc_referrals_raw.shape)
print("Columns:", brc_referrals_raw.columns.tolist())

# 1. Remove rows that have no referral note at all
print(f"\nOriginal referral notes missing values: {brc_referrals_raw[notes_col].isnull().sum()}")

# .copy() so the column assignment below works on an independent frame and
# leaves brc_referrals_raw untouched.
brc_cleaned = brc_referrals_raw.dropna(subset=[notes_col]).copy()
print(f"After removing missing referral notes: {brc_cleaned.shape}")

# 2. Apply NA cleaning to referral notes
# (clean_na_variations is a project helper; presumably it maps "N/A"-style
# placeholder text to NaN -- confirm in utils.data_cleaning_helpers)
brc_cleaned[notes_col] = brc_cleaned[notes_col].apply(clean_na_variations)

# Remove rows where referral notes became NaN after cleaning
before_na_clean = brc_cleaned.shape[0]
brc_cleaned = brc_cleaned.dropna(subset=[notes_col])
print(f"After removing NA variations: {brc_cleaned.shape} (removed {before_na_clean - brc_cleaned.shape[0]} rows) \n")

# 3. Remove duplicate sentences within multiple notes corresponding to a single Case Reference
# As done in Keloth et al. (2025)
brc_final = remove_duplicate_sentences_per_case(brc_cleaned)
print(f"\nFinal cleaned data shape: {brc_final.shape}")
print(f"Number of unique referral notes: {brc_final[notes_col].nunique()}")

# Save the cleaned data. The processed-data directory may not exist on a fresh
# checkout, and to_csv does not create missing parent directories, so make
# sure it is there first.
output_path = Path("../data/processed/BRC_referrals_cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
brc_final.to_csv(output_path, index=False)
print(f"\nCleaned data saved to: {output_path}")

Original data shape: (181085, 19)
Columns: ['Area', 'Scheme', 'Case Reference', 'Assessment Result', 'Case Status', 'Referral Date/Time', 'End Date Case', 'Has Disability', 'Has Risk', 'Risk Type', 'Unique Case', 'IMD_decile', 'Country', 'Age', 'Gender', 'Ethnicity', 'Disability', 'Living Arrangements', 'Referral Notes (depersonalised)']

Original referral notes missing values: 30621
After removing missing referral notes: (150464, 19)
After removing NA variations: (150464, 19) (removed 0 rows) 

Removing duplicate sentences within cases...
Original sentences: 340393
Unique sentences: 182168
Sentences removed: 158225

Final cleaned data shape: (99560, 19)
Number of unique referral notes: 51658

Cleaned data saved to: ../data/processed/BRC_referrals_cleaned.csv


## Joining with outcomes datasets

In [3]:
# Inputs for the join: the cleaned referrals written by the cleaning step
# above, plus the two raw outcome exports (HIU and SNAP). Paths are relative
# to the notebook's working directory.
raw_brc_dir = Path("../data/raw/BRC-Data")

referrals_cleaned = pd.read_csv(Path("../data/processed") / "BRC_referrals_cleaned.csv")
hiu_raw = pd.read_csv(raw_brc_dir / "HIU_depersonalised.csv")
snap_raw = pd.read_csv(raw_brc_dir / "SNAP_depersonalised.csv")

### Check overlap

In [4]:
# Standardize case reference columns so all three datasets share a `case_ref` key
referrals_cleaned['case_ref'] = referrals_cleaned['Case Reference']

# The HIU export holds only the part of the reference that follows "CAS-".
# Converting to str can leave a float-style ".0" suffix (presumably because the
# column was parsed as float), so strip it -- but only at the end of the value,
# not wherever ".0" happens to occur. Missing case numbers are kept as NaN
# rather than turned into the literal key "CAS-nan", so that the dropna()
# below actually excludes them.
hiu_case_numbers = hiu_raw['Q2.1. CAS-']
hiu_raw['case_ref'] = (
    'CAS-' + hiu_case_numbers.astype(str).str.replace(r'\.0$', '', regex=True)
).where(hiu_case_numbers.notna())

snap_raw['case_ref'] = snap_raw['BRM case number:']

# Get case reference sets (unique, non-missing keys per dataset)
ref_cases = set(referrals_cleaned['case_ref'].dropna())
hiu_cases = set(hiu_raw['case_ref'].dropna())
snap_cases = set(snap_raw['case_ref'].dropna())

# Print overlap counts
print("Dataset Sizes:")
print(f"Referrals: {len(ref_cases):,}")
print(f"HIU: {len(hiu_cases):,}")
print(f"SNAP: {len(snap_cases):,}")

print("\nOverlaps:")
print(f"Referrals ∩ HIU: {len(ref_cases & hiu_cases):,}")
print(f"Referrals ∩ SNAP: {len(ref_cases & snap_cases):,}")
print(f"HIU ∩ SNAP: {len(hiu_cases & snap_cases):,}")
print(f"All three: {len(ref_cases & hiu_cases & snap_cases):,}")

Dataset Sizes:
Referrals: 99,560
HIU: 1,430
SNAP: 1,305

Overlaps:
Referrals ∩ HIU: 1,110
Referrals ∩ SNAP: 1,096
HIU ∩ SNAP: 0
All three: 0


### Merge datasets

In [5]:
# Prepare working copies with a standardized `case_ref` key, leaving the frames
# loaded from disk untouched.
referrals = referrals_cleaned.copy()
referrals['case_ref'] = referrals['Case Reference']

# HIU rows without a case number cannot be linked to a referral; drop them
# instead of giving them the placeholder key "CAS-nan" (what str(NaN) would
# produce).
hiu = hiu_raw.dropna(subset=['Q2.1. CAS-']).copy()
# The HIU export holds only the part of the reference that follows "CAS-".
# Strip a float-style ".0" suffix left by the str conversion -- anchored to the
# end of the value so a ".0" elsewhere in the string is never removed.
hiu['case_ref'] = 'CAS-' + hiu['Q2.1. CAS-'].astype(str).str.replace(r'\.0$', '', regex=True)

snap = snap_raw.copy()
snap['case_ref'] = snap['BRM case number:']

In [6]:
# Set case_ref as index for merging.
# Guarded so the cell can be re-run: once a frame is already indexed by
# case_ref the column no longer exists and an unconditional set_index would
# raise KeyError.
for frame in (referrals, hiu, snap):
    if frame.index.name != 'case_ref':
        frame.set_index('case_ref', inplace=True)

# Merge HIU and SNAP into referrals dataset.
# NOTE: these are left joins on case_ref. If HIU or SNAP holds several rows for
# the same case_ref, the referral row is repeated once per matching row, so
# `merged` can contain more rows than `referrals`.
merged = (
    referrals
    .join(hiu, how='left', rsuffix='_hiu')
    .join(snap, how='left', rsuffix='_snap')
)

# Create dataset source indicator variables
merged['has_referral'] = True  # All cases have referral data since we're using left join
# A non-null value in a column that only the HIU / SNAP frame provides means
# the join found a matching row in that dataset.
merged['has_hiu'] = merged['Q2.1. CAS-'].notna()
merged['has_snap'] = merged['BRM case number:'].notna()

# Create combined dataset source variable, e.g. "Referral", "Referral+HIU"
merged['dataset_sources'] = (
    merged['has_referral'].astype(str).replace({'True': 'Referral', 'False': ''})
    + merged['has_hiu'].apply(lambda x: '+HIU' if x else '')
    + merged['has_snap'].apply(lambda x: '+SNAP' if x else '')
)

# Reset index to get case_ref back as column
merged = merged.reset_index()

# Save the merged dataset. to_csv does not create missing parent directories,
# so make sure the output directory exists first.
output_merged_path = Path("../data/processed/BRC_referrals_merged.csv")
output_merged_path.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(output_merged_path, index=False)
print(f"\nMerged dataset saved to: {output_merged_path}")


Merged dataset saved to: ../data/processed/BRC_referrals_merged.csv


In [7]:
# Summarise the merged frame: overall shape, how many rows fall into each
# combination of sources, and how many rows carry each source flag.
# Counts are per row of `merged`.
print(f"Merged dataset shape: {merged.shape}")

print("\nDataset source combinations:")
source_counts = merged['dataset_sources'].value_counts()
print(source_counts)

print("\nData availability:")
for label, flag_col in (
    ("Has referral data", 'has_referral'),
    ("Has HIU data", 'has_hiu'),
    ("Has SNAP data", 'has_snap'),
):
    print(f"{label}: {merged[flag_col].sum():,}")

Merged dataset shape: (100882, 182)

Dataset source combinations:
dataset_sources
Referral         97354
Referral+HIU      1880
Referral+SNAP     1648
Name: count, dtype: int64

Data availability:
Has referral data: 100,882
Has HIU data: 1,880
Has SNAP data: 1,648
