# Data cleaning & EDA

## 0. Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from pathlib import Path
import sys

# Add the project root to the Python path to import the modules
project_root = Path().absolute().parent
sys.path.append(str(project_root))

## 1. Pre-cleaning data

In [3]:
# Load and standardize case references
datasets = {
    'referrals': pd.read_csv("../data/raw/BRC-Data/Cases_depersonalised.csv"),
    'hiu': pd.read_csv("../data/raw/BRC-Data/HIU_depersonalised.csv"),
    'snap': pd.read_csv("../data/raw/BRC-Data/SNAP_depersonalised.csv")
}

# Standardize case reference columns
datasets['referrals']['case_ref'] = datasets['referrals']['Case Reference']
datasets['hiu']['case_ref'] = 'CAS-' + datasets['hiu']['Q2.1. CAS-'].astype(str).str.replace('.0', '', regex=False)
datasets['snap']['case_ref'] = datasets['snap']['BRM case number:']

In [6]:
print("=== DUPLICATE ANALYSIS BEFORE MERGING ===")

for name, df in datasets.items():
    print(f"\n{name.upper()} Dataset:")
    print(f"  Total rows: {len(df):,}")
    print(f"  Unique case_ref: {df['case_ref'].nunique():,}")
    
    # Define columns to exclude from duplication analysis (depersonalized/randomized)
    if name == 'referrals':
        # For referrals: only consider case_ref and referral notes
        analysis_cols = ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']
        available_cols = [col for col in analysis_cols if col in df.columns]
        print(f"  Analysing duplicates based on: {available_cols}")
    elif name == 'snap':
        # Exclude: Has Disability, IMD Decile, Country, Age, Gender, Ethnicity, Living Arrangements
        exclude_cols = ['Has Disability', 'IMD Decile', 'Country', 'Age', 'Gender', 'Ethnicity', 'Living Arrangements']
        available_cols = [col for col in df.columns if col not in exclude_cols]
        print(f"  Analysing duplicates excluding {len(exclude_cols)} depersonalised columns")
    elif name == 'hiu':
        # Exclude: Age, Gender, Ethnicity, Living Arrangements
        exclude_cols = ['Age', 'Gender', 'Ethnicity', 'Living Arrangements']
        available_cols = [col for col in df.columns if col not in exclude_cols]
        print(f"  Analysing duplicates excluding {len(exclude_cols)} depersonalised columns")
    
    # Check for perfect duplicates based on relevant columns only
    perfect_duplicates = df.duplicated(subset=available_cols).sum()
    print(f"  Perfect duplicates (relevant columns): {perfect_duplicates:,}")
    
    # Check for duplicates by case_ref only
    case_ref_duplicates = df['case_ref'].duplicated().sum()
    print(f"  Duplicate case_ref: {case_ref_duplicates:,}")
    
    if case_ref_duplicates > 0:
        # Show examples of duplicate case_ref: Show top 5 by count
        duplicate_cases = df[df['case_ref'].duplicated(keep=False)]['case_ref'].value_counts().head(5)
        print(f"  Top 5 duplicate case_ref by count:")
        for case_ref, count in duplicate_cases.items():
            print(f"    {case_ref}: {count} rows")
        
        # Check if duplicate case_ref have identical relevant data - CHANGED: Use highest count case
        highest_count_case = duplicate_cases.index[0]  # This is now the case with most duplicates
        duplicate_rows = df[df['case_ref'] == highest_count_case]
        
        # Check duplicates based on relevant columns only
        duplicate_subset = duplicate_rows[available_cols]
        identical_duplicates = duplicate_subset.duplicated().sum()
        
        print(f"    For case {highest_count_case}: {identical_duplicates}/{len(duplicate_rows)-1} duplicates are identical (relevant data)")
        
        # Show what's different in duplicate rows (if any) - only relevant columns
        if identical_duplicates < len(duplicate_rows) - 1:
            print(f"    Non-identical relevant columns for {highest_count_case}:")
            for col in available_cols:
                if col in duplicate_rows.columns:
                    unique_vals = duplicate_rows[col].nunique()
                    if unique_vals > 1:
                        values = duplicate_rows[col].tolist()
                        # For referral notes, show preview
                        if col == 'Referral Notes (depersonalised)':
                            values = [str(v)[:50] + "..." if len(str(v)) > 50 else str(v) for v in values]
                        print(f"      {col}: {values}")

=== DUPLICATE ANALYSIS BEFORE MERGING ===

REFERRALS Dataset:
  Total rows: 181,085
  Unique case_ref: 126,717
  Analysing duplicates based on: ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']
  Perfect duplicates (relevant columns): 52,146
  Duplicate case_ref: 54,368
  Top 5 duplicate case_ref by count:
    CAS-537284: 60 rows
    CAS-530729: 45 rows
    CAS-565892: 42 rows
    CAS-592629: 42 rows
    CAS-537229: 40 rows
    For case CAS-537284: 45/59 duplicates are identical (relevant data)
    Non-identical relevant columns for CAS-537284:
      Referral Notes (depersonalised): ['XXXX hazard', '4.Fall', '4.Trips and falls', '4.Trips and falls', 'XXXX hazard', '4.Fall', '4.Trips and falls', 'XXXX hazard', '4.Trips and falls', 'XXXX of benefits', '3.Person who client claimed kidnapped her is now o...', 'Distress', 'XXXX of benefits', 'XXXX hazard', '3.Person who client claimed kidnapped her is now o...', 'Distress', 'abusive behaviour', 'XXXX of benefits', '4.Fal

### 1.1 Pre-cleaning referrals

I start with the pre-cleaning of referrals.

1. I remove identical rows based on: ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']. Those are pure duplicates (other columns might be different due to the depersonalisation).

Then, for each CAS which has multiple rows (most likely on different dates):

2. I create two columns: one with the number of observations, and one with the date range of those observations;
3. Then, if one or some of the rows have a non-NA referral, I remove the rows which don't. 
4. Finally, if all observations have the same referrals, or if none of the observations have a referral, I only keep the most recent row (the columns created earlier will keep the other relevant information).

The only duplicated CAS left will be those with different referrals on different dates.

In [None]:
from utils.data_cleaning_helpers import clean_referrals_dataset

# Load the referrals data
referrals_df = datasets['referrals'].copy()

# Clean the dataset
referrals_cleaned = clean_referrals_dataset(referrals_df)

# Save the cleaned dataset
referrals_cleaned.to_csv("../data/processed/referrals_cleaned.csv", index=False)
print(f"\nCleaned dataset saved to: ../data/processed/referrals_cleaned.csv")

=== REFERRALS DATASET PRE-CLEANING ===
Initial dataset: 181,085 rows, 126,717 unique cases

Step 0: Cleaning NA variations in text columns...

Step 1: Removing identical rows...
  Using columns: ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']
  Removed 52,146 identical rows
  Remaining: 128,939 rows, 126,717 unique cases

Step 2: Processing cases with multiple observations...
  Step 2a: Adding observation count and date range columns...
  Step 2b: Removing rows without referral notes when others exist...
    Removed 0 rows without referral notes
  Step 2c: Consolidating cases with identical or missing referral notes...
    Consolidated 0 rows with identical/missing referral notes

Final dataset: 128,939 rows, 126,717 unique cases

Cases with multiple rows remaining (different referrals): 1,434
Top 5 cases by number of remaining rows:
  CAS-537284: 15 rows
  CAS-530729: 9 rows
  CAS-483850: 8 rows
  CAS-503700: 8 rows
  CAS-560840: 8 rows

Example case CAS-537284 w

### 1.2 Pre-cleaning SNAP data

In [16]:
from utils.data_cleaning_helpers import clean_snap_dataset

snap_df = datasets['snap'].copy()

# Clean the SNAP dataset
snap_cleaned = clean_snap_dataset(snap_df)

# Save cleaned dataset
snap_cleaned.to_csv("../data/processed/snap_cleaned.csv", index=False)
print(f"\nCleaned dataset saved to: ../data/processed/snap_cleaned.csv")


=== SNAP DATASET PRE-CLEANING ===
Initial dataset: 2,012 rows, 1,305 unique cases

Step 0: Cleaning NA variations in text columns...

Step 1: Removing perfect duplicates...
  Removed 54 perfect duplicates

Step 2: Identifying valid baseline and valid post-support assessments...
  Cases with valid baseline: 983
  Cases with valid post-support: 540
  Cases with both (complete valid pairs): 475
  Cases with valid baseline only: 508

Step 3: Adding metadata columns...

Final dataset: 1,958 rows, 1,305 unique cases

Cleaned dataset saved to: ../data/processed/snap_cleaned.csv


In [12]:
print(cols_snap)

['Area:', 'Service:', 'BRM case number:', 'Survey completed:', 'Date of assessment', 'Possible to record outcomes:', 'Why was it not possible:', 'Other reason', 'Daily life:', 'Clean & comfortable:', 'Food & drink:', 'Clean home', 'Social life:', 'Safe in home:', 'Safe when out:', 'Spend your time:', 'Paid support', 'Medication:', 'Finances:', 'submitter', 'Timepoint', 'Control (PAC)', 'Personal Cleanliness (PAC)', 'Food and drink (PAC)', 'Accommodation (PAC)', 'Social Participation (PAC)', 'Safe in home (PAC)', 'Safe when out (PAC)', 'Occupation (PAC)', 'Dignity (PAC)', 'Medication (PAC)', 'Finances (PAC)', 'Control (QLC)', 'Personal cleanliness (QLC)', 'Food and drink (QLC)', 'Personal safety (QLC)', 'Social Participation (QLC)', 'Occupation (QLC)', 'Accommodation (QLC)', 'Dignity 1 (QLC)', 'Dignity 2 (QLC)', 'Medication (QLC)', 'Finances (QLC)', 'Needs', 'Length of support', 'All Qs complete', 'Q1\r\nControl\r\n(B)', 'Q2\r\nPersonal Cleanliness\r\n(B)', 'Q3\r\nFood & Drink\r\n(B)', 

### 1.3 Pre-cleaning HIU data

## 2. Merging data

**Merging strategy**

In [None]:
# Print dataset overlap summary
case_sets = {name: set(df['case_ref'].dropna()) for name, df in datasets.items()}
print(f"Dataset sizes: Referrals: {len(case_sets['referrals']):,}, HIU: {len(case_sets['hiu']):,}, SNAP: {len(case_sets['snap']):,}")
print(f"Overlaps: Ref ∩ HIU: {len(case_sets['referrals'] & case_sets['hiu']):,}, Ref ∩ SNAP: {len(case_sets['referrals'] & case_sets['snap']):,}, HIU ∩ SNAP: {len(case_sets['hiu'] & case_sets['snap']):,}, All: {len(case_sets['referrals'] & case_sets['hiu'] & case_sets['snap']):,}")

# Add prefixes to HIU and SNAP columns (except case_ref)
hiu_renamed = datasets['hiu'].rename(columns={col: f'hiu_{col}' for col in datasets['hiu'].columns if col != 'case_ref'})
snap_renamed = datasets['snap'].rename(columns={col: f'snap_{col}' for col in datasets['snap'].columns if col != 'case_ref'})

### 2.1. Merging SNAP & Referrals

### 2.2. Merging HIU & Referrals

In [None]:
# Merge datasets on case_ref
merged = (datasets['referrals'].set_index('case_ref')
          .join(hiu_renamed.set_index('case_ref'))
          .join(snap_renamed.set_index('case_ref'))
          .reset_index())

# Add source indicators
merged['has_hiu'] = merged['hiu_Q2.1. CAS-'].notna()
merged['has_snap'] = merged['snap_BRM case number:'].notna()
merged['dataset_sources'] = ('Referral' + 
                            merged['has_hiu'].map({True: '+HIU', False: ''}) + 
                            merged['has_snap'].map({True: '+SNAP', False: ''}))

In [12]:
# Replace various missing data indicators with NaN
missing_indicators = ['-', '--', 'N/A', 'n/a', '', ' ', 'NULL', 'null']
for col in merged.columns:
    if merged[col].dtype == 'object':
        merged[col] = merged[col].replace(missing_indicators, pd.NA)

In [13]:
# Save merged dataset
output_path = Path("../data/processed/BRC_referrals_merged.csv")
merged.to_csv(output_path, index=False)
print(f"Merged dataset saved: {output_path}")
print(f"Final dataset shape: {merged.shape}")

Merged dataset saved: ../data/processed/BRC_referrals_merged.csv
Final dataset shape: (184050, 181)


In [15]:
cols = list(merged.columns)

## 3. Exploratory Data Analysis

In [14]:
from utils.data_cleaning_helpers import get_note_length_category

# Import merged dataset
merged = pd.read_csv("../data/processed/BRC_referrals_merged.csv")

# Add note categories
merged['note_length_category'] = merged['Referral Notes (depersonalised)'].apply(get_note_length_category)

  merged = pd.read_csv("../data/processed/BRC_referrals_merged.csv")


In [15]:
note_counts = merged['note_length_category'].value_counts()

print(f"\n=== REFERRAL NOTES QUALITY ===")
for category, count in note_counts.items():
    print(f"{category}: {count:,} ({count/len(merged):.1%})")


=== REFERRAL NOTES QUALITY ===
Short note (<5 words): 63,549 (34.5%)
Medium note (5-19 words): 46,007 (25.0%)
Long note (20+ words): 43,475 (23.6%)
No note: 31,019 (16.9%)


Let's first look at the characteristics of referrals for cases which have health outcomes data (in SNAP and HIE).

### 3.1. SNAP (Support at Home, Care at Home, Hospital at Home, and Social Prescribing services)

These services have started using an outcomes framework called SNAP (Social Needs and
Preferences), which was only introduced in late 2024.

I consider 'valid' SNAP data when the service user has responded to the survey (i.e. 'Possible to record outcomes:' == Yes). This does not necessarily mean all questions were answered.

In [23]:
# Filter dataset to only include referrals with SNAP data
merged_snap = merged[merged['has_snap']].copy()
merged_snap = merged_snap.loc[:, ~merged_snap.columns.str.startswith('hiu_')]

cols_snap = list(merged_snap.columns)

print("=== SNAP DATA OVERVIEW ===")
print(f"Total SNAP observations: {len(merged_snap):,}")
print(f"Unique cases: {merged_snap['case_ref'].nunique():,}")
print(f"Date range: {merged_snap['snap_Date of assessment'].min()} to {merged_snap['snap_Date of assessment'].max()}")

# Check for valid SNAP data
valid_snap = merged_snap['snap_Possible to record outcomes:'] == 'Yes'
print(f"\nValid SNAP data: {valid_snap.sum():,} observations ({valid_snap.mean():.1%})")
print(f"Valid unique cases: {merged_snap[valid_snap]['case_ref'].nunique():,}")

# Assessment timepoints
timepoint_counts = merged_snap['snap_Survey completed:'].value_counts()
print(f"\n=== ASSESSMENT TIMEPOINTS ===")
for timepoint, count in timepoint_counts.items():
    print(f"{timepoint}: {count:,} ({count/len(merged_snap):.1%})")

# Cases with both baseline and outcomes (with valid data)
baseline_cases = set(merged_snap[(merged_snap['snap_Survey completed:'] == 'at the start of support') & valid_snap]['case_ref'])
outcome_cases = set(merged_snap[(merged_snap['snap_Survey completed:'] == 'at the end of support') & valid_snap]['case_ref'])
both_timepoints = baseline_cases & outcome_cases

# Also show the impact of invalid data
all_baseline_cases = set(merged_snap[merged_snap['snap_Survey completed:'] == 'at the start of support']['case_ref'])
all_outcome_cases = set(merged_snap[merged_snap['snap_Survey completed:'] == 'at the end of support']['case_ref'])
all_both_timepoints = all_baseline_cases & all_outcome_cases

print(f"\n=== BASELINE + OUTCOMES AVAILABILITY ===")
print(f"Total unique cases with valid data: {len(baseline_cases | outcome_cases):,}")

print(f"\nCases with valid baseline only: {len(baseline_cases - outcome_cases):,}")
print(f"Cases with valid outcomes only: {len(outcome_cases - baseline_cases):,}")

print(f"\nCases with both valid baseline + outcomes: {len(both_timepoints):,}")
print(f"Cases with both timepoints (any validity): {len(all_both_timepoints):,}")

=== SNAP DATA OVERVIEW ===
Total SNAP observations: 3,468
Unique cases: 1,222
Date range: 01/05/2024 to 31/10/2024

Valid SNAP data: 2,839 observations (81.9%)
Valid unique cases: 1,006

=== ASSESSMENT TIMEPOINTS ===
at the start of support: 2,050 (59.1%)
at the end of support: 1,418 (40.9%)

=== BASELINE + OUTCOMES AVAILABILITY ===
Total unique cases with valid data: 1,006

Cases with valid baseline only: 471
Cases with valid outcomes only: 63

Cases with both valid baseline + outcomes: 472
Cases with both timepoints (any validity): 621


Many cases also have more than two observations. We can investigate these patterns. Also, from here onwards, I filter the SNAP dataset to valid SNAP records.

In [24]:
merged_snap_valid = merged_snap[merged_snap['snap_Possible to record outcomes:'] == 'Yes'].copy()

# Focus on the 472 cases with both baseline + outcomes
both_timepoints_snap = merged_snap_valid[merged_snap_valid['case_ref'].isin(both_timepoints)].drop_duplicates('case_ref')

print(f"=== BASELINE + OUTCOMES CASES ANALYSIS (n={len(both_timepoints):,}) ===")

length_distribution = both_timepoints_snap['note_length_category'].value_counts()

print(f"\nReferral notes length distribution:")
for category, count in length_distribution.items():
    print(f"  {category}: {count:,} ({count/len(both_timepoints_snap):.1%})")


=== BASELINE + OUTCOMES CASES ANALYSIS (n=472) ===

Referral notes length distribution:
  Long note (20+ words): 167 (35.4%)
  Short note (<5 words): 120 (25.4%)
  Medium note (5-19 words): 119 (25.2%)
  No note: 66 (14.0%)


Let's now check the completeness of the data for cases which have both baseline and post-support observations.

In [25]:
# Data completeness for key domains
key_domains = ['snap_Control (QLC)', 'snap_Personal cleanliness (QLC)', 'snap_Food and drink (QLC)', 
               'snap_Personal safety (QLC)', 'snap_Social Participation (QLC)', 'snap_Occupation (QLC)', 
               'snap_Accommodation (QLC)', 'snap_Dignity 1 (QLC)', 'snap_Medication (QLC)', 'snap_Finances (QLC)']

print(f"\n=== DATA COMPLETENESS ===")
for domain in key_domains:
    if domain in merged_snap_valid.columns:
        completeness = merged_snap_valid[domain].notna().mean()
        print(f"{domain.replace('snap_', '').replace(' (QLC)', '')}: {completeness:.1%} complete")



=== DATA COMPLETENESS ===
Control: 90.1% complete
Personal cleanliness: 91.3% complete
Food and drink: 91.5% complete
Personal safety: 92.3% complete
Social Participation: 90.2% complete
Occupation: 89.7% complete
Accommodation: 91.0% complete
Dignity 1: 89.2% complete
Medication: 89.6% complete
Finances: 86.8% complete


Some cases appear more than twice (baseline + post-support). They also seem to display different change values across their associated observations, i.e., these are not simply duplicated rows. 

However, there is only one referral note associated with each CAS. So, this discrepancy will have to be dealt with. Probably by using the values appear first in chronological order. This is because the BRC will have more information on the service user than contained in the referral note if they use their services for the second or third time for example.

### 3.2. HIU (High Intensity Use service for people frequently attending A&E)

In [28]:
# Filter dataset to only include referrals with HIU data
merged_hiu = merged[merged['has_hiu']].copy()
merged_hiu = merged_hiu.loc[:, ~merged_hiu.columns.str.startswith('snap_')]

cols_hiu = list(merged_hiu.columns)

In [33]:
print("=== HIU DATA OVERVIEW ===")
print(f"Total HIU observations: {len(merged_hiu):,}")
print(f"Unique cases: {merged_hiu['case_ref'].nunique():,}")

# Check for exact duplicates
print(f"\n=== DUPLICATE ANALYSIS ===")
print(f"Total rows: {len(merged_hiu):,}")

# Check duplicates based on case reference and all HIU variables
hiu_cols = [col for col in merged_hiu.columns if col.startswith('hiu_')]
duplicate_subset = ['case_ref'] + hiu_cols

duplicates = merged_hiu.duplicated(subset=duplicate_subset, keep=False)
print(f"Exact duplicate rows (case + HIU variables): {duplicates.sum():,}")

if duplicates.sum() > 0:
    print(f"\nExample of duplicate cases:")
    duplicate_cases = merged_hiu[duplicates]['case_ref'].value_counts().head(3)
    for case_ref, count in duplicate_cases.items():
        print(f"  {case_ref}: {count} identical rows")

# Remove exact duplicates
print(f"\n=== REMOVING DUPLICATES ===")
merged_hiu_clean = merged_hiu.drop_duplicates(subset=duplicate_subset, keep='first')
print(f"After removing duplicates: {len(merged_hiu_clean):,} rows")
print(f"Duplicates removed: {len(merged_hiu) - len(merged_hiu_clean):,}")
print(f"Unique cases after cleaning: {merged_hiu_clean['case_ref'].nunique():,}")

=== HIU DATA OVERVIEW ===
Total HIU observations: 4,269
Unique cases: 1,400

=== DUPLICATE ANALYSIS ===
Total rows: 4,269
Exact duplicate rows (case + HIU variables): 2,932

Example of duplicate cases:
  CAS-525447: 27 identical rows
  CAS-527026: 24 identical rows
  CAS-482848: 20 identical rows

=== REMOVING DUPLICATES ===
After removing duplicates: 2,370 rows
Duplicates removed: 1,899
Unique cases after cleaning: 1,400


In [36]:
# Check for valid HIU data
# Looking at the columns, it seems like "hiu_Q6. Why wasn't it possible to record outcomes for this client?" indicates invalid data
merged_hiu_valid = merged_hiu_clean["hiu_Q6. Why wasn't it possible to record outcomes for this client?"].isna()
print(f"\nValid HIU data: {merged_hiu_valid.sum():,} observations ({merged_hiu_valid.mean():.1%})")
print(f"Valid unique cases: {merged_hiu_valid['case_ref'].nunique():,}")


Valid HIU data: 2,231 observations (94.1%)


KeyError: 'case_ref'

## 3. Cleaning referrals

In [None]:
from utils.data_cleaning_helpers import clean_na_variations, remove_duplicate_sentences_per_case

brc_referrals_raw = pd.read_csv("../data/raw/BRC-Data/Cases_depersonalised.csv")

In [None]:
print("Original data shape:", brc_referrals_raw.shape)
print("Columns:", brc_referrals_raw.columns.tolist())

# 1. Remove missing values for referral notes
print(f"\nOriginal referral notes missing values: {brc_referrals_raw['Referral Notes (depersonalised)'].isnull().sum()}")

brc_cleaned = brc_referrals_raw.dropna(subset=['Referral Notes (depersonalised)']).copy()
print(f"After removing missing referral notes: {brc_cleaned.shape}")

# 2. Apply NA cleaning to referral notes
brc_cleaned['Referral Notes (depersonalised)'] = brc_cleaned['Referral Notes (depersonalised)'].apply(clean_na_variations)
    
# Remove rows where referral notes became NaN after cleaning
before_na_clean = brc_cleaned.shape[0]
brc_cleaned = brc_cleaned.dropna(subset=['Referral Notes (depersonalised)'])
print(f"After removing NA variations: {brc_cleaned.shape} (removed {before_na_clean - brc_cleaned.shape[0]} rows) \n")

# 3. Remove duplicate sentences within multiple notes corresponding to a single Case Reference
# As done in Keloth et al. (2025)
brc_final = remove_duplicate_sentences_per_case(brc_cleaned)
print(f"\nFinal cleaned data shape: {brc_final.shape}")
print(f"Number of unique referral notes: {brc_final['Referral Notes (depersonalised)'].nunique()}")

# Save the cleaned data
output_path = Path("../data/processed/BRC_referrals_cleaned.csv")
brc_final.to_csv(output_path, index=False)
print(f"\nCleaned data saved to: {output_path}")

Original data shape: (181085, 19)
Columns: ['Area', 'Scheme', 'Case Reference', 'Assessment Result', 'Case Status', 'Referral Date/Time', 'End Date Case', 'Has Disability', 'Has Risk', 'Risk Type', 'Unique Case', 'IMD_decile', 'Country', 'Age', 'Gender', 'Ethnicity', 'Disability', 'Living Arrangements', 'Referral Notes (depersonalised)']

Original referral notes missing values: 30621
After removing missing referral notes: (150464, 19)
After removing NA variations: (150464, 19) (removed 0 rows) 

Removing duplicate sentences within cases...
Original sentences: 340393
Unique sentences: 182168
Sentences removed: 158225

Final cleaned data shape: (99560, 19)
Number of unique referral notes: 51658

Cleaned data saved to: ../data/processed/BRC_referrals_cleaned.csv
