# Data cleaning & EDA

## 0. Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from pathlib import Path
import sys
import json

# Add the project root to the Python path to import the modules.
# The root is taken to be the parent of the current working directory, so this
# relies on the notebook being run from a direct subfolder of the project
# (presumably `notebooks/` — confirm).
project_root = Path().absolute().parent
sys.path.append(str(project_root))

## 1. Duplicate analysis

In [None]:
# Load the three raw, depersonalised extracts, keyed by a short dataset name
datasets = {
    'referrals': pd.read_csv("../data/raw/BRC-Data/Cases_depersonalised.csv"),
    'hiu': pd.read_csv("../data/raw/BRC-Data/HIU_depersonalised.csv"),
    'snap': pd.read_csv("../data/raw/BRC-Data/SNAP_depersonalised.csv")
}

# Standardize case reference columns: every dataset gets a common 'case_ref'
datasets['referrals']['case_ref'] = datasets['referrals']['Case Reference']

# HIU stores only the numeric part of the reference (presumably parsed as float,
# hence the ".0" suffix — confirm). Strip the suffix only at the END of the
# string: a plain replace of '.0' would also delete a ".0" found mid-value.
# NOTE(review): missing values become the literal string 'CAS-nan' here —
# confirm the downstream cleaning handles that as intended.
hiu_case_numbers = datasets['hiu']['Q2.1. CAS-'].astype(str).str.replace(r'\.0$', '', regex=True)
datasets['hiu']['case_ref'] = 'CAS-' + hiu_case_numbers

datasets['snap']['case_ref'] = datasets['snap']['BRM case number:']

In [None]:
print("=== DUPLICATE ANALYSIS BEFORE CLEANING ===")

# Referrals: only these columns define a duplicate (the others may differ
# because of the depersonalisation).
REFERRAL_KEY_COLS = ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']

# Other datasets: depersonalised/randomised columns to ignore when comparing rows.
DEPERSONALISED_COLS = {
    'snap': ['Has Disability', 'IMD Decile', 'Country', 'Age', 'Gender', 'Ethnicity', 'Living Arrangements'],
    'hiu': ['Age', 'Gender', 'Ethnicity', 'Living Arrangements'],
}

for name, df in datasets.items():
    print(f"\n{name.upper()} Dataset:")
    print(f"  Total rows: {len(df):,}")
    print(f"  Unique case_ref: {df['case_ref'].nunique():,}")

    # Pick the columns used for duplicate detection. Always (re)assigned here so
    # a dataset name without a configuration never reuses the previous
    # iteration's column list.
    if name == 'referrals':
        available_cols = [col for col in REFERRAL_KEY_COLS if col in df.columns]
        print(f"  Analysing duplicates based on: {available_cols}")
    else:
        exclude_cols = DEPERSONALISED_COLS.get(name, [])
        available_cols = [col for col in df.columns if col not in exclude_cols]
        print(f"  Analysing duplicates excluding {len(exclude_cols)} depersonalised columns")

    # Check for perfect duplicates based on relevant columns only
    perfect_duplicates = df.duplicated(subset=available_cols).sum()
    print(f"  Perfect duplicates (relevant columns): {perfect_duplicates:,}")

    # Check for duplicates by case_ref only
    case_ref_duplicates = df['case_ref'].duplicated().sum()
    print(f"  Duplicate case_ref: {case_ref_duplicates:,}")

    if case_ref_duplicates == 0:
        continue

    # Top 5 duplicated case_ref by number of rows
    is_repeated = df['case_ref'].duplicated(keep=False)
    duplicate_cases = df.loc[is_repeated, 'case_ref'].value_counts().head(5)
    if duplicate_cases.empty:
        # duplicated() counts repeated missing values but value_counts() drops
        # them, so there may be no named case to inspect.
        continue

    print("  Top 5 duplicate case_ref by count:")
    for case_ref, count in duplicate_cases.items():
        print(f"    {case_ref}: {count} rows")

    # Inspect the case with the most rows (value_counts sorts descending)
    highest_count_case = duplicate_cases.index[0]
    duplicate_rows = df[df['case_ref'] == highest_count_case]

    # Count rows of that case that repeat an earlier row on the relevant columns
    identical_duplicates = duplicate_rows[available_cols].duplicated().sum()
    extra_rows = len(duplicate_rows) - 1

    print(f"    For case {highest_count_case}: {identical_duplicates}/{extra_rows} duplicates are identical (relevant data)")

    # Show what's different in duplicate rows (if any) - only relevant columns
    if identical_duplicates < extra_rows:
        print(f"    Non-identical relevant columns for {highest_count_case}:")
        for col in available_cols:
            if duplicate_rows[col].nunique() <= 1:
                continue
            values = duplicate_rows[col].tolist()
            # For referral notes, show a 50-character preview
            if col == 'Referral Notes (depersonalised)':
                values = [str(v)[:50] + "..." if len(str(v)) > 50 else str(v) for v in values]
            print(f"      {col}: {values}")

## 2. Pre-cleaning referrals

I start with the pre-cleaning of referrals.

1. I remove identical rows based on: ['case_ref', 'Referral Notes (depersonalised)', 'Referral Date/Time']. Those are pure duplicates (other columns might be different due to the depersonalisation).

Then, for each CAS which has multiple rows (most likely on different dates):

2. I create two columns: one with the number of observations, and one with the date range of those observations;
3. Then, if one or some of the rows have a non-NA referral, I remove the rows which don't. 
4. Finally, if all observations have the same referrals, or if none of the observations have a referral, I only keep the most recent row (the columns created earlier will keep the other relevant information).

The only duplicated CAS left will be those with different referrals on different dates.

In [None]:
# NOTE(review): this cell imports from `src.data_cleaning.data_cleaning_helpers`,
# while the other cleaning cells in this notebook import from
# `utils.data_cleaning_helpers` — confirm which module path is current.
from src.data_cleaning.data_cleaning_helpers import clean_referrals_dataset

# Load the referrals data (copy, so the raw frame in `datasets` is left untouched)
referrals_df = datasets['referrals'].copy()

# Clean the dataset with the project helper (pre-cleaning steps described above)
referrals_cleaned = clean_referrals_dataset(referrals_df)

In [None]:
print("=== ANALYSING REFERRALS BY DATE PATTERNS ===")

# Convert date column to datetime if not already (unparseable values become NaT)
referrals_cleaned['Referral Date/Time'] = pd.to_datetime(referrals_cleaned['Referral Date/Time'], errors='coerce')
referrals_cleaned['referral_date'] = referrals_cleaned['Referral Date/Time'].dt.date

# Cases with multiple referrals: case_ref -> number of rows, most frequent first
multi_referral_cases = referrals_cleaned['case_ref'].value_counts()
multi_referral_cases = multi_referral_cases[multi_referral_cases > 1]
n_multi = len(multi_referral_cases)

print(f"Cases with multiple referrals: {n_multi:,}")
print(f"Cases with single referral: {referrals_cleaned['case_ref'].nunique() - n_multi:,}")

# Analyze date patterns for multi-referral cases
print(f"\n=== DATE PATTERNS FOR {n_multi:,} MULTI-REFERRAL CASES ===")

# One groupby over the multi-referral rows instead of re-filtering the whole
# frame once per case.
multi_rows = referrals_cleaned[referrals_cleaned['case_ref'].isin(multi_referral_cases.index)]
dates_per_case = multi_rows.groupby('case_ref')['referral_date'].nunique()

# Built column by column so the frame keeps its columns even when there are no
# multi-referral cases; rows follow the order of `multi_referral_cases`.
same_date_df = pd.DataFrame({
    'case_ref': multi_referral_cases.index,
    'total_referrals': multi_referral_cases.to_numpy(),
    'unique_dates': dates_per_case.reindex(multi_referral_cases.index).to_numpy(),
})
same_date_df['all_same_date'] = same_date_df['unique_dates'] == 1
same_date_df['multiple_dates'] = same_date_df['unique_dates'] > 1

# Summary statistics
all_same_date = same_date_df['all_same_date'].sum()
multiple_dates = same_date_df['multiple_dates'].sum()


def _share_of_multi(count):
    """Return `count` as a percentage of the multi-referral cases (0 if none)."""
    return count / n_multi * 100 if n_multi else 0.0


print(f"Cases where ALL referrals are on the SAME date: {all_same_date:,} ({_share_of_multi(all_same_date):.1f}%)")
print(f"Cases with referrals on MULTIPLE dates: {multiple_dates:,} ({_share_of_multi(multiple_dates):.1f}%)")

Cases that have multiple observations with DIFFERENT referrals always have them on the same date. I therefore keep the longest referral for each such case, to retain the maximum information.

In [None]:
from utils.data_cleaning_helpers import consolidate_referrals_longest

# Where the consolidated referrals are written
referrals_out_path = "../data/processed/referrals_cleaned.csv"

# Run the project's consolidation helper, then persist its output without the index
referrals_consolidated = consolidate_referrals_longest(referrals_cleaned)
referrals_consolidated.to_csv(referrals_out_path, index=False)
print(f"\nConsolidated referrals saved to: {referrals_out_path}")

## 3. Pre-cleaning SNAP data & merging with referrals

I start with **SNAP** (Support at Home, Care at Home, Hospital at Home, and Social Prescribing services).

These services have started using an outcomes framework called SNAP (Social Needs and Preferences), which was only introduced in late 2024.

The pre-cleaning of SNAP is done as follows:

1. I removed perfect duplicates based on columns that were not randomised in the depersonalisation process.

Then, I noticed that CAS have either one or two observations (not more). These correspond to observations at the start and / or at the end of support from the BRC. I get a brief overview.

2. For each case, I determine what type of valid assessments are available, only counting valid where 'Possible to record outcomes:' == 'Yes'. 
3. I then create a summary for each case showing:
    - Total number of assessments
    - Whether valid baseline assessment exists (timepoint 1.0 + recordable outcomes)
    - Whether valid post-support assessment exists (timepoint 2.0 + recordable outcomes)
    - Whether case has both valid assessments (complete usable pair)

In [None]:
from utils.data_cleaning_helpers import clean_snap_dataset

# Where the cleaned SNAP data is written
snap_out_path = "../data/processed/snap_cleaned.csv"

# Work on a copy so the raw frame in `datasets` stays untouched
snap_df = datasets['snap'].copy()

# Run the project's SNAP cleaning helper, then persist its output without the index
snap_cleaned = clean_snap_dataset(snap_df)
snap_cleaned.to_csv(snap_out_path, index=False)
print(f"\nCleaned dataset saved to: {snap_out_path}")


### Merging SNAP & referrals

I now proceed to creating a merged dataset with referrals.

In [None]:
def merge_referrals_with_snap(
    referrals_path="../data/processed/referrals_cleaned.csv",
    snap_path="../data/processed/snap_cleaned.csv",
):
    """
    Merge the cleaned referrals with the cleaned SNAP data on ``case_ref``.

    Only cases that appear in both files are kept (inner join); each SNAP row
    is paired with the referral row(s) sharing its ``case_ref``. Row and case
    counts are printed at every step.

    Args:
        referrals_path: CSV of cleaned referrals; must contain ``case_ref``.
        snap_path: CSV of cleaned SNAP data; must contain ``case_ref``.

    Returns:
        pandas.DataFrame: the merged rows. Columns present in both inputs
        (other than ``case_ref``) get the ``_snap`` / ``_referral`` suffixes.
    """

    print("=== MERGING REFERRALS WITH SNAP DATA ===")

    # Load datasets
    referrals_df = pd.read_csv(referrals_path)
    snap_df = pd.read_csv(snap_path)

    print(f"Referrals dataset: {len(referrals_df):,} rows, {referrals_df['case_ref'].nunique():,} unique cases")
    print(f"SNAP dataset: {len(snap_df):,} rows, {snap_df['case_ref'].nunique():,} unique cases")

    # Find cases that appear in both datasets. Missing references are left out:
    # pandas matches null merge keys to each other, which would pair unrelated
    # rows that merely both lack a case reference.
    referrals_cases = set(referrals_df['case_ref'].dropna())
    snap_cases = set(snap_df['case_ref'].dropna())
    common_cases = referrals_cases & snap_cases

    print(f"\nCases in both datasets: {len(common_cases):,}")
    print(f"Cases only in referrals: {len(referrals_cases - snap_cases):,}")
    print(f"Cases only in SNAP: {len(snap_cases - referrals_cases):,}")

    # Filter both datasets to common cases only
    referrals_common = referrals_df[referrals_df['case_ref'].isin(common_cases)].copy()
    snap_common = snap_df[snap_df['case_ref'].isin(common_cases)].copy()

    print("\nAfter filtering to common cases:")
    print(f"Referrals: {len(referrals_common):,} rows")
    print(f"SNAP: {len(snap_common):,} rows")

    # Merge: each referral row will be duplicated for each SNAP row of the same case
    merged_df = snap_common.merge(referrals_common, on='case_ref', how='inner', suffixes=('_snap', '_referral'))

    print(f"\nMerged dataset: {len(merged_df):,} rows, {merged_df['case_ref'].nunique():,} unique cases")

    # Show structure breakdown
    rows_per_case = merged_df['case_ref'].value_counts()
    print("\nRows per case in merged dataset:")
    print(f"  1 row (baseline only): {(rows_per_case == 1).sum():,} cases")
    print(f"  2 rows (baseline + outcome): {(rows_per_case == 2).sum():,} cases")
    if (rows_per_case > 2).any():
        print(f"  >2 rows (unexpected): {(rows_per_case > 2).sum():,} cases")

    # Show SNAP assessment validity breakdown for merged cases. Both flag
    # columns are needed, so both are checked before either is read.
    if {'has_both', 'has_valid_baseline'}.issubset(merged_df.columns):
        one_row_per_case = merged_df.drop_duplicates('case_ref')
        cases_with_both = one_row_per_case['has_both'].sum()
        cases_baseline_only = one_row_per_case['has_valid_baseline'].sum() - cases_with_both

        print("\nSNAP assessment validity in merged dataset:")
        print(f"  Cases with valid baseline + outcome: {cases_with_both:,}")
        print(f"  Cases with valid baseline only: {cases_baseline_only:,}")

    return merged_df

# Where the merged referrals + SNAP data is written
merged_snap_out_path = "../data/processed/merged_referrals_snap.csv"

# Build the merged frame, then persist it without the index
merged_referrals_snap = merge_referrals_with_snap()
merged_referrals_snap.to_csv(merged_snap_out_path, index=False)
print(f"\nMerged dataset saved to: {merged_snap_out_path}")

Let's now check the completeness of the data for cases which have both baseline and post-support observations.

In [None]:
# Data completeness for key domains
# NOTE(review): 'snap_Medication (QLC)' is the only entry carrying a 'snap_'
# prefix — confirm it matches the actual column name in the merged dataset.
key_domains = ['Control (QLC)', 'Personal cleanliness (QLC)', 'Food and drink (QLC)',
               'Personal safety (QLC)', 'Social Participation (QLC)', 'Occupation (QLC)',
               'Accommodation (QLC)', 'Dignity 1 (QLC)', 'snap_Medication (QLC)', 'Finances (QLC)']

# Filter rows with both baseline and outcome
filtered_snap = merged_referrals_snap[merged_referrals_snap['has_both']]

print("\n=== DATA COMPLETENESS ===")
for domain in key_domains:
    if domain not in filtered_snap.columns:
        # Report instead of skipping silently, so a misspelt or renamed
        # column is visible in the output.
        print(f"{domain}: column not found in dataset")
        continue
    # Share of non-missing values among the retained rows
    completeness = filtered_snap[domain].notna().mean()
    print(f"{domain}: {completeness:.1%} complete")


## 4. Pre-cleaning HIU data & merging with referrals

I move on to **HIU** (High Intensity Use service for people frequently attending A&E).

1. Same as before, I first remove perfect duplicates, excluding the 4 depersonalized columns (Age, Gender, Ethnicity, Living Arrangements) from duplicate detection since these were randomised.
2. I then identify valid assessments by 'Time Points'. This step uses the 'Q6. Why wasn't it possible to record outcomes for this client?' column: if it is filled in, outcomes were NOT recordable, so it must be empty/NaN for an assessment to count as valid.

In [None]:
from utils.data_cleaning_helpers import clean_hiu_dataset

# Where the cleaned HIU data is written
hiu_out_path = "../data/processed/hiu_cleaned.csv"

# Work on a copy so the raw frame in `datasets` stays untouched
hiu_df = datasets['hiu'].copy()

# Run the project's HIU cleaning helper, then persist its output without the index
hiu_cleaned = clean_hiu_dataset(hiu_df)
hiu_cleaned.to_csv(hiu_out_path, index=False)
print(f"\nCleaned dataset saved to: {hiu_out_path}")


In [None]:
# Data completeness for key domains
key_domains2 = ['Change in Activation', 'Change in Wellbeing', 'Change in Housing',
               'Change in Finance', 'Change in Loneliness', 'Change in Social Value']

# Filter rows with both baseline and outcome
filtered_hiu = hiu_cleaned[hiu_cleaned['has_both']]

print("\n=== DATA COMPLETENESS ===")
for domain in key_domains2:
    if domain not in filtered_hiu.columns:
        # Report instead of skipping silently, so a misspelt or renamed
        # column is visible in the output.
        print(f"{domain}: column not found in dataset")
        continue
    # Share of non-missing values among the retained rows
    completeness = filtered_hiu[domain].notna().mean()
    print(f"{domain}: {completeness:.1%} complete")


### Merging HIU & referrals

In [None]:
def merge_referrals_with_hiu(
    referrals_path="../data/processed/referrals_cleaned.csv",
    hiu_path="../data/processed/hiu_cleaned.csv",
):
    """
    Merge the cleaned referrals with the cleaned HIU data on ``case_ref``.

    Only cases that appear in both files are kept (inner join); each HIU row
    is paired with the referral row(s) sharing its ``case_ref``. Row and case
    counts are printed at every step.

    Args:
        referrals_path: CSV of cleaned referrals; must contain ``case_ref``.
        hiu_path: CSV of cleaned HIU data; must contain ``case_ref``.

    Returns:
        pandas.DataFrame: the merged rows. Columns present in both inputs
        (other than ``case_ref``) get the ``_hiu`` / ``_referral`` suffixes.
    """

    print("=== MERGING REFERRALS WITH HIU DATA ===")

    # Load datasets
    referrals_df = pd.read_csv(referrals_path)
    hiu_df = pd.read_csv(hiu_path)

    print(f"Referrals dataset: {len(referrals_df):,} rows, {referrals_df['case_ref'].nunique():,} unique cases")
    print(f"HIU dataset: {len(hiu_df):,} rows, {hiu_df['case_ref'].nunique():,} unique cases")

    # Find cases that appear in both datasets. Missing references are left out:
    # pandas matches null merge keys to each other, which would pair unrelated
    # rows that merely both lack a case reference.
    referrals_cases = set(referrals_df['case_ref'].dropna())
    hiu_cases = set(hiu_df['case_ref'].dropna())
    common_cases = referrals_cases & hiu_cases

    print(f"\nCases in both datasets: {len(common_cases):,}")
    print(f"Cases only in referrals: {len(referrals_cases - hiu_cases):,}")
    print(f"Cases only in HIU: {len(hiu_cases - referrals_cases):,}")

    # Filter both datasets to common cases only
    referrals_common = referrals_df[referrals_df['case_ref'].isin(common_cases)].copy()
    hiu_common = hiu_df[hiu_df['case_ref'].isin(common_cases)].copy()

    print("\nAfter filtering to common cases:")
    print(f"Referrals: {len(referrals_common):,} rows")
    print(f"HIU: {len(hiu_common):,} rows")

    # Merge: each referral row will be duplicated for each HIU row of the same case
    merged_df = hiu_common.merge(referrals_common, on='case_ref', how='inner', suffixes=('_hiu', '_referral'))

    print(f"\nMerged dataset: {len(merged_df):,} rows, {merged_df['case_ref'].nunique():,} unique cases")

    # Show structure breakdown
    rows_per_case = merged_df['case_ref'].value_counts()
    print("\nRows per case in merged dataset:")
    print(f"  1 row (baseline only): {(rows_per_case == 1).sum():,} cases")
    print(f"  2 rows (baseline + outcome): {(rows_per_case == 2).sum():,} cases")
    if (rows_per_case > 2).any():
        print(f"  >2 rows (unexpected): {(rows_per_case > 2).sum():,} cases")

    # Show HIU assessment validity breakdown for merged cases. Both flag
    # columns are needed, so both are checked before either is read.
    if {'has_both', 'has_valid_baseline'}.issubset(merged_df.columns):
        one_row_per_case = merged_df.drop_duplicates('case_ref')
        cases_with_both = one_row_per_case['has_both'].sum()
        cases_baseline_only = one_row_per_case['has_valid_baseline'].sum() - cases_with_both

        # Heading starts with a newline (the original "\H" was an invalid escape
        # that printed a literal backslash).
        print("\nHIU assessment validity in merged dataset:")
        print(f"  Cases with valid baseline + outcome: {cases_with_both:,}")
        print(f"  Cases with valid baseline only: {cases_baseline_only:,}")

    return merged_df

# Where the merged referrals + HIU data is written
merged_hiu_out_path = "../data/processed/merged_referrals_hiu.csv"

# Build the merged frame, then persist it without the index
merged_referrals_hiu = merge_referrals_with_hiu()
merged_referrals_hiu.to_csv(merged_hiu_out_path, index=False)
print(f"\nMerged dataset saved to: {merged_hiu_out_path}")