# Care Homes

In [1]:
import polars as pl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pickle
import io
import boto3
import re
import os
from enum import Enum
from sklearn.base import BaseEstimator

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import sys
# Working directory of the running notebook kernel.
notebook_dir = os.getcwd()

# Parent of the working directory; assumed to be the project root that holds
# the `utilities` package — TODO confirm the notebook always runs one level
# below the root.
project_root = os.path.dirname(notebook_dir)

# Put the project root first on the import path so the import below resolves.
sys.path.insert(0, project_root)

from utilities.version import ModelVersionManager

## Key features 

In [3]:
# Service-type features: each `service_*` key is paired with the service
# description it represents (presumably the label as it appears in the source
# data — TODO confirm). Commented-out entries are service types that are
# currently disabled for this notebook.
SERVICES_LOOKUP = {
#     "service_acute_with_beds": "Acute services with overnight beds",
#     "service_acute_without_beds": "Acute services without overnight beds / listed acute services with or without overnight beds",
#     "service_ambulance": "Ambulance service",
    "service_care_home_with_nursing": "Care home service with nursing",
    "service_care_home_without_nursing": "Care home service without nursing",
#     "service_community_substance_misuse": "Community based services for people who misuse substances",
#     "service_community_learning_disability": "Community based services for people with a learning disability",
#     "service_community_mental_health": "Community based services for people with mental health needs",
#     "service_community_health_nurse_agency": "Community health care services - Nurses Agency only",
#     "service_community_health": "Community healthcare service",
#     "service_dental": "Dental service",
#     "service_diagnostic": "Diagnostic and/or screening service",
#     "service_doctors_consultant": "Doctors consultation service",
#     "service_doctors_treatment": "Doctors treatment service",
    "service_domiciliary": "Domiciliary care service",
    "service_extra_care_housing": "Extra Care housing services",
#     "service_hospice_at_home": "Hospice services at home",
#     "service_hospice": "Hospice services",
#     "service_hospital": "Hospital services for people with mental health needs, learning disabilities and problems with substance misuse",
#     "service_long_term": "Long term conditions services",
#     "service_mobile_doctors": "Mobile doctors service",
#     "service_prison_healthcare": "Prison Healthcare Services",
#     "service_rehab": "Rehabilitation services",
#     "service_remote_advice": "Remote clinical advice service",
#     "service_residential_rehab": "Residential substance misuse treatment and/or rehabilitation service",
    "service_shared_lives": "Shared Lives",
    "service_specialist_college": "Specialist college service",
    "service_supported_living": "Supported living service",
#     "service_urgent_care": "Urgent care services",
}

# Specialism features: each `specialism_*` key is paired with the specialism
# description it represents. Commented-out entries are specialisms that are
# currently disabled for this notebook.
SPECIALISMS_LOOKUP = {
    "specialism_adults_over_65": "Caring for adults over 65 yrs",
    "specialism_adults_under_65": "Caring for adults under 65 yrs",
    "specialism_children": "Caring for children",
    "specialism_dementia": "Dementia",
#     "specialism_detained_under_mental_health_act": "Caring for people whose rights are restricted under the Mental Health Act",
#     "specialism_eating_disorders": "Eating disorders",
    "specialism_learning_disabilities": "Learning disabilities",
    "specialism_mental_health": "Mental health conditions",
    "specialism_physical_disabilities": "Physical disabilities",
#     "specialism_sensory_impairment": "Sensory impairment",
#     "specialism_substance_misuse": "Substance misuse problems",
    "specialism_whole_population": "Services for everyone",
}

# Rural/urban indicator features: each `rui_*` key is paired with the
# rural-urban classification description it represents. The generic
# `rui_sparse` entry is disabled; the sparse settings are covered by the
# individual `*_sparse` keys.
RURAL_URBAN_INDICATOR_LOOKUP = {
    "rui_rural_hamlet": "Rural hamlet and isolated dwellings",
    "rui_rural_hamlet_sparse": "Rural hamlet and isolated dwellings in a sparse setting",
    "rui_rural_town": "Rural town and fringe",
    "rui_rural_town_sparse": "Rural town and fringe in a sparse setting",
    "rui_rural_village": "Rural village",
    "rui_rural_village_sparse": "Rural village in a sparse setting",
    "rui_urban_city": "Urban city and town",
    "rui_urban_city_sparse": "Urban city and town in a sparse setting",
    "rui_urban_major": "Urban major conurbation",
    "rui_urban_minor": "Urban minor conurbation",
    # "rui_sparse": "Sparce setting",
}

# Region features: each `region_*` key is paired with the region name it
# represents.
REGION_LOOKUP = {
    "region_east_midlands": "East Midlands",
    "region_eastern": "Eastern",
    "region_london": "London",
    "region_north_east": "North East",
    "region_north_west": "North West",
    "region_south_east": "South East",
    "region_south_west": "South West",
    "region_west_midlands": "West Midlands",
    "region_yorkshire_and_the_humber": "Yorkshire and the Humber",
}

# Related-location feature: the single `no_related_location` key is paired
# with the value "N" (presumably the flag value meaning "no related
# location" in the source data — TODO confirm).
RELATED_LOCATION_LOOKUP = {
    "no_related_location": "N",
}

## Define S3 bucket and Location of the Data

In [4]:
# Bucket used for demo purposes.
BUCKET = "sfc-mt-sagemaker-demo"

# S3 prefix of the parquet dataset that this notebook reads.
CAREHOMEDATA = (
    "s3://sfc-main-datasets/"
    "domain=ind_cqc_filled_posts/"
    "dataset=ind_cqc_estimated_missing_ascwds_filled_posts/"
)

## Read and Filter Dataset

In [5]:
# Build a lazy scan over the parquet dataset; no data is loaded until the
# frame is collected.
df1 = pl.scan_parquet(source=CAREHOMEDATA)

In [6]:
first_filtered_loc_data = df1.filter(
    (pl.col("carehome") == 'Y') 
)

In [7]:
filtered_loc_data = df1.collect()

In [None]:
#def calculate_care_home_status_count(df: DataFrame) -> DataFrame:
#    w = Window.partitionBy("locationid")

#    df = df.withColumn(
#        "care_home_status_count",
#        F.size((F.collect_set("carehome").over(w))),
#    )
#    return df

In [8]:
def calculate_care_home_status_count(df: pl.DataFrame) -> pl.DataFrame:
    """Add the number of distinct ``careHome`` values seen per ``locationId``.

    Polars port of the PySpark ``Window.partitionBy("locationid")`` logic.
    The count is computed as a window expression rather than a
    ``group_by`` followed by an inner join, so every input row is kept in
    its original order — including rows whose ``locationId`` is null, which
    an inner join would drop.

    Args:
        df: Frame containing ``locationId`` and ``careHome`` columns.

    Returns:
        ``df`` with an extra ``care_home_status_count`` column holding, on
        every row, the number of unique ``careHome`` values of that row's
        ``locationId``.
    """
    return df.with_columns(
        pl.col("careHome")
        .n_unique()
        .over("locationId")
        .alias("care_home_status_count")
    )

# NOTE: unlike PySpark, Polars column names are case-sensitive, so the columns
#       must be referenced with their exact casing: locationId and careHome.

In [9]:
# Attach the per-location care home status count to the collected data.
locations_df = calculate_care_home_status_count(df=filtered_loc_data)

In [10]:
# Keep only rows whose location has exactly one distinct care home status.
fully_filtered_loc_data = locations_df.filter(
    pl.col("care_home_status_count").eq(1)
)

### Create filled posts 

In [13]:
# Multiply the imputed posts-per-bed ratio by the number of beds and store the
# product as `imputed_filled_posts_model`.
# Polars column names are case-sensitive (unlike PySpark), hence the exact
# spelling `numberOfBeds` rather than `numberofbeds`.
fully_filtered_loc_data = fully_filtered_loc_data.with_columns(
    imputed_filled_posts_model=(
        pl.col("imputed_filled_posts_per_bed_ratio_model")
        * pl.col("numberOfBeds")
    )
)

### Service, Activity and  count