# CHF Shelter Data Federated Learning Demo

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit


from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Labelling window (days): filter_group keeps each client's records dated up
# to LDays days after their first record before cohorts are clustered.
LDays = 548
# Feature window (days) used when building the exported per-client table.
Data_Days = 120
# Number of equal sub-windows that Data_Days is divided into.
Data_periods = 5
# NOTE(review): Data_freq is not referenced anywhere in the visible code; the
# window frequency is recomputed from Data_Days / Data_periods — confirm
# whether this constant is still needed.
Data_freq='24D'

In [None]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the merged shelter records. Later cells read the Date, ClientId and
# Agency columns of this table.
tbl = pd.read_parquet('MergedShelterData-Nov17.parquet')

In [None]:
# Display the loaded table as the cell output.
tbl

## EDA
---

In [None]:
# Report the earliest and latest Date present in the data.
print(f'Dates: {tbl.Date.min()} to {tbl.Date.max()}')

In [None]:
# Number of distinct ClientId values; reused below as the denominator for
# percentages and for the heavy-user cut-off.
nPrsn = len(tbl.ClientId.unique())
print(f'{nPrsn} people in the data.')

In [None]:
# Number of distinct Agency values.
print(f'{len(tbl.Agency.unique())} different shelters.')

#### Number of people who use different shelters.

Total Population

In [None]:
# For each client, the number of distinct Agency values among their records
# (progress_apply is the tqdm-instrumented variant of apply).
nShelter = tbl.groupby('ClientId').progress_apply(lambda x: len(x.Agency.unique()))

In [None]:
def number_of_shelter_breakdown(nShelter,nPrsn):
    """Print how many people used each distinct number of shelters.

    nShelter is a Series holding one shelter count per person; nPrsn is the
    denominator used for the percentages. One line is printed per distinct
    count, in ascending order of the count.
    """
    tally = nShelter.value_counts().sort_index()
    for shelters_used in tally.index:
        people = tally.loc[shelters_used]
        print(f'{people}/{nPrsn} ({100*people/nPrsn:.2f}%) people used {shelters_used} shelters.')

In [None]:
# Shelter-count breakdown over the whole population.
number_of_shelter_breakdown(nShelter,nPrsn)

Heavy System Users

In [None]:
# Records per client: count() tallies non-null Date entries, so repeated
# dates are counted separately here.
nStay = tbl.groupby('ClientId').Date.count()

In [None]:
# Heavy users: rank clients by record count (ascending) and keep everyone from
# position int(nPrsn*heavyPctl) onward, i.e. roughly the top 5%.
heavyPctl = 0.95
heavyIds = nStay.sort_values().iloc[int(nPrsn*heavyPctl):].index.to_numpy()

In [None]:
# Same breakdown restricted to the heavy users, with percentages relative to
# the number of heavy users.
number_of_shelter_breakdown(nShelter[heavyIds],len(heavyIds))

## Labelling
---
- We label the data set with a cluster-based analysis of each client's total number of stays and total number of stay episodes.
- More information on this methodology here: https://arxiv.org/abs/2210.13619

In [None]:
import pandas as pd

# Parse Date as datetime so the Timedelta arithmetic in the cells below works.
tbl['Date'] = pd.to_datetime(tbl['Date'])

def filter_group(group):
    """Restrict one client's records to the labelling window, one row per date.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of a single client; must contain a datetime ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The rows dated no later than ``LDays`` days after the client's first
        record (bound inclusive), sorted by ``Date``, with a single row kept
        for each distinct ``Date`` value.
    """
    # Stable sort: among rows sharing a Date, the original row order is
    # preserved, so the row kept by drop_duplicates below is deterministic.
    group = group.sort_values('Date', kind='mergesort')
    window_end = group['Date'].min() + pd.Timedelta(days=LDays)
    group = group[group['Date'] <= window_end]
    # keep='first' collapses repeated dates to one stay. keep=False would
    # discard *every* row of a repeated date and silently delete that stay.
    return group.drop_duplicates(subset='Date', keep='first')

# Apply the window filter per client and flatten back to a plain index.
# This overwrites tbl, so every later cell works on the filtered records only.
tbl = tbl.groupby('ClientId').apply(filter_group).reset_index(drop=True)

In [None]:
episodeGap = 30 # days

def calc_stays_and_gaps(tbl):
    """Summarise one client's records as stay and episode counts.

    NStays is the number of distinct Date values. NEpisodes is one more than
    the number of gaps between consecutive distinct dates that are at least
    episodeGap days long.
    """
    unique_days = tbl.Date.drop_duplicates().sort_values()
    day_gaps = unique_days.diff()
    # The first diff is NaT, which compares False and so never counts as a break.
    is_break = day_gaps >= pd.Timedelta(f'{episodeGap} day')

    return pd.Series({ 'NStays': len(unique_days), 'NEpisodes': int(is_break.sum()) + 1 })

In [None]:
# One row per client with the NStays / NEpisodes summary, computed over all
# shelters together.
tblStayGap = tbl.groupby('ClientId').progress_apply(calc_stays_and_gaps)

In [None]:
#tblStayGap

In [None]:
def gen_cluster_labels(tbl):
    """Split clients into transitional / episodic / chronic cohorts with k-means.

    Parameters
    ----------
    tbl : pandas.DataFrame
        One row per client, indexed by client id. Column 0 is treated as the
        stay count and column 1 as the episode count (the layout produced by
        calc_stays_and_gaps).

    Returns
    -------
    dict
        Maps 'Trn', 'Epi' and 'Chr' to NumPy arrays of the index values
        (client ids) assigned to that cohort.

    Notes
    -----
    Each cohort name is matched to a cluster independently, so two names can
    resolve to the same cluster; in that case one cluster's members appear
    under no name and another's under two.
    """
    dat = tbl.to_numpy()
    # Standardise each feature. A constant column has zero spread; dividing by
    # it would yield NaN, which KMeans rejects, so such a column is only centred.
    spread = np.sqrt(dat.var(axis=0))
    spread[spread == 0] = 1.0
    nrm = (dat - dat.mean(axis=0))/spread

    kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_

    labelVal = {
        'Trn': np.argmin(centers.sum(axis=1)),  # Transitional: fewest stays and episodes.
        'Epi': np.argmax(centers[:,1]),         # Episodic: most episodes (column 1).
        'Chr': np.argmax(centers[:,0]),         # Chronic: most stays (column 0).
    }

    print("Sum of centroids:", kmeans.cluster_centers_.sum(axis=1))
    print("Max in second feature:", kmeans.cluster_centers_[:, 1])
    print("Max in first feature:", kmeans.cluster_centers_[:, 0])

    # Translate cluster numbers back to the client ids carried by tbl's index.
    cohort = {k: tbl.loc[labels == v].index.to_numpy() for k, v in labelVal.items()}

    return cohort


In [None]:
def cluster_stats(labels):
    """Print the size of every cohort and its share of all cohort members.

    labels maps a cohort name to a sized collection of member ids; the
    denominator is the sum of all cohort sizes.
    """
    total = sum(len(members) for members in labels.values())

    for name, members in labels.items():
        size = len(members)
        print(f'{name}: {size}/{total} ({100*size/total:.2f}%)')
    

In [None]:
# labels maps a data source to its cohorts; 'Cntrl' holds the cohorts
# clustered from all shelters' records combined.
labels = { 'Cntrl': gen_cluster_labels(tblStayGap) }

In [None]:
# Cohort sizes for the centralized labels.
cluster_stats(labels['Cntrl'])

In [None]:
# Alias for the centralized cohort dict, consumed by the export cell below.
element = labels['Cntrl']

In [None]:
# Flatten the cohort dict into a long table: one row per client, with the
# cohort name ('Trn' / 'Epi' / 'Chr') stored in the ListNumber column.
frames = []
for key, values in element.items():
    temp_df = pd.DataFrame({'ClientId': values, 'ListNumber': key})
    frames.append(temp_df)

Labels_df = pd.concat(frames).reset_index(drop=True)

In [None]:
# Order the label table by ClientId and renumber the rows.
Labels_df = Labels_df.sort_values(by='ClientId').reset_index(drop=True)


In [None]:
def plot_clusters(tbl,labels):
    """Scatter-plot every cohort as episodes (x) against stays (y).

    tbl must expose NEpisodes and NStays columns and be indexed by the ids
    stored in labels; labels maps 'Trn' / 'Epi' / 'Chr' to arrays of ids.
    """
    palette = { 'Trn': '#919191', 'Epi': '#474747', 'Chr': '#c7c7c7' }

    plt.rcParams['font.size'] = 16
    fig,ax = plt.subplots(figsize=(16,12))

    for name, members in labels.items():
        rows = tbl.loc[members]
        plt.plot(
            rows.NEpisodes.to_numpy(),
            rows.NStays.to_numpy(),
            color=palette[name],
            marker='o',
            ls='None',
            label=name,
        )

    plt.xlabel('Total Number of Episodes',fontsize=16)
    plt.ylabel('Total Number of Stays',fontsize=16)
    plt.legend(fontsize=16)
    #plt.savefig('ClusterResults.eps')
    plt.show()

In [None]:
# Visualise the centralized cohorts.
plot_clusters(tblStayGap,labels['Cntrl'])

## Centralized Model for Chronic Shelter Use Prediction
---
- A simple threshold-based model on the number of days a client stayed during their first $T_O$ days in shelter.

In [None]:
tO = 120 # Model observation window (days)
seed = 12 # random_state passed to the train/test split and the CV folds

In [None]:
# Feature: per client, the number of distinct day offsets (measured from the
# client's first record) that are strictly below tO.
x = tbl.groupby('ClientId').progress_apply(lambda x: ((x.Date - x.Date.min()).dt.days.drop_duplicates() < tO).sum())

In [None]:
# Target: True where the client belongs to the centralized chronic cohort.
y = x.index.isin(labels['Cntrl']['Chr'])

In [None]:
# Single stratified 70/30 split; the results are positional indices into x and y.
trainIdx, testIdx = next( StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=seed).split(x,y) )

#### Hyper Parameter Tuning

In [None]:
# Candidate thresholds 5, 6, ..., 84 (np.arange excludes the stop value).
thrshVals = np.arange(5,85,dtype='int')

In [None]:
# Training portion of the feature and target as NumPy arrays.
xTrn = x.to_numpy()[trainIdx]
yTrn = y[trainIdx]

In [None]:
def evaluate_model(x,y,thrsh):
    """Build the confusion matrix of the rule ``x >= thrsh`` against y.

    x is a numeric array, y a boolean array of the same length. The result
    is a 2x2 integer array laid out as [[TP, FP], [FN, TN]].
    """
    flagged = x >= thrsh
    cleared = ~flagged
    negative = ~y

    counts = [
        [(flagged & y).sum(), (flagged & negative).sum()],   # TP, FP
        [(cleared & y).sum(), (cleared & negative).sum()],   # FN, TN
    ]
    return np.array(counts,dtype='int')

In [None]:
def calc_performance(cMtx):
    """Derive precision, recall and F-score from a confusion matrix.

    Parameters
    ----------
    cMtx : numpy.ndarray
        2x2 array laid out as [[TP, FP], [FN, TN]].

    Returns
    -------
    tuple of float
        (precision, recall, fscore). A metric whose denominator is zero (for
        example precision when nothing was predicted positive) is reported as
        0.0 instead of NaN, so averages and argmax over scores stay well
        defined.
    """
    tP,fP,fN = int(cMtx[0,0]),int(cMtx[0,1]),int(cMtx[1,0])

    def _ratio(num,den):
        # Guarded division: an undefined metric counts as the worst score.
        return num/den if den else 0.0

    precision = _ratio(tP,tP+fP)
    recall = _ratio(tP,tP+fN)
    fscore = _ratio(2*tP,2*tP+fN+fP)
    return precision,recall,fscore

In [None]:
def tune_hyperparameters(x,y,nSplit):
    """Choose the threshold with the best cross-validated F-score.

    Parameters
    ----------
    x : numpy.ndarray
        Feature values, one per client.
    y : numpy.ndarray
        Boolean targets aligned with x.
    nSplit : int
        Number of stratified cross-validation folds.

    Returns
    -------
    The entry of thrshVals whose F-score, averaged over the folds, is highest.
    The winning threshold and its mean precision / recall / F-score are also
    printed.
    """
    scoreShape = (nSplit,len(thrshVals))
    precision = np.zeros(scoreShape)
    recall = np.zeros(scoreShape)
    fscore = np.zeros(scoreShape)

    # The fold count follows nSplit. With a hard-coded fold count, any other
    # nSplit would leave the score arrays above sized differently from the
    # number of folds actually produced.
    folds = StratifiedKFold(n_splits=nSplit,shuffle=True,random_state=seed)
    for i,(_,iSpltVal) in enumerate(folds.split(x,y)):

        # No training since it's just a threshold test.

        # Validate hyperparameter settings (ie. the threshold value)
        for j,thrsh in enumerate(thrshVals):
            cMtx = evaluate_model(x[iSpltVal],y[iSpltVal],thrsh)
            precision[i,j],recall[i,j],fscore[i,j] = calc_performance(cMtx)

    # Average every metric over the folds, leaving one value per threshold.
    precision = precision.mean(axis=0)
    recall = recall.mean(axis=0)
    fscore = fscore.mean(axis=0)

    iBest = np.argmax(fscore)

    print(f'Best Threshold: {thrshVals[iBest]} (Precision: {precision[iBest]:.3f}, Recall: {recall[iBest]:.3f}, FScore: {fscore[iBest]:.3f})')

    return thrshVals[iBest]

In [None]:
# Tune the threshold on the training portion only.
bestThrsh = tune_hyperparameters(xTrn,yTrn,nSplit=5)

#### Test

In [None]:
# Held-out portion of the feature and target.
xTest = x.to_numpy()[testIdx]
yTest = y[testIdx]

In [None]:
# Score the tuned threshold on the held-out clients.
cMtx = evaluate_model(xTest,yTest,bestThrsh)
precision,recall,fscore = calc_performance(cMtx)
print(f'Test Performance - Precision: {precision:.3f}, Recall: {recall:.3f}, FScore: {fscore:.3f}')

## Labelling at Individual Shelters
---
- Assume the centralized labels are the ground truth and compare how close labels generated in each shelter get.

In [None]:
# Despite the name, this is the Series of distinct Agency values (the shelters
# themselves), not a count; the loops below iterate over it.
nShelters = tbl['Agency'].drop_duplicates()

In [None]:
# For every shelter, redo the clustering with that shelter's records only and
# score its 'Chr' cohort against the centralized 'Chr' cohort.
for shelter in nShelters:
    
    print(f'\n--- Shelter {shelter} ---')
    # Stay / episode counts as seen from this shelter alone.
    tblStayGapShelter = tbl[tbl.Agency == shelter].groupby('ClientId').apply(calc_stays_and_gaps)
    labels[shelter] = gen_cluster_labels(tblStayGapShelter)      
    
    print(f'{len(tblStayGapShelter.index)} people.')
    cluster_stats(labels[shelter])
    
    # tP: chronic locally and centrally. fP: chronic locally only.
    # fN: centrally chronic clients that the shelter put in 'Epi' or 'Trn'.
    tP = np.isin(labels[shelter]['Chr'],labels['Cntrl']['Chr']).sum()
    fP = (~np.isin(labels[shelter]['Chr'],labels['Cntrl']['Chr'])).sum()
    fN = np.isin(labels[shelter]['Epi'],labels['Cntrl']['Chr']).sum() + np.isin(labels[shelter]['Trn'],labels['Cntrl']['Chr']).sum()
    
    # NOTE(review): tP+fP or tP+fN can be zero for a shelter with no chronic
    # clients; these look like NumPy integers, so the ratio would print as nan
    # with a runtime warning — confirm that is acceptable.
    print(f'Precision: {tP/(tP+fP):.3f}, Recall: {tP/(tP+fN):.3f}')


## Chronic Shelter Use Prediction at Individual Shelters
---

In [None]:
# Per shelter: tune the threshold with the shelter's own labels, then test it
# against the centralized labels.
# NOTE: this loop reassigns the notebook-level x, y, trainIdx, testIdx, xTrn,
# yTrn, xTest, yTest, bestThrsh, cMtx, precision, recall and fscore, so the
# centralized values computed earlier are overwritten once it has run.
for shelter in nShelters:
    
    print(f'\n--- Shelter {shelter} ---')
    
    # Same feature as the centralized model, but offsets are measured from the
    # client's first record at this shelter.
    x = tbl.loc[tbl.Agency==shelter].groupby('ClientId').apply(lambda x: ((x.Date - x.Date.min()).dt.days.drop_duplicates() < tO).sum())
    
    # Select hyperparameters with labels available in shelter.
    y = x.index.isin(labels[shelter]['Chr'])
    # The split is stratified on the shelter's own labels.
    trainIdx, testIdx = next( StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=seed).split(x,y) )    
    
    xTrn = x.to_numpy()[trainIdx]
    yTrn = y[trainIdx]    
    bestThrsh = tune_hyperparameters(xTrn,yTrn,nSplit=5)    
    
    # Test with the centralized labels.
    y = x.index.isin(labels['Cntrl']['Chr'])    
    xTest = x.to_numpy()[testIdx]
    yTest = y[testIdx]
    
    cMtx = evaluate_model(xTest,yTest,bestThrsh)
    precision,recall,fscore = calc_performance(cMtx)
    print(f'Test Performance - Precision: {precision:.3f}, Recall: {recall:.3f}, FScore: {fscore:.3f}')    

Step 1: Find the total time frame.



Step 2: Implement windowing over roughly the first 4 months of each client's shelter use (`Data_Days` = 120 days).

In [None]:
import pandas as pd

# Make sure Date is datetime before the window arithmetic below.
tbl['Date'] = pd.to_datetime(tbl['Date'])

def filter_group(group):
    """Restrict one client's records to the feature window, one row per date.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of a single client; must contain a datetime ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The rows dated less than ``Data_Days`` days after the client's first
        record, sorted by ``Date``, with a single row kept for each distinct
        ``Date`` value.
    """
    # Stable sort: among rows sharing a Date, the original row order is
    # preserved, so the row kept by drop_duplicates below is deterministic.
    group = group.sort_values('Date', kind='mergesort')
    window_end = group['Date'].min() + pd.Timedelta(days=Data_Days)
    # Strict '<': the counting windows built by aggregate_to_30_day_windows
    # cover [first day, first day + Data_Days), so a record exactly Data_Days
    # after the first one would be kept here yet fall into no window.
    group = group[group['Date'] < window_end]
    # keep='first' collapses repeated dates to one stay. keep=False would
    # discard *every* row of a repeated date and silently delete that stay.
    return group.drop_duplicates(subset='Date', keep='first')

# Per-client feature-window records. tbl itself is left untouched this time.
filtered_tbl = tbl.groupby('ClientId').apply(filter_group).reset_index(drop=True)


In [None]:
episodeGap = 30 # days

def calc_stays_and_gaps(tbl):
    """Return the stay count and episode count for one client's records.

    NStays counts the distinct Date values. NEpisodes starts at one and grows
    by one for every gap of at least episodeGap days between consecutive
    distinct dates.
    """
    visit_days = tbl.Date.drop_duplicates().sort_values()
    long_gap = visit_days.diff() >= pd.Timedelta(f'{episodeGap} day')
    # The leading NaT produced by diff() compares False, so it adds no episode.
    n_episodes = int(long_gap.sum()) + 1

    return pd.Series({ 'NStays': len(visit_days), 'NEpisodes': n_episodes })

In [None]:
# NStays / NEpisodes per client, restricted to the feature window.
filtered_tbl_2 = filtered_tbl.groupby('ClientId').progress_apply(calc_stays_and_gaps)

In [None]:
# Mark every remaining record with a constant Sleep value of 1.
filtered_tbl['Sleep'] = 1

In [None]:
import pandas as pd

def aggregate_to_30_day_windows(group):
    """Count one client's records in consecutive equal-length windows.

    Despite the name, the window length is ``Data_Days / Data_periods`` days;
    ``Data_periods`` windows are laid back to back starting at the client's
    earliest ``Date``.

    Parameters
    ----------
    group : pandas.DataFrame
        One client's records as handed over by ``groupby('ClientId').apply``;
        needs a datetime ``Date`` column, and ``group.name`` is read as the
        client id.

    Returns
    -------
    pandas.DataFrame
        One row per window, indexed by window start, with the record count in
        ``Sleep`` and the client id in ``ClientId``.
    """
    # One window length drives both the starts and the ends, so the windows
    # stay contiguous even when Data_Days is not a multiple of Data_periods.
    window = pd.Timedelta(days=Data_Days/Data_periods)
    # Read the dates without touching the caller's frame (no in-place set_index).
    dates = pd.DatetimeIndex(group['Date'])
    first_day = dates.min()

    starts = [first_day + i*window for i in range(Data_periods)]
    # Half-open windows [start, start + window): a record is counted exactly once.
    counts = [int(((dates >= start) & (dates < start + window)).sum()) for start in starts]

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    final_group = pd.DataFrame({'Sleep': counts}, index=pd.DatetimeIndex(starts))
    final_group['ClientId'] = group.name  # Add the ClientId to each row
    return final_group

# Long table: Data_periods rows per client, one per window. The window-start
# index is discarded by reset_index(drop=True).
aggregated_tbl = filtered_tbl.groupby('ClientId').apply(aggregate_to_30_day_windows).reset_index(drop=True)



In [None]:
# 1-based position of each row within its client: 1, 2, ..., 1, 2, ...
# (cumcount starts at 0, hence the + 1).
helper_column = (aggregated_tbl.groupby('ClientId').cumcount() + 1).astype(str)

# Column names for the wide table: Sleep_1, Sleep_2, ...
aggregated_tbl['CountId'] = 'Sleep_' + helper_column

# Pivot to one row per client with one Sleep_k column per window.
pivot_table = aggregated_tbl.pivot(index='ClientId', columns='CountId', values='Sleep')

# Reset the index to turn ClientId into a column
pivot_table.reset_index(inplace=True)

In [None]:
# First remaining record of every client; used below to look up an Agency per
# client.
unique_clients = filtered_tbl.drop_duplicates(subset='ClientId', keep='first')

In [None]:
# Set ClientId as index for easy lookup (both frames are modified in place).
pivot_table.set_index('ClientId', inplace=True)
unique_clients.set_index('ClientId', inplace=True)

# Fill pivot_table's Agency column by looking each ClientId up in unique_clients.
pivot_table['Agency'] = pivot_table.index.map(unique_clients['Agency'])

# Reset index if you want ClientId as a column
pivot_table.reset_index(inplace=True)


Combine Shelters over ClientID

Run a model in Pytorch

In [None]:
# Replace any missing values in the wide table with 0.
pivot_table = pivot_table.fillna(0)

In [None]:
# Turn ClientId back into a regular column of unique_clients.
unique_clients.reset_index(inplace=True)

In [None]:
# Turn the ClientId index of the stay/episode table into a regular column.
filtered_tbl_2.reset_index(inplace=True)

In [None]:
# Attach the per-client stay statistics and agency by ClientId. Plain column
# assignment aligns on the row index, i.e. by position here, which silently
# mislabels clients whenever the three frames differ in order or length.
stay_stats = filtered_tbl_2.set_index('ClientId')
pivot_table['Episodes'] = pivot_table['ClientId'].map(stay_stats['NEpisodes'])
pivot_table['Stays'] = pivot_table['ClientId'].map(stay_stats['NStays'])
pivot_table['Agency'] = pivot_table['ClientId'].map(unique_clients.set_index('ClientId')['Agency'])

In [None]:
# Save the wide per-client feature table (pivot_table) as a CSV file.
# NOTE(review): presumably the CHF_Data_1/ directory already exists; nothing
# in the visible code creates it — confirm.
file_name = f'CHF_Data_1/CHF_{Data_Days}D_{Data_periods}W.csv'
pivot_table.to_csv(file_name, index=False)

# Save the centralized cohort labels (Labels_df) as a CSV file.
file_name = f'CHF_Data_1/CHF_Labels_{LDays}.csv'
Labels_df.to_csv(file_name, index=False)
