# CHF Shelter Data Federated Learning Demo

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit


from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

Agencies: 4, 13, 55, 188, 213, 225, 330, 333

In [None]:
# Notebook-wide settings.
LDays = 548  # per-client window in days, counted from the client's first date (read by filter_group); also embedded in the output CSV name
Data_Days = 90  # not referenced in the cells shown here -- TODO confirm where it is used
Data_periods = 10  # not referenced in the cells shown here -- TODO confirm where it is used
Agency = 333  # only appears in commented-out code in the cells shown here

In [None]:
# Lift pandas' column display limit so wide tables are shown without truncated columns.
pd.set_option('display.max_columns', None)

In [None]:
# Load the merged shelter records; later cells read its Date, ClientId and Agency columns.
tbl = pd.read_parquet('MergedShelterData-Nov17.parquet')

In [None]:
# Display the loaded table for a quick visual check.
tbl

In [None]:
import pandas as pd

# Convert the Date column to pandas datetimes so the date comparisons and Timedelta arithmetic below work.
tbl['Date'] = pd.to_datetime(tbl['Date'])

def filter_group(group, max_days=None):
    """Trim one client's records to the observation window and drop ambiguous days.

    Args:
        group: DataFrame holding the rows of a single client; needs a
            datetime-like ``Date`` column.
        max_days: Window length in days, counted from the client's earliest
            date; a record exactly ``max_days`` after the first date is kept.
            When omitted, the notebook-level ``LDays`` setting is used.

    Returns:
        The rows inside the window, sorted by ``Date``. A date that occurs
        more than once for the client is removed entirely (``keep=False``),
        not reduced to a single row.
    """
    if max_days is None:
        max_days = LDays
    group = group.sort_values('Date')
    # Keep only dates within the first `max_days` days of this client's history.
    cutoff = group['Date'].min() + pd.Timedelta(days=max_days)
    group = group[group['Date'] <= cutoff]
    # keep=False discards every row of a repeated date, including the first occurrence.
    return group.drop_duplicates(subset='Date', keep=False)

# Run filter_group on each client's rows, then discard the index that apply() builds so rows are renumbered 0..n-1.
# NOTE(review): recent pandas releases changed whether apply() hands the grouping column to the function -- confirm ClientId is still a column afterwards on the installed version.
tbl = tbl.groupby('ClientId').apply(filter_group).reset_index(drop=True)

## EDA
---

In [None]:
# Report the earliest and latest Date present in the table.
print(f'Dates: {tbl.Date.min()} to {tbl.Date.max()}')

In [None]:
# Count the distinct client identifiers in the table.
nPrsn = tbl['ClientId'].unique().size
print(f'{nPrsn} people in the data.')

In [None]:
# Report how many distinct Agency values (shelters) occur in the table.
print(f'{len(tbl.Agency.unique())} different shelters.')

#### Number of people who use different shelters.

Total Population

In [None]:
# For every client, count the distinct agencies found in their records (tqdm progress bar via progress_apply).
nShelter = tbl.groupby('ClientId').progress_apply(lambda x: len(x.Agency.unique()))

In [None]:
def number_of_shelter_breakdown(nShelter,nPrsn):
    """Print, for each distinct shelter count, how many people had that count.

    nShelter: pandas Series with one shelter count per person.
    nPrsn: population size used as the denominator of the printed percentage.
    """
    tally = nShelter.value_counts().sort_index()
    for shelters_used, people in tally.items():
        share = 100*people/nPrsn
        print(f'{people}/{nPrsn} ({share:.2f}%) people used {shelters_used} shelters.')

In [None]:
# Shelter-count breakdown over the whole population.
number_of_shelter_breakdown(nShelter,nPrsn)

Heavy System Users

In [None]:
# Number of non-null Date entries per client, i.e. how many records each client has.
nStay = tbl.groupby('ClientId').Date.count()

In [None]:
# Heavy users: clients ranked at or above the heavyPctl position when ordered by record count.
heavyPctl = 0.95
first_heavy_rank = int(nPrsn*heavyPctl)
stays_ascending = nStay.sort_values()
heavyIds = stays_ascending.index[first_heavy_rank:].to_numpy()

In [None]:
# Same breakdown restricted to the heavy users; percentages are relative to the number of heavy users.
number_of_shelter_breakdown(nShelter[heavyIds],len(heavyIds))

In [None]:
# Split the table into one DataFrame per agency, keyed by the agency value.
unique_agencies = tbl['Agency'].unique()
agency_dfs = {
    agency: tbl[tbl['Agency'] == agency]
    for agency in unique_agencies
}

In [None]:
# Pull out the frames of the eight agencies used in this demo.
# A KeyError here means the agency id does not occur in the table.
(
    df_agency_4,
    df_agency_13,
    df_agency_55,
    df_agency_188,
    df_agency_213,
    df_agency_225,
    df_agency_330,
    df_agency_333,
) = (agency_dfs[agency_id] for agency_id in (4, 13, 55, 188, 213, 225, 330, 333))

In [None]:
episodeGap = 30 # days

def calc_stays_and_gaps(tbl):
    """Summarise one client's records as a stay count and an episode count.

    tbl: DataFrame with a ``Date`` column (datetime-like).

    Returns a Series with
      NStays    -- number of distinct dates, and
      NEpisodes -- 1 plus the number of gaps between consecutive distinct
                   dates that are at least ``episodeGap`` days long.
    """
    distinct_days = tbl['Date'].drop_duplicates().sort_values()
    min_break = pd.Timedelta(f'{episodeGap} day')
    # The first diff is NaT and compares False, so it never opens an episode.
    starts_new_episode = distinct_days.diff() >= min_break
    return pd.Series({
        'NStays': len(distinct_days),
        'NEpisodes': int(starts_new_episode.sum()) + 1,
    })

In [None]:
# Per-agency client features: one row per ClientId holding NStays and NEpisodes.
# The generator is consumed in order, so the progress bars appear agency by agency.
(
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G,
    df_agency_330G,
    df_agency_333G,
) = (
    agency_rows.groupby('ClientId').progress_apply(calc_stays_and_gaps)
    for agency_rows in (
        df_agency_4,
        df_agency_13,
        df_agency_55,
        df_agency_188,
        df_agency_213,
        df_agency_225,
        df_agency_330,
        df_agency_333,
    )
)

In [None]:
# Move ClientId out of the index into an ordinary column, modifying each frame in place.
for _features in (
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G,
    df_agency_330G,
    df_agency_333G,
):
    _features.reset_index(inplace=True)

In [None]:
def _suffix_client_ids(features, agency_id):
    """Append '_<agency_id>' to every ClientId (as text) and index the frame by it."""
    features['ClientId'] = features['ClientId'].astype(str) + f'_{agency_id}'
    return features.set_index('ClientId')


# Tag each client id with its agency so ids from different agencies stay
# distinct once the per-agency results are combined.
df_agency_4G = _suffix_client_ids(df_agency_4G, 4)
df_agency_13G = _suffix_client_ids(df_agency_13G, 13)
df_agency_55G = _suffix_client_ids(df_agency_55G, 55)
df_agency_188G = _suffix_client_ids(df_agency_188G, 188)
df_agency_213G = _suffix_client_ids(df_agency_213G, 213)
df_agency_225G = _suffix_client_ids(df_agency_225G, 225)
df_agency_330G = _suffix_client_ids(df_agency_330G, 330)
df_agency_333G = _suffix_client_ids(df_agency_333G, 333)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

def gen_cluster_labels(tbl):
    """Cluster clients with 3-means on z-scored features and name the clusters.

    Args:
        tbl: DataFrame of numeric features, one row per client. The naming
            rules read column 0 and column 1 (NStays and NEpisodes when the
            frame was built by calc_stays_and_gaps).

    Returns:
        dict with keys 'Trn', 'Epi' and 'Chr', each mapping to a NumPy array
        of the index labels of the rows assigned to that cluster.
    """
    dat = tbl.to_numpy()
    # A constant column has zero spread; dividing by it gives NaN, which the
    # clustering step cannot accept. Scale such columns by 1 (they become all zeros).
    spread = np.sqrt(dat.var(axis=0))
    spread = np.where(spread == 0, 1.0, spread)
    nrm = (dat - dat.mean(axis=0)) / spread

    kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_

    # NOTE(review): nothing forces the three picks below to be distinct. If two
    # names select the same cluster, its rows appear under both names and the
    # remaining cluster's rows are returned under none.
    labelVal = {
        'Trn': np.argmin(centers.sum(axis=1)),  # Transitional: smallest combined centroid.
        'Epi': np.argmax(centers[:, 1]),        # Episodic: largest column-1 (episodes) centroid.
        'Chr': np.argmax(centers[:, 0]),        # Chronic: largest column-0 (stays) centroid.
    }

    print("Sum of centroids:", kmeans.cluster_centers_.sum(axis=1))
    print("Max in second feature:", kmeans.cluster_centers_[:, 1])
    print("Max in first feature:", kmeans.cluster_centers_[:, 0])

    return {
        name: tbl.loc[labels == cluster].index.to_numpy()
        for name, cluster in labelVal.items()
    }


In [None]:
from scipy.spatial.distance import cdist

def predict_global_model(tbl, centroids):
    """Assign each row of ``tbl`` to its nearest centroid and name the clusters.

    Args:
        tbl: DataFrame of numeric features, one row per client. Features are
            z-scored with this table's own column means and spreads.
        centroids: 2-D array with one row per cluster and one column per
            feature. The naming rules read column 0 and column 1 (NStays and
            NEpisodes when ``tbl`` was built by calc_stays_and_gaps).

    Returns:
        dict with keys 'Trn', 'Epi' and 'Chr', each mapping to a NumPy array
        of the index labels of the rows nearest to that cluster's centroid.

    Raises:
        ValueError: if ``tbl`` and ``centroids`` disagree on the feature count.
    """
    dat = tbl.to_numpy()
    # A constant column has zero spread; 0/0 would make every distance NaN and
    # argmin would then put every row in cluster 0. Scale such columns by 1.
    spread = np.sqrt(dat.var(axis=0))
    spread = np.where(spread == 0, 1.0, spread)
    nrm = (dat - dat.mean(axis=0)) / spread

    if nrm.shape[1] != centroids.shape[1]:
        raise ValueError("Number of features in the data does not match the number of features in the centroids.")

    # Nearest centroid by Euclidean distance.
    distances = cdist(nrm, centroids, 'euclidean')
    labels = np.argmin(distances, axis=1)

    # NOTE(review): nothing forces the three picks below to be distinct. If two
    # names select the same centroid, its rows appear under both names and the
    # remaining centroid's rows are returned under none.
    labelVal = {
        'Trn': np.argmin(centroids.sum(axis=1)),  # Transitional: smallest combined centroid.
        'Epi': np.argmax(centroids[:, 1]),        # Episodic: largest column-1 (episodes) centroid.
        'Chr': np.argmax(centroids[:, 0]),        # Chronic: largest column-0 (stays) centroid.
    }

    return {
        name: tbl.index[labels == cluster].to_numpy()
        for name, cluster in labelVal.items()
    }


In [None]:
# Per-agency feature tables. The centroid list and the weight list built in
# later cells follow this same order.
dataframes = [
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G, 
    df_agency_330G,
    df_agency_333G
]

In [None]:
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

# Fit a local 3-means model per agency and collect the centroids for averaging.
# K-means numbers its clusters arbitrarily, so row i of one agency's centroid
# matrix need not describe the same kind of client as row i of another's.
# Every matrix after the first is therefore re-ordered to line up with the
# first agency's centroids (minimum total Euclidean distance matching) before
# it is stored; a later row-by-row average then combines like with like.
centroids = []  # one (3, n_features) array per entry of `dataframes`, rows aligned across agencies
for df in dataframes:
    dat = df.to_numpy()
    # A constant column has zero spread; 0/0 would produce NaN features. Scale it by 1.
    spread = np.sqrt(dat.var(axis=0))
    spread = np.where(spread == 0, 1.0, spread)
    nrm = (dat - dat.mean(axis=0)) / spread
    kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
    local_centroids = kmeans.cluster_centers_
    if centroids:
        # match[i] is the local cluster paired with reference cluster i.
        _, match = linear_sum_assignment(cdist(centroids[0], local_centroids))
        local_centroids = local_centroids[match]
    centroids.append(local_centroids)


In [None]:
# Weight every agency by its share of the total number of rows (clients) across all agencies.
weights = [frame.shape[0] for frame in dataframes]
total_weight = sum(weights)
normalized_weights = [row_count / total_weight for row_count in weights]


In [None]:
# Weighted average of the per-agency centroid matrices. The weights already sum to 1,
# so no further division by the number of agencies is applied.
# NOTE(review): the average is element-wise, so it presumes that row i refers to the
# same cluster in every agency's matrix.
average_centroids = sum(w * c for w, c in zip(normalized_weights, centroids))

In [None]:
# Label every agency's clients against the shared (averaged) centroids.
(
    global_labels_4,
    global_labels_13,
    global_labels_55,
    global_labels_188,
    global_labels_213,
    global_labels_225,
    global_labels_330,
    global_labels_333,
) = (
    predict_global_model(agency_features, average_centroids)
    for agency_features in (
        df_agency_4G,
        df_agency_13G,
        df_agency_55G,
        df_agency_188G,
        df_agency_213G,
        df_agency_225G,
        df_agency_330G,
        df_agency_333G,
    )
)

In [None]:
# Accumulators for the cohort members of all agencies, one list per cohort name.
combined_labels = {'Trn': [], 'Epi': [], 'Chr': []}

In [None]:
# Per-agency cohort dictionaries, merged cohort by cohort into combined_labels.
dict_list = [
    global_labels_4,
    global_labels_13,
    global_labels_55,
    global_labels_188,
    global_labels_213,
    global_labels_225,
    global_labels_330,
    global_labels_333,
]

for key, bucket in combined_labels.items():
    for d in dict_list:
        bucket.extend(d[key])

In [None]:
# Total number of labelled entries across the three cohort lists.
total_elements = sum(map(len, combined_labels.values()))
print("Total number of elements in all lists:", total_elements)

In [None]:
# One (cohort label, client id) pair per labelled client, in cohort order.
data = [
    (label, value)
    for label, values in combined_labels.items()
    for value in values
]

# Two-column table: the cohort name and the agency-suffixed client id.
combined_df = pd.DataFrame(data, columns=['Label', 'Value'])

print("Number of rows in the DataFrame:", len(combined_df))

In [None]:
from pathlib import Path

# Save combined_df as a CSV file; LDays is embedded in the file name.
file_name = f'CHF_Data_1/CHF_Labels_FL2_{LDays}.csv'
# to_csv does not create missing folders, so make sure the target directory exists first.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(file_name, index=False)