# CHF Shelter Data Federated Learning Demo

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit


# NOTE: the second import rebinds `tqdm`, so the `tqdm` imported from
# `tqdm.auto` on the first line is shadowed by the notebook variant;
# `trange` still comes from `tqdm.auto`.
from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
# Register `progress_apply` on pandas objects; later groupby cells call it.
tqdm.pandas()

Agencies: 4, 13, 55, 188, 213, 225, 330, 333

In [None]:
# Notebook-wide settings.
# LDays is embedded in the name of the exported labels CSV at the end of the
# notebook.
LDays = 548
# NOTE(review): Data_Days, Data_periods and Agency are not read anywhere in the
# visible part of the notebook — presumably consumed by later steps; confirm.
Data_Days = 90
Data_periods = 10
Agency = 333

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Load the merged shelter records. Later cells read the Date, ClientId and
# Agency columns of this table.
tbl = pd.read_parquet('MergedShelterData-Nov17.parquet')

In [None]:
# Display the loaded table for a quick visual check.
tbl

## EDA
---

In [None]:
# Report the earliest and latest value of the Date column.
print(f'Dates: {tbl.Date.min()} to {tbl.Date.max()}')

In [None]:
# Number of distinct client ids in the table (a missing id, if present,
# counts as one value).
nPrsn = tbl.ClientId.nunique(dropna=False)
print(f'{nPrsn} people in the data.')

In [None]:
# Number of distinct agencies (shelters) that appear in the table.
print(f'{tbl.Agency.nunique(dropna=False)} different shelters.')

#### Number of people who use different shelters.

Total Population

In [None]:
# Count the distinct agencies each client appears at. The grouped `nunique`
# is vectorised, so it avoids calling a Python lambda once per client;
# dropna=False counts a missing Agency as its own value, which matches the
# previous `len(x.Agency.unique())` result.
nShelter = tbl.groupby('ClientId').Agency.nunique(dropna=False)

In [None]:
def number_of_shelter_breakdown(nShelter,nPrsn):
    """Print how many people used each distinct number of shelters.

    nShelter: Series holding, per person, the number of shelters used.
    nPrsn:    denominator used for the printed fraction and percentage.

    One line is printed per distinct shelter count, in ascending order.
    """
    tally = nShelter.value_counts().sort_index()
    for shelters_used in tally.index:
        people = tally.loc[shelters_used]
        share = 100*people/nPrsn
        print(f'{people}/{nPrsn} ({share:.2f}%) people used {shelters_used} shelters.')

In [None]:
# Shelter-count breakdown over every client in the table.
number_of_shelter_breakdown(nShelter,nPrsn)

Heavy System Users

In [None]:
# Number of non-null Date rows per client. Repeated dates are not collapsed
# here, so this is a record count rather than a count of distinct days.
nStay = tbl.groupby('ClientId').Date.count()

In [None]:
heavyPctl = 0.95
# Order clients by record count and keep the ids past the percentile cut-off,
# i.e. the heaviest users.
heavyIds = nStay.sort_values().index[int(nPrsn*heavyPctl):].to_numpy()

In [None]:
# Same breakdown, restricted to the heavy users selected above.
number_of_shelter_breakdown(nShelter[heavyIds],len(heavyIds))

In [None]:
# Split the table into one sub-table per agency, keyed by agency value.
unique_agencies = tbl['Agency'].unique()
agency_dfs = {
    agency_id: tbl[tbl['Agency'] == agency_id]
    for agency_id in unique_agencies
}

In [None]:
# Give each agency's rows its own name (generic form: agency_dfs[Agency]).
(
    df_agency_4,
    df_agency_13,
    df_agency_55,
    df_agency_188,
    df_agency_213,
    df_agency_225,
    df_agency_330,
    df_agency_333,
) = (
    agency_dfs[agency_id]
    for agency_id in (4, 13, 55, 188, 213, 225, 330, 333)
)

In [None]:
episodeGap = 30 # days

def calc_stays_and_gaps(tbl, gap_days=None):
    """Summarise one client's records as a stay count and an episode count.

    Parameters
    ----------
    tbl : DataFrame with a ``Date`` column (one client's records).
    gap_days : minimum gap, in days, between consecutive distinct dates that
        starts a new episode. ``None`` (the default) uses the notebook-level
        ``episodeGap``, so existing one-argument callers behave as before.

    Returns
    -------
    Series with ``NStays`` (number of distinct dates) and ``NEpisodes``
    (number of gaps at or above the threshold, plus one). An input without
    any dates yields zero for both instead of a phantom single episode.
    """
    if gap_days is None:
        gap_days = episodeGap

    stay_dates = tbl.Date.drop_duplicates().sort_values()
    n_stays = len(stay_dates)
    if n_stays == 0:
        # No stays at all means no episodes either.
        return pd.Series({ 'NStays': 0, 'NEpisodes': 0 })

    # The first difference is NaT and never satisfies the comparison, so only
    # real gaps between consecutive dates are counted.
    gaps = stay_dates.diff()
    n_episodes = int((gaps >= pd.Timedelta(days=gap_days)).sum()) + 1

    return pd.Series({ 'NStays': n_stays, 'NEpisodes': n_episodes })

In [None]:
# Per agency: one row per client with that client's stay and episode counts.
(
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G,
    df_agency_330G,
    df_agency_333G,
) = (
    agency_rows.groupby('ClientId').progress_apply(calc_stays_and_gaps)
    for agency_rows in (
        df_agency_4,
        df_agency_13,
        df_agency_55,
        df_agency_188,
        df_agency_213,
        df_agency_225,
        df_agency_330,
        df_agency_333,
    )
)

In [None]:
# Move ClientId out of the index into a regular column, in place.
for stay_summary in (
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G,
    df_agency_330G,
    df_agency_333G,
):
    stay_summary.reset_index(inplace=True)

In [None]:
# df_agency_4G.drop(columns=['Date'], inplace=True)
# df_agency_13G.drop(columns=['Date'], inplace=True)
# df_agency_55G.drop(columns=['Date'], inplace=True)
# df_agency_188G.drop(columns=['Date'], inplace=True)
# df_agency_213G.drop(columns=['Date'], inplace=True)
# df_agency_225G.drop(columns=['Date'], inplace=True)
# df_agency_330G.drop(columns=['Date'], inplace=True)
# df_agency_333G.drop(columns=['Date'], inplace=True)

In [None]:
# Tag every client id with its agency (for example "<id>_4"), then make the
# tagged id the index again.
for id_suffix, stay_summary in (
    ('_4', df_agency_4G),
    ('_13', df_agency_13G),
    ('_55', df_agency_55G),
    ('_188', df_agency_188G),
    ('_213', df_agency_213G),
    ('_225', df_agency_225G),
    ('_330', df_agency_330G),
    ('_333', df_agency_333G),
):
    stay_summary['ClientId'] = stay_summary['ClientId'].astype(str) + id_suffix
    stay_summary.set_index('ClientId', inplace=True)

In [None]:
# def gen_cluster_labels(tbl):
#     dat = tbl.to_numpy()
#     nrm = (dat - dat.mean(axis=0))/np.sqrt(dat.var(axis=0))

#     kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
#     labels = kmeans.labels_    

#     labelVal = {}
#     labelVal['Trn'] = np.argmin(kmeans.cluster_centers_.sum(axis=1)) # Transitional: Fewest stays and episodes.
#     labelVal['Epi'] = np.argmax(kmeans.cluster_centers_[:,1])  # Episodic: Most episodes.
#     labelVal['Chr'] = np.argmax(kmeans.cluster_centers_[:,0])  # Chronic: Most stays.

#     cohort = {}
#     for k in labelVal.keys():
#         cohort[k] = tbl.loc[labels == labelVal[k]].index.to_numpy()

#     return cohort

In [None]:
# from sklearn.cluster import KMeans
# import numpy as np

# def gen_cluster_labels(tbl):
#     # Print the number of elements in the input DataFrame
#     print(f"Number of elements in input: {len(tbl)}")
    
#     dat = tbl.to_numpy()
#     nrm = (dat - dat.mean(axis=0)) / np.sqrt(dat.var(axis=0))

#     kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
#     labels = kmeans.labels_

#     labelVal = {}
#     labelVal['Trn'] = np.argmin(kmeans.cluster_centers_.sum(axis=1))  # Transitional: Fewest stays and episodes.
#     labelVal['Epi'] = np.argmax(kmeans.cluster_centers_[:, 1])  # Episodic: Most episodes.
#     labelVal['Chr'] = np.argmax(kmeans.cluster_centers_[:, 0])  # Chronic: Most stays.

#     cohort = {}
#     x = 0
#     for k in labelVal.keys():
#         cohort[k] = tbl.loc[labels == labelVal[k]].index.to_numpy()
#         # Print the number of elements in each output category
#         x = x + len(cohort[k])
#     print(f"Number of elements in output: {x}")

#     return cohort


In [None]:
from sklearn.cluster import KMeans
import numpy as np

def gen_cluster_labels(tbl):
    """Split the rows of ``tbl`` into three cohorts with k-means.

    Every column is standardised (zero mean, unit spread) before clustering
    into three groups. The groups are then named from their centres:

    * ``'Trn'`` - smallest sum of centre coordinates,
    * ``'Epi'`` - largest centre value in column 1,
    * ``'Chr'`` - largest centre value in column 0.

    In this notebook column 0 is ``NStays`` and column 1 is ``NEpisodes``.

    Parameters
    ----------
    tbl : numeric DataFrame, one row per client, at least two columns.

    Returns
    -------
    dict mapping ``'Trn'``, ``'Epi'`` and ``'Chr'`` to NumPy arrays holding
    the index labels of the rows in that cohort.
    """
    dat = tbl.to_numpy()
    spread = np.sqrt(dat.var(axis=0))
    # A constant column (for example every client having a single episode)
    # has zero spread; dividing by it would fill the column with NaN and make
    # KMeans reject the input. Dividing by 1 leaves that column at zero.
    spread[spread == 0] = 1.0
    nrm = (dat - dat.mean(axis=0))/spread

    kmeans = KMeans(n_clusters=3, random_state=0).fit(nrm)
    labels = kmeans.labels_

    # NOTE(review): nothing forces the three picks below to be different
    # clusters; if two coincide, one cluster's rows are left out of the result
    # and another's appear under two names.
    labelVal = {}
    labelVal['Trn'] = np.argmin(kmeans.cluster_centers_.sum(axis=1)) # Transitional: Fewest stays and episodes.
    labelVal['Epi'] = np.argmax(kmeans.cluster_centers_[:,1])  # Episodic: Most episodes.
    labelVal['Chr'] = np.argmax(kmeans.cluster_centers_[:,0])  # Chronic: Most stays.

    print("Sum of centroids:", kmeans.cluster_centers_.sum(axis=1))
    print("Max in second feature:", kmeans.cluster_centers_[:, 1])
    print("Max in first feature:", kmeans.cluster_centers_[:, 0])

    cohort = {}
    for k in labelVal.keys():
        cohort[k] = tbl.loc[labels == labelVal[k]].index.to_numpy()

    return cohort

In [None]:
# Collect the per-agency summary tables in a single list.
# NOTE(review): this list is not referenced in the visible part of the
# notebook — confirm whether it is still needed.
dataframes = [
    df_agency_4G,
    df_agency_13G,
    df_agency_55G,
    df_agency_188G,
    df_agency_213G,
    df_agency_225G, 
    df_agency_330G,
    df_agency_333G
]

In [None]:
# Cluster each agency's summary table on its own, so every agency gets its own
# cohort assignment.
(
    cohort_agency_4,
    cohort_agency_13,
    cohort_agency_55,
    cohort_agency_188,
    cohort_agency_213,
    cohort_agency_225,
    cohort_agency_330,
    cohort_agency_333,
) = (
    gen_cluster_labels(stay_summary)
    for stay_summary in (
        df_agency_4G,
        df_agency_13G,
        df_agency_55G,
        df_agency_188G,
        df_agency_213G,
        df_agency_225G,
        df_agency_330G,
        df_agency_333G,
    )
)

In [None]:
# Wrap each agency's cohort mapping under the single key 'Cntrl'.
labels_4 = dict(Cntrl=cohort_agency_4)
labels_13 = dict(Cntrl=cohort_agency_13)
labels_55 = dict(Cntrl=cohort_agency_55)
labels_188 = dict(Cntrl=cohort_agency_188)
labels_213 = dict(Cntrl=cohort_agency_213)
labels_225 = dict(Cntrl=cohort_agency_225)
labels_330 = dict(Cntrl=cohort_agency_330)
labels_333 = dict(Cntrl=cohort_agency_333)

In [None]:
# Pull the cohort mapping stored under 'Cntrl' back out for every agency.
(
    element_4,
    element_13,
    element_55,
    element_188,
    element_213,
    element_225,
    element_330,
    element_333,
) = (
    agency_labels['Cntrl']
    for agency_labels in (
        labels_4,
        labels_13,
        labels_55,
        labels_188,
        labels_213,
        labels_225,
        labels_330,
        labels_333,
    )
)

In [None]:
# Turn a {key: ids} mapping into a long two-column table.
def create_dataframe_from_dict(element_dict):
    """Return one row per id, with the mapping key it was listed under.

    element_dict: mapping from a key to a sequence of client ids.

    The result has the columns 'ClientId' and 'ListNumber' (the key) and a
    fresh 0..n-1 index; rows follow the mapping's iteration order.
    """
    per_key_frames = [
        pd.DataFrame({'ClientId': ids, 'ListNumber': list_key})
        for list_key, ids in element_dict.items()
    ]
    return pd.concat(per_key_frames, ignore_index=True)

# Cohort mappings of all agencies, in agency order.
element_dicts = [
    element_4,
    element_13,
    element_55,
    element_188,
    element_213,
    element_225,
    element_330,
    element_333,
]

# One long table per agency ...
dfs = list(map(create_dataframe_from_dict, element_dicts))

# ... stacked into a single table with a fresh 0..n-1 index.
all_elements_df = pd.concat(dfs, ignore_index=True)

In [None]:
from pathlib import Path

# Save the combined cohort labels (all_elements_df) as a CSV file.
file_name = f'CHF_Data_1/CHF_Labels_Local_{LDays}.csv'
# to_csv does not create missing folders, so make sure the target one exists.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
all_elements_df.to_csv(file_name, index=False)