# Dependencies

In [None]:
import pandas as pd
import numpy as np

# Load data

In [None]:
# COVID transition data
covid_raw = pd.read_csv('COVID_kitchen_.csv')
# Keep everything except the first CSV column (presumably a saved index -- confirm against the file).
df = covid_raw.iloc[:, 1:]
# Parse the timestamps so the later resample/date arithmetic works on datetimes.
df = df.assign(start_date=pd.to_datetime(df['start_date']))
df

## Or: load the post-COVID data instead (run only one of the two loading cells)

In [None]:
# post-COVID transition data
post_covid_raw = pd.read_csv('kitchen_.csv')
# Keep everything except the first CSV column (presumably a saved index -- confirm against the file).
df = post_covid_raw.iloc[:, 1:]
# Parse the timestamps so the later resample/date arithmetic works on datetimes.
df = df.assign(start_date=pd.to_datetime(df['start_date']))
df

# Processing transitions

In [None]:
# Build a long table of hourly transition counts per household, where `df` is
# whichever dataset was loaded (COVID_kitchen_ or kitchen_).


def _complete_grid(frame, keys, spread):
    """Pivot `spread` into columns, zero-fill the gaps, and melt back to long form."""
    wide = frame.pivot(index=keys, columns=spread, values='transition_count').fillna(0)
    return wide.reset_index().melt(id_vars=keys, var_name=spread, value_name='transition_count')


# Count each transition per household in 1-hour bins.
hourly_counts = (
    df.groupby('household')
    .resample('1h', on='start_date')
    .agg(vc=('transition', 'value_counts'))
    .fillna(0)
    .reset_index()
    .rename(columns={'vc': 'transition_count'})
)
hourly_counts['date'] = hourly_counts['start_date'].dt.date
hourly_counts['hour'] = hourly_counts['start_date'].dt.hour

# Zero-fill first the missing hours, then the missing transitions.
long_counts = _complete_grid(hourly_counts, ['household', 'transition', 'date'], 'hour')
long_counts = _complete_grid(long_counts, ['household', 'date', 'hour'], 'transition')

# One column per date; any cell still missing takes the median of its own row
# (same household, transition and hour across the other dates).
by_date = long_counts.pivot(
    index=['household', 'transition', 'hour'],
    columns='date',
    values='transition_count',
)
by_date = by_date.where(by_date.notna(), by_date.median(axis=1), axis=0)

# Back to long form, with the date column restored as a datetime `start_date`.
kitchen_ = by_date.reset_index().melt(
    id_vars=['household', 'transition', 'hour'],
    var_name='date',
    value_name='transition_count',
)
kitchen_.columns = ['household', 'transition', 'hour', 'start_date', 'transition_count']
kitchen_['start_date'] = pd.to_datetime(kitchen_['start_date'])
kitchen_

# Sliding window algorithm

In [None]:
# function to extract transition probabilities
def get_markov_prob(df: pd.DataFrame, start: "str | pd.Timestamp", end: "str | pd.Timestamp") -> pd.DataFrame:
    """Return per-household, per-hour transition probabilities for one time window.

    Rows of ``df`` whose ``start_date`` lies strictly between ``start`` and
    ``end`` (both bounds excluded) are selected.  Within each
    ``(household, hour)`` the counts of every ``'source>sink'`` transition are
    summed and divided by the total count of all transitions leaving the same
    source.

    Args:
        df: Long table with columns ``household``, ``hour``, ``transition``
            (strings of the form ``'source>sink'``), ``start_date`` and
            ``transition_count``.
        start: Exclusive lower bound on ``start_date``.
        end: Exclusive upper bound on ``start_date``.

    Returns:
        DataFrame indexed by ``(household, hour, source)`` with one column per
        ``sink``.  Sources with a total count of zero get probability 0 for
        every sink.  If the window contains no rows, an empty frame with the
        same index and column names is returned.
    """
    index_cols = ['household', 'hour', 'source']

    window = df.query("@start < start_date < @end")
    if window.empty:
        # Splitting an empty 'transition' column yields no source/sink columns,
        # which would raise ValueError below; return an empty matrix instead.
        return pd.DataFrame(
            index=pd.MultiIndex.from_arrays([[], [], []], names=index_cols),
            columns=pd.Index([], name='sink'),
            dtype=float,
        )

    # Sum counts per (household, hour, transition); the unstack/stack round-trip
    # gives every (household, hour) a row for every transition seen in the
    # window, with 0 where it never occurred.
    counts = (
        window.groupby(['household', 'hour', 'transition'])['transition_count']
        .sum()
        .unstack(fill_value=0)
        .stack()
        .rename('transition_count')
        .reset_index()
    )

    counts['transition'] = counts['transition'].astype(str)
    counts[['source', 'sink']] = counts['transition'].str.split('>', expand=True)

    # Normalise by the total outgoing count of each source; 0/0 becomes 0.
    totals = counts.groupby(index_cols)['transition_count'].transform('sum')
    counts['markov'] = (counts['transition_count'] / totals).fillna(0)

    return counts.set_index(index_cols + ['sink'])['markov'].unstack().fillna(0)

In [None]:
# Set up for the sliding window.
df = kitchen_
# bwin: length of the baseline window; cwin: length of the current window that
# directly follows it; freq: step by which both windows slide.
bwin, cwin, freq = (3, 'W'), (1, 'W'), '1d'
baseline_span, current_span = np.timedelta64(*bwin), np.timedelta64(*cwin)
start_date, end_date = df.start_date.dt.date.agg(['min', 'max']).astype('datetime64[ns]')
base_line = start_date + baseline_span
current = base_line + current_span
clean_date = lambda x: pd.to_datetime(str(x.date()))  # clearing time out because timedelta brings in time
rng = pd.date_range(clean_date(base_line), clean_date(end_date), freq=freq)
# get_markov_prob selects rows strictly after the window start, so a current
# window starting on or after the last data date would contain no rows; skip
# those anchors.
rng = rng[rng < end_date]

# Sliding window: for each anchor date `win`, compare the transition
# probabilities of the baseline window ending at `win` with those of the
# current window starting at `win`.
similarity = {}
for win in rng:
    # Move both windows onto `win` BEFORE computing, so the result stored under
    # `win` really belongs to the windows anchored at `win` (shifting after the
    # computation would label every result one step late).
    base_line, current = win, win + current_span
    baseline_markov = get_markov_prob(df, base_line - baseline_span, base_line)
    current_markov = get_markov_prob(df, current - current_span, current)
    # One value per (household, hour): the norm of the difference between the
    # two source-by-sink probability blocks.
    similarity[win] = (
        (baseline_markov - current_markov)
        .groupby(['household', 'hour'])
        .apply(lambda x: np.linalg.norm(x.values))
    )

# Get Similarity

In [None]:
# Gather the per-window results into a single table: each key of `similarity`
# becomes one column.
df_similarity = pd.DataFrame.from_dict(similarity, orient='columns')  # return as dataframe
df_similarity