# Pre-processing Notebook 2: filtering the first 90 days of each client's data
<p> <em>The first 90 days of shelter access (sleeps, logs, counsellor notes, etc.) are filtered for each client.<br>Filtering starts from the "registration date", i.e. the client's first DI shelter access date, and is applied to the already censored raw data obtained in Notebook 1.</em></p>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import scipy as sci
import scipy.special as scisp
import scipy.stats as scist
import datetime, copy, imp, sys
sys.path.append('../../lib')
from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
# Register tqdm's pandas integration; the groupby(...).progress_apply calls
# later in this notebook depend on it.
tqdm.pandas()
# Switch matplotlib to interactive mode.
plt.ion()

In [None]:
"""Standard routines for pre-processing and analyzing client data from the Calgary
Drop-In Centre.
"""

import os
import pandas as pd
import numpy as np


def RemoveByStartDate(tbl,winStartDate,winEndDate,dateSelect = None):
    """Remove all records for subjects in tbl first appearing in the data between
    winStartDate and winEndDate (determined using tbl.Date values).

    Parameters:
    - tbl: DataFrame with at least the columns ClientId and Date.
    - winStartDate, winEndDate: Inclusive bounds of the censoring window.
    - dateSelect: Optional row selector passed to tbl.loc (e.g. a boolean mask).
      When given, first-appearance dates are computed from the selected rows
      only.  None or an empty selector means "use every row of tbl".

    Returns:
    - The rows of tbl belonging to subjects whose first appearance lies outside
      the window.  Subjects with no rows left after applying dateSelect have no
      start date and are therefore dropped as well.
    """

    if dateSelect is None or len(dateSelect) == 0:
        tblFlt = tbl
    else:
        tblFlt = tbl.loc[dateSelect]

    # Series.min() skips NaT.  The built-in min() compares element by element
    # and every comparison against NaT is False, so its result depends on row
    # order and could turn a subject's start date into NaT.
    startDates = tblFlt.groupby('ClientId').Date.min()
    inWindow = (startDates >= winStartDate) & (startDates <= winEndDate)

    return tbl.loc[tbl.ClientId.isin(startDates[~inWindow].index)]

def ShelterGroupDemographics(tbl):
    """Summarizes the demographics of a group of shelter clients.

    Parameters:
    - tbl: DataFrame with a datetime Date column (one row per shelter record).

    Returns a Series with the fields:
     > Tenure: Number of days between first and last appearance in dataset
               (inclusive of both end days).
     > UsagePct: Percentage of days during tenure spent in shelter.
     > AvgGapLen: Average number of days between consecutive distinct stay dates.
                  NaN for clients with a single stay.
     > TotalStays: Total number of shelter stays (distinct Date values).
     > TotalEpisodes: Total number of episodes of shelter access; a new episode
                      starts after a gap of at least episodeGap days.
     """

    # A stay is one distinct date, however many records fall on it.
    dates = tbl.Date.drop_duplicates().sort_values()
    nStays = len(dates)

    tenure = (dates.max() - dates.min()).days + 1
    # .dt.days yields whole-day gaps as floats (NaN for the first stay).
    # astype('timedelta64[D]') is rejected by pandas >= 2.0.
    gapVals = dates.diff().dt.days

    return pd.Series({
        'Tenure': tenure,  # Total span of days a client interacts with shelter.
        'UsagePct': 100.0*nStays/tenure,  # Percentage of days during tenure client stayed in shelter.
        'AvgGapLen': gapVals.mean(),  # Average length of gaps in shelter stays.
        'TotalStays': nStays,  # Total number of shelter stays.
        'TotalEpisodes': (gapVals >= episodeGap).sum() + 1  # Total number of episodes.
    })

def CalculateStaySequence(tbl):
    """Builds the stay timeline for one subject.

    - A stay is one distinct value of tbl.Date, i.e. one or more services
      (typically sleep services) accessed in the same 24 hour period.
    - The result has one row per stay, in chronological order:
        Date: timestamp of the stay.
        Ind:  1-based position of the stay in the timeline.
    - Rows keep the index label of the first tbl record seen for that date."""

    # Several records on one date still count as a single stay.
    uniqueStayDates = tbl['Date'].drop_duplicates().sort_values()
    stayNumbers = range(1, uniqueStayDates.size + 1)

    timeline = pd.DataFrame({
        'Date': uniqueStayDates,
        'Ind': stayNumbers
    })
    return timeline


episodeGap = 30  # A gap of at least this many days between stays starts a new episode.

def CalculateEpisodeSequence(tbl):
    """Determines an episode timeline for a subject.
    - Each event in the timeline is represented by an index and a timestamp.
    - An episode is a series of shelter stays separated by gaps of less than
      episodeGap days; a gap of episodeGap days or more starts a new episode.
    - A stay is defined as accessing one or more services (typically sleep services)
      in a 24 hour period.
    - Timestamps generated using tbl.Date values.

    Parameters:
    - tbl: DataFrame with a datetime Date column.

    Returns a DataFrame with one row per episode:
    - Date: date of the first stay of the episode.
    - Ind: 1-based episode index.
    """

    stayDates = tbl.Date.drop_duplicates().sort_values() # Drop duplicates since stay is one or more sleep.
    # .dt.days yields whole-day gaps as floats (NaN for the first stay, which
    # compares False below).  astype('timedelta64[D]') is rejected by
    # pandas >= 2.0.
    gapVals = stayDates.diff().dt.days
    # Running count of large gaps = zero-based episode number of every stay.
    episodeId = (gapVals >= episodeGap).astype('int').cumsum()
    # The first stay carrying each episode number opens that episode.  Selecting
    # positionally from stayDates avoids a label lookup back into tbl.
    isEpisodeStart = ~episodeId.duplicated(keep='first')
    episodeStarts = stayDates[isEpisodeStart.to_numpy()]

    return pd.DataFrame({
        'Date': episodeStarts,                    # Date of first day of each episode.
        'Ind': range(1,len(episodeStarts)+1)      # Episode index.
    })


def TimeWinThresholdTest(tbl,posFlag,negFlag,thresh,winSzDays):
    """Check whether a subject timeline ever has at least thresh events inside
    a trailing time window of winSzDays days.

    - tbl is a timeline with a Date column and an Ind column; events are
      counted per window with a time-based rolling count on Date.
    - Positive result: a series with Flag = posFlag, Date = the earliest date
      on which the window count reaches thresh, and Time = the number of days
      between the first date in tbl and that date.
    - Negative result: a series with Flag = negFlag, Date = NaT and Time = NaN."""

    windowSpec = f'{winSzDays:d}d'
    eventsInWindow = tbl.rolling(windowSpec, on='Date').count()['Ind']

    # min() over an empty selection is NaT, which signals "threshold never met".
    firstHit = tbl.loc[eventsInWindow >= thresh, 'Date'].min()

    if pd.isna(firstHit):
        return pd.Series({
            'Flag': negFlag,
            'Date': pd.NaT,
            'Time': np.nan
        })

    daysToHit = (firstHit - tbl['Date'].min()).days
    return pd.Series({
        'Flag': posFlag,
        'Date': firstHit,   # Date subject was identified.
        'Time': daysToHit   # Number of days it took to identify subject.
    })

    
def ChooseEarliestTest(test1,test2):
    """Merges two test tables.  If each test is positive for a subject, the test that
    occurs earliest in a subject's timeline is chosen.  

    A test counts as negative for a subject when its Time value is NaN.  Row
    selection per subject:
    - Both tests negative: the row from test1 is used.
    - Exactly one test positive: the row from the positive test is used.
    - Both tests positive: the row with the smaller Time is used; when the
      Time values are equal, the row from test2 is used.
    
    If you have more than two test tables, you can call this routine several times.  For 
    example, tables A, B and C can be merged by:
      mrg = ChooseEarliestTest(A,B)
      mrg = ChooseEarliestTest(mrg,C)
      
    Assumptions:
    - Both test tables contain the identical list of subjects.
    - Both tests use the same flag for a negative result.

    Returns a DataFrame with columns Flag, Date and Time, indexed like test1.
    """
    nRec = len(test1.index)
    
    # Placeholder result table; every row is overwritten by one of the three
    # masked assignments below.
    tbl = pd.DataFrame({ 'Flag': ['']*nRec, 'Date': [pd.NaT]*nRec, 'Time': [np.nan]*nRec },index=test1.index)

    # x != x is True only for NaN, so these masks detect negative (NaN Time) results.
    bothNeg = (test1.Time != test1.Time) & (test2.Time != test2.Time)
    tbl[bothNeg] = test1[bothNeg]
    
    # test1 wins when it is strictly earlier, or when it is the only positive test.
    isOne = (test1.Time == test1.Time) & (test2.Time == test2.Time) & (test1.Time < test2.Time)
    isOne = isOne | ( (test1.Time == test1.Time) & (test2.Time != test2.Time) )
    tbl[isOne] = test1[isOne]
    
    # test2 wins when it is earlier or tied (>=), or when it is the only positive test.
    isTwo = (test1.Time == test1.Time) & (test2.Time == test2.Time) & (test1.Time >= test2.Time)
    isTwo = isTwo | ( (test2.Time == test2.Time) & (test1.Time != test1.Time) )
    tbl[isTwo] = test2[isTwo]
    
    return tbl

In [None]:
# Load the valid-clients dataframe saved after censoring (see the notebook introduction).
validClientsDf = pd.read_hdf('validClientsDf.h5')

In [None]:
# List the column labels of the loaded dataframe.
validClientsDf.columns

In [None]:
# Preview the first five records (head() defaults to five rows).
validClientsDf.head()

### First shelter access dates
- Find the "registration date", i.e. the first shelter access date, for each client.
- Note: the "registration date" is not the client's actual registration date; the name is used in the code only for naming convenience.

In [None]:
def FindRegistrationDates(tbl):
    """Return the earliest Date in tbl as a one-element Series.

    Parameters:
    - tbl: DataFrame with a datetime Date column (the records of one client
      when used through groupby(...).apply).

    Returns:
    - Series with the single key 'FirstAcessDate' holding the earliest
      non-missing Date (NaT if every Date is missing).
    """
    # Series.min() skips NaT.  The built-in min() gives an order-dependent
    # result (possibly NaT) when a NaT is present.
    return pd.Series({
        'FirstAcessDate': tbl.Date.min()
    })

In [None]:
# Keep only the records whose EntryType is "Sleep"; first-access dates are
# derived from these rows.
validClientsDf2 = validClientsDf.loc[validClientsDf.EntryType == "Sleep"]

In [None]:
# One row per client: the date of the client's first "Sleep" record.
# progress_apply is the tqdm-wrapped version of apply.
regDates = validClientsDf2.groupby("ClientId").progress_apply(FindRegistrationDates)

In [None]:
# Preview the first five first-access dates (head() defaults to five rows).
regDates.head()

### Date 90 days after first shelter access
1. Find the date 90 days after the first shelter access for each client.
2. Add that date to the data frame.
3. Add each client's first shelter access date to the data frame.

In [None]:
def FindDateAfterFirst90Days(tbl):
    """Return the date 90 days after the earliest Date in tbl as a one-element Series.

    Parameters:
    - tbl: DataFrame with a datetime Date column (the records of one client
      when used through groupby(...).apply).

    Returns:
    - Series with the single key 'DateAfterFirst90Days' holding the earliest
      non-missing Date plus 90 days (NaT if every Date is missing).
    """
    # Series.min() skips NaT.  The built-in min() gives an order-dependent
    # result (possibly NaT) when a NaT is present.
    firstDate = tbl.Date.min()
    return pd.Series({
        'DateAfterFirst90Days': firstDate + pd.Timedelta(days=90)
    })

In [None]:
# One row per client: the date 90 days after the client's first "Sleep" record.
dateAfterFirst90Days = validClientsDf2.groupby("ClientId").progress_apply(FindDateAfterFirst90Days)

In [None]:
# Preview the first five 90-day cut-off dates (head() defaults to five rows).
dateAfterFirst90Days.head()

In [None]:
# Attach each client's 90-day cut-off date and first-access date to every one
# of that client's records.  Both merges use the default inner join, so
# clients missing from either lookup table are dropped.
merged_df = (
    validClientsDf
    .merge(dateAfterFirst90Days, on='ClientId')
    .merge(regDates, on='ClientId')
)

In [None]:
# Preview the first two merged records.
merged_df.iloc[:2]

In [None]:
# Add the 90-day cut-off date to the sleep-only frame (join is a left join by default).
validClientsDf2 = validClientsDf2.join(dateAfterFirst90Days, on='ClientId')

In [None]:
# Show the last two records of the unfiltered frame.
# NOTE(review): the preceding cell modified validClientsDf2, not validClientsDf;
# confirm which frame was meant to be inspected here.
validClientsDf.tail(2)

In [None]:
# Add the first-access date to the sleep-only frame (join is a left join by default).
validClientsDf2 = validClientsDf2.join(regDates, on='ClientId')

In [None]:
########## Test case #############
#validClientsDf[validClientsDf.ClientId == 2493929] 
## To make sure the date after first 90 days is same across all entries of a particular client -- (Yes)

### Filtering
- Keep only the data entries that fall within the first 90 days of each client's shelter interaction.

In [None]:
# Keep the records dated between the client's first access date and the date
# 90 days later; both bounds are inclusive.
validClientsFirst90DaysDf = merged_df[
    (merged_df.FirstAcessDate <= merged_df.Date)
    & (merged_df.Date <= merged_df.DateAfterFirst90Days)
]

In [None]:
# Preview two filtered records that have a BarDuration value.
validClientsFirst90DaysDf.loc[validClientsFirst90DaysDf['BarDuration'].notna()].head(2)

In [None]:
# Display the filtered first-90-days frame.
validClientsFirst90DaysDf

In [None]:
########## Test case #############
#validClientsFirst90DaysDf[validClientsFirst90DaysDf.ClientId == 2493929]
## To make sure filtering is correct -- the entry after the first 90 days is removed (Yes)

In [None]:
# The filtered frame should have no more rows than validClientsDf (shown in the
# next cell); it has two extra columns, DateAfterFirst90Days and FirstAcessDate.
validClientsFirst90DaysDf.shape

In [None]:
# Shape of the unfiltered frame, for comparison with the filtered one.
validClientsDf.shape

<h3>Saving the filtered first-90-days records to disk</h3>

In [None]:
# Convert these columns to plain strings (presumably so that to_hdf can store
# them -- TODO confirm).
# astype returns a new frame that is rebound to the name.  Assigning through
# .loc[:, col] on a filtered frame raises SettingWithCopy warnings, and on
# pandas >= 2.0 it sets values in place and can leave the original column
# dtype unchanged.
validClientsFirst90DaysDf = validClientsFirst90DaysDf.astype({
    'Location': str,
    'EntryType': str,
    'ClientState': str,
})

In [None]:
# Order the records by client, then chronologically within each client.
validClientsFirst90DaysDf = validClientsFirst90DaysDf.sort_values(by=['ClientId', 'Date'])

In [None]:
# Display the sorted first-90-days frame.
validClientsFirst90DaysDf

In [None]:
# Preview two records with a BarDuration value after the string casts and sort.
validClientsFirst90DaysDf.loc[validClientsFirst90DaysDf['BarDuration'].notna()].head(2)

In [None]:
# Write the filtered records to disk under key 'df'; mode='w' overwrites any
# existing file of the same name.
validClientsFirst90DaysDf.to_hdf('validClientsFirst90DaysDf2.h5',key='df',mode='w')