
# Data Analysis and Windowing Techniques

This notebook demonstrates the application of windowing techniques in data analysis, focusing on preprocessing and target variable generation. The process includes setting up the environment, loading libraries, and initial data pre-processing steps. The aim is to prepare the data for further analysis, applying windowing techniques to segment data over specified intervals for detailed examination.


In [None]:

# Number of days of each client's shelter-interaction timeline to keep (range: 1-90 days).
# Rows of the padded per-client table whose Day exceeds this value are filtered out further down.
numberOfDays = 90

# Number of windows each client's numberOfDays timeline is split into before aggregation
# (can be modified to change analysis granularity). Also embedded in the output file name.
numberOfWindows = 1


In [None]:
%load_ext autoreload
%autoreload 1

In [None]:

# Loading necessary Python libraries and setting up environment for auto-reloading external modules
%load_ext autoreload
%autoreload 1

# Importing standard data analysis and visualization libraries
import numpy as np
import pandas as pd
import datetime, copy, imp
import time
import matplotlib.pyplot as plt
from sklearn import metrics
from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

# Adding project-specific utility functions to the Python path
import sys
sys.path.insert(0, '../util/')

# Auto-import for specific modules to ensure they are reloaded before execution
%aimport di_data
%aimport data_cache

from di_data import *
from data_cache import CacheResult

# Recording the start time of preprocessing
pre_start_time = time.time()



### Pre-Processing for Target Variable

In this section, we generate the target variable based on methodologies outlined in referenced notebooks by Dr. Messier and Caleb. This includes the pre-processing of attributes to suit the analysis needs, ensuring data is correctly formatted and ready for further processing.


In [None]:

# Directory paths configuration for data and cache folders
dirStr = ''   # Path to the data folder
cacheStr = '../cache/'   # Path to the cache folder for storing intermediate results


In [None]:
@CacheResult
def PreProcess():
    """Load the raw export and build the filtered, flag-annotated event table.

    Reads ``UniversityExportAnonymized.hd5`` from ``dirStr``, keeps the
    ClientId/Date/EntryType/Age columns, and derives five boolean columns
    (Police, Ems, Health, Violence, Addiction) from the raw incident columns.
    The table is then passed through ``RemoveByStartDate`` twice, once for
    [earliest date, 2009-07-01] and once for [2018-01-06, latest date], each
    time with a mask selecting 'Sleep' entries. Rows dated before 2008-09-01
    are dropped last. A one-line summary of client counts is printed.

    Returns:
        The filtered table including the five derived boolean columns.
    """
    raw = pd.read_hdf(dirStr + 'UniversityExportAnonymized.hd5')

    def anyPositive(*columns):
        # Row-wise: True when at least one of the named raw columns is > 0.
        return (raw[list(columns)] > 0).any(axis=1)

    def clientCount(frame):
        # Number of distinct ClientId values present in the frame.
        return len(frame.ClientId.unique())

    tbl = raw[['ClientId', 'Date', 'EntryType', 'Age']].copy(deep=True)
    tbl['Police'] = raw.PoliceLogFlag.eq(1) | raw.CPS.gt(0)
    tbl['Ems'] = raw.EmsLogFlag.eq(1) | raw.EMS.gt(0)
    tbl['Health'] = anyPositive('Health', 'PhysicalHealth', 'MentalHealth', 'Medication')
    tbl['Violence'] = anyPositive('PhysicalViolence', 'Weapon', 'Spray', 'Brawl', 'Gun', 'Knife')
    tbl['Addiction'] = anyPositive('Addiction', 'Overdose')

    # Both range boundaries are taken from the unfiltered table.
    firstDate = tbl.Date.min()
    lastDate = tbl.Date.max()
    totalClients = clientCount(tbl)

    tbl = RemoveByStartDate(tbl, firstDate, pd.to_datetime('2009-07-01'),
                            tbl.EntryType == 'Sleep')
    clientsAfterLeft = clientCount(tbl)
    removedLeft = totalClients - clientsAfterLeft

    tbl = RemoveByStartDate(tbl, pd.to_datetime('2018-01-06'), lastDate,
                            tbl.EntryType == 'Sleep')
    removedRight = clientsAfterLeft - clientCount(tbl)

    tbl = tbl.loc[tbl.Date >= pd.to_datetime('2008-09-01')]
    remainingClients = clientCount(tbl)

    summary = 'Total Clients: {:d}/{:d} ({:d} removed left, {:d} removed right)'
    print(summary.format(remainingClients, totalClients, removedLeft, removedRight))

    return tbl

In [None]:
# Build the filtered event table. PreProcess itself takes no arguments, so `path`
# is consumed by the @CacheResult wrapper (presumably the cache location - see data_cache).
tbl = PreProcess(path=cacheStr)

### Identify Chronic Shelter Users

Generate a timeline of stays for each client in order to determine who satisfies the DI chronic shelter use definition.

In [None]:
@CacheResult
def GenerateStayTimelines():
    """Build a stay timeline per client from the 'Sleep' entries of ``tbl``.

    Returns:
        The result of applying ``CalculateStaySequence`` to each ClientId
        group of the 'Sleep' rows (run with a tqdm progress bar).
    """
    sleepRows = tbl.loc[tbl.EntryType == 'Sleep']
    perClient = sleepRows.groupby('ClientId')
    return perClient.progress_apply(CalculateStaySequence)

In [None]:
# Per-client stay timelines; `path` is consumed by the @CacheResult wrapper.
tlSty = GenerateStayTimelines(path=cacheStr)

In [None]:
def TimeToChronic(tbl,thresh,winSzDays):
    """Test one client's timeline against a stays-per-window threshold.

    Args:
        tbl: One client's timeline with a datetime ``Date`` column and an
            ``Ind`` column whose non-null entries are counted.
        thresh: Minimum count inside a window for the test to be satisfied.
        winSzDays: Length of the trailing rolling window, in days.

    Returns:
        A Series with three entries:
        ``Flag`` is 'chr' when the threshold was reached, otherwise 'tmp'.
        ``Date`` is the first date the threshold was reached, otherwise the
        last date in the timeline.
        ``Time`` is the number of days from the first date in the timeline to
        ``Date``, counting both ends.
    """
    firstSeen = tbl.Date.min()
    windowCounts = tbl.rolling('%dd' % winSzDays, on='Date').count()['Ind']
    hitDate = tbl.loc[windowCounts >= thresh, 'Date'].min()  # missing when the threshold is never met

    if pd.isna(hitDate):
        # Test not satisfied: report the full observed span instead.
        lastSeen = tbl.Date.max()
        return pd.Series({
            'Flag': 'tmp',
            'Date': lastSeen,
            'Time': (lastSeen - firstSeen).days + 1
        })

    return pd.Series({
        'Flag': 'chr',
        'Date': hitDate,
        'Time': (hitDate - firstSeen).days + 1
    })

In [None]:
@CacheResult
def DiChronicTte():
    """Run the chronic-use test on every client's stay timeline.

    Applies ``TimeToChronic`` per ClientId group of ``tlSty`` with a threshold
    of 276 counted entries inside a 365-day rolling window.

    Returns:
        One Flag/Date/Time record per client.
    """
    perClient = tlSty.groupby('ClientId')
    return perClient.progress_apply(TimeToChronic, thresh=276, winSzDays=365)

In [None]:
# One record per client with Flag ('chr' or 'tmp'), Date and Time, as returned by TimeToChronic.
tteDi = DiChronicTte(path=cacheStr)  # To Generate Labels for each Client ID {'chr' or 'tmp'}

In [None]:
@CacheResult
def CalculateClientDemographics():
    """Compute per-client demographics from the event table ``tbl``.

    Returns:
        The result of applying ``ShelterGroupDemographics`` to each ClientId
        group of ``tbl`` (run with a tqdm progress bar).
    """
    perClient = tbl.groupby('ClientId')
    return perClient.progress_apply(ShelterGroupDemographics)

In [None]:
# Per-client demographics table; `path` is consumed by the @CacheResult wrapper.
demog = CalculateClientDemographics(path=cacheStr)

In [None]:
def PrintStats(demog,cohortInd):
    """Print the cohort size and summary statistics of its demographic fields.

    Args:
        demog: Per-client demographics table containing the Tenure, UsagePct,
            AvgGapLen, TotalStays and TotalEpisodes columns.
        cohortInd: Index labels of ``demog`` that make up the cohort.

    For each field the mean, median and two rank-based cut points are printed.
    The cut points are the values at positions ``int(0.1 * n)`` and
    ``int(0.9 * n)`` of the ascending sort, where ``n`` is the number of
    non-NaN entries.
    """
    members = demog.loc[cohortInd]

    popSize = len(demog.index)
    cohortSize = len(members.index)
    print( 'Clients in cohort: %d/%d (%.1f%%)' % (cohortSize,popSize,100*cohortSize/popSize))

    for name in ('Tenure', 'UsagePct', 'AvgGapLen', 'TotalStays', 'TotalEpisodes'):
        print('%s:' % (name))
        values = members[name]
        validCount = (~np.isnan(values)).sum()
        # NaNs sort to the end, so positions below validCount are real values.
        ranked = values.sort_values()
        lowCut = ranked.iloc[int(validCount*0.1)]
        highCut = ranked.iloc[int(validCount*0.9)]
        print(' Avg: {:.1f}, Med: {:.1f}, 10thPct: {:.1f}, 90thPct: {:.1f}' 
              .format(values.mean(),values.median(),lowCut,highCut))

In [None]:
# Summary statistics for the clients flagged 'chr' by the chronic-use test.
PrintStats(demog,tteDi.loc[tteDi.Flag=='chr'].index)

In [None]:
# Same summary for the clients flagged 'tmp' (threshold never reached).
PrintStats(demog,tteDi.loc[tteDi.Flag=='tmp'].index)

In [None]:
# Number of clients flagged 'chr'.
len(tteDi[tteDi.Flag=='chr'])

In [None]:
# Load the per-client time-series table (path is relative to the working directory)
# and display its last two rows.
df = pd.read_hdf('clientTsTables90DaysPaddedDF.h5')
df.tail(2)

In [None]:
# Turn the stored index into ordinary columns (presumably ClientId and Day - confirm against the file).
df = df.reset_index()

In [None]:
# Keep only rows with Day <= numberOfDays, then index the result by (ClientId, Day).
df = df[df.Day < numberOfDays+1]
df = df.set_index(['ClientId','Day'])

In [None]:
# Move ClientId and Day back into columns. Combined with the set_index above, this
# replaces the gappy row labels left by the Day filter with a fresh 0..n-1 index.
df = df.reset_index(level=[0,1])
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
def AgeFix(tbl):
    """Shift each row's age by the distance between the row's year and 2020.

    Args:
        tbl: Table with a numeric ``Age`` column and a datetime ``Date`` column.

    Returns:
        A Series holding ``Age + year(Date) - 2020.0`` for every row of ``tbl``.
    """
    recordYear = tbl.Date.dt.year
    return tbl.Age + recordYear - 2020.0
# Apply AgeFix to each client's rows (with a tqdm progress bar).
newAge = df.groupby("ClientId").progress_apply(AgeFix)

In [None]:
# Flatten the grouped result into a frame with an Age_fix column on a fresh 0..n-1 index.
newAge = pd.DataFrame(newAge)
newAge = newAge.reset_index(level=[0,1])
newAge= newAge.rename(columns={0:'Age_fix'})
# level_1 presumably carries the original df row labels. Once it is dropped, the
# assignment below pairs rows purely by the new 0..n-1 index.
# NOTE(review): this only lines up if df is already ordered by ClientId, because
# groupby emits groups in sorted key order - confirm.
newAge = newAge.drop(columns=['level_1'])
df['AgeFix']=newAge.Age_fix

In [None]:
df = df.fillna(0)  # Zero-Imputing the NaN values
nClients = len(df.ClientId.unique())  # Number of distinct clients left in df
df = df.drop(columns=['Date','Day','Age'])  # AgeFix is the age column kept from here on

In [None]:
# Fix the set and order of columns used by the remaining cells.
df = df[['ClientId','EncodedVector','AgeFix','EmployeeId','EmployeeIsCounsellor','BarDuration','SleepEntry','LogEntry','CounsellorNotes','ProgressDetails','SoberState','UnderState','IntoxicatedState','DruggedState','DruggedIntoxicatedState','PoliceLogFlag','EmsLogFlag']]

In [None]:
# One-hot encode BarDuration into Bar_<value> indicator columns, then drop the original column.
df = pd.concat([df,pd.get_dummies(df['BarDuration'], prefix='Bar')],axis=1) # dummy_na is not passed, so no separate NaN indicator column is created
df.drop(['BarDuration'],axis=1, inplace=True)
df

In [None]:
# Rename the dummy columns (drop the '.0' float suffix, 'Bar_-24 Hours' -> 'Bar_24Hours')
# so that aggregationFunc can read them as attributes, e.g. tbl.Bar_1.
df= df.rename(columns = {'Bar_1.0': 'Bar_1', 'Bar_2.0': 'Bar_2','Bar_3.0':'Bar_3','Bar_5.0':'Bar_5','Bar_7.0':'Bar_7','Bar_14.0':'Bar_14','Bar_21.0':'Bar_21','Bar_30.0':'Bar_30','Bar_60.0':'Bar_60','Bar_90.0':'Bar_90','Bar_120.0':'Bar_120','Bar_180.0':'Bar_180','Bar_-24 Hours':'Bar_24Hours'})
df

In [None]:
def aggregationFunc(tbl):
    """Collapse a block of per-day rows into a single feature record.

    Every activity, state, log-flag and ``Bar_*`` indicator column is summed
    over the rows of ``tbl``. ``AgeFix`` is reduced with ``max``.

    Two key mix-ups of the earlier version are corrected here. The ``Bar_1``
    sum used to be stored under ``'bar0Agg'``, overwriting the ``Bar_0`` sum.
    The ``Bar_180`` sum was computed but never returned, because
    ``'bar90Agg'`` was listed twice. The record now carries separate
    ``bar0Agg``, ``bar1Agg`` and ``bar180Agg`` entries.

    Args:
        tbl: Rows to aggregate. Must contain ``AgeFix`` and every column named
            in ``summedColumns`` below.

    Returns:
        A Series with ``age`` first, followed by one summed entry per source
        column, in the order listed below.

    Raises:
        KeyError: If one of the expected columns is missing from ``tbl``.
    """
    # (output key, source column) pairs; the order defines the output layout.
    summedColumns = (
        ('emp', 'EmployeeId'),
        ('emc', 'EmployeeIsCounsellor'),
        ('sleep', 'SleepEntry'),
        ('logAgg', 'LogEntry'),
        ('notesAgg', 'CounsellorNotes'),
        ('detailsAgg', 'ProgressDetails'),
        ('ssAgg', 'SoberState'),
        ('usAgg', 'UnderState'),
        ('isAgg', 'IntoxicatedState'),
        ('dsAgg', 'DruggedState'),
        ('disAgg', 'DruggedIntoxicatedState'),
        ('policeAgg', 'PoliceLogFlag'),
        ('emsAgg', 'EmsLogFlag'),
        ('bar0Agg', 'Bar_0'),
        ('bar1Agg', 'Bar_1'),
        ('bar2Agg', 'Bar_2'),
        ('bar3Agg', 'Bar_3'),
        ('bar5Agg', 'Bar_5'),
        ('bar7Agg', 'Bar_7'),
        ('bar14Agg', 'Bar_14'),
        ('bar21Agg', 'Bar_21'),
        ('bar30Agg', 'Bar_30'),
        ('bar60Agg', 'Bar_60'),
        ('bar90Agg', 'Bar_90'),
        ('bar120Agg', 'Bar_120'),
        ('bar180Agg', 'Bar_180'),
        ('bar24HAgg', 'Bar_24Hours'),
        ('barCondAgg', 'Bar_Conditional'),
        ('barLifeAgg', 'Bar_Life'),
        ('barWarningAgg', 'Bar_Warning'),
    )

    features = {'age': tbl['AgeFix'].max()}
    for key, column in summedColumns:
        features[key] = tbl[column].sum()
    return pd.Series(features)

In [None]:
# Per-client aggregate over all retained rows: one aggregationFunc record per ClientId.
df2 = df.copy(deep=True)
df2 = df2.groupby("ClientId").progress_apply(aggregationFunc)

In [None]:
# Unique client ids in order of first appearance in df, re-indexed 0..n-1.
# Built as a new Series rather than through in-place edits on df.ClientId, so the
# column that lives inside df is never modified (and no throwaway DataFrame is created).
Subject_id = df.ClientId.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
import progressbar 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Window lengths -----------------------------------------------------------
# numberOfDays/numberOfWindows is generally not an integer, so each window length
# is drawn at random from the two neighbouring integers. The weights make the
# expected length equal the exact quotient. The draws are independent and
# unseeded: lengths can differ between runs and are not guaranteed to add up to
# numberOfDays.
WSUsed = []
for _ in range(numberOfWindows):
        IndexPerWindowLower = np.floor(numberOfDays/numberOfWindows)
        IndexPerWindowUpper = np.ceil(numberOfDays/numberOfWindows)
        ProbIndexLower = IndexPerWindowUpper - numberOfDays/numberOfWindows
        indices = [IndexPerWindowLower,IndexPerWindowUpper]
        weights = [ProbIndexLower,1-ProbIndexLower]
        IndexPerWindow = int(np.random.choice(indices, p=weights))
        WSUsed.append(IndexPerWindow)

widgets=[' [', progressbar.Timer(), '] ',progressbar.Percentage(),progressbar.Bar(),' (', progressbar.ETA(), ') ',]

# --- Per-client, per-window aggregation ---------------------------------------
# The loop walks df in consecutive blocks of numberOfDays rows, one block per
# entry of Subject_id. It therefore relies on every client occupying exactly
# numberOfDays contiguous rows, in the same order as Subject_id.
startNum = 0
endNum = numberOfDays
dfFinal = []
WSUsed2 = pd.DataFrame(WSUsed)  # Tabular copy of the window lengths, kept for inspection
for y in progressbar.progressbar(Subject_id, widgets=widgets):
    startWind = 0
    # Positional slice of this client's rows. Nothing below writes to it, so the
    # whole table does not need to be deep-copied for every client.
    dfExp = df.iloc[startNum:endNum]
    dfExpAge = dfExp.AgeFix.max()
    startNum = startNum + numberOfDays
    endNum = endNum + numberOfDays
    for z in range (numberOfWindows):
        # WSUsed already holds plain ints, so no Series-to-int conversion is needed.
        endWind = startWind + WSUsed[z]
        dfExp2s = dfExp.iloc[startWind:endWind]
        dfExp2 = aggregationFunc(dfExp2s)
        dfExp2['age'] = dfExpAge  # Use the client's age over the full block, not just this window
        dfExp2['subject_id'] = y
        dfExp2['index_id'] = z
        dfFinal.append(dfExp2)
        startWind = endWind

dfFinal2 = pd.DataFrame(dfFinal)

In [None]:
# Sanity check: first rows of the last window slice handled by the loop above.
dfExp2s.head()

In [None]:
# Array copy of the per-window feature table (all columns of dfFinal2).
featureArray = np.array(dfFinal2)
# NOTE(review): hard-coded and not derived from dfFinal2's columns - confirm it
# matches the number of feature columns actually produced.
numberOfFeatures = 30

In [None]:
# Write the table to DI/DATA_<numberOfWindows>.csv.
# NOTE(review): assumes the DI/ directory already exists - confirm.
dfFinal2.to_csv('DI/DATA_' + str(numberOfWindows) + '.csv')