In [130]:
# Data processing tool for Nova Home Support
# IPython Notebook for testing
#
# Import Python packages
# Note: We're only using packages that come with Anaconda standard distribution: https://www.anaconda.com/ 
import pandas as pd # For importing, manipulating, and exporting data
import re # Python regular expression support
from datetime import datetime # Python datetime conversion support
from concurrent.futures import process # Asynchronous callable execution
from tkinter import * # For user interface
from tkinter.ttk import * # UI widgets
from tkinter.filedialog import askopenfile, askopenfilename, askdirectory # UI-file system interaction

In [131]:
# Load the raw export from the Excel workbook
df = pd.read_excel('testdata.xlsx')

# Columns needed by the cleaning and validation cells below; everything else is discarded
columns_to_keep = [
    'Service 1 Description (Code)',
    'Service Provider',
    'Check-In Date',
    'Check-In Time',
    'Updated Check-In Date',
    'Updated Check-In Time',
    'Check-Out Date',
    'Check-Out Time',
    'Updated Check-Out Date',
    'Updated Check-Out Time',
    'Staff Worked Duration',
    'Staff Worked Duration (Minutes)',
]
df = df[columns_to_keep]

In [132]:
# Clean up the service description column.
#
# Step 1: remove parentheses and everything within them.
# regex=True is passed explicitly: the default for Series.str.replace changed
# from True to False in pandas 2.0, so without it the pattern would be matched
# as literal text and nothing would be removed.
df['Service 1 Description (Code)'] = df['Service 1 Description (Code)'].str.replace(r'\(.*\)', '', regex=True)

# Step 2: remove the prefix if it exists.
# An anchored, escaped pattern strips the prefix only at the start of the value.
# The vectorized .str accessor leaves missing values as NaN, whereas calling
# x.startswith() on each element raises AttributeError on NaN.
prefix = 'RC-SDP-CLS-320 '
df['Service 1 Description (Code)'] = df['Service 1 Description (Code)'].str.replace('^' + re.escape(prefix), '', regex=True)

  df['Service 1 Description (Code)'] = df['Service 1 Description (Code)'].str.replace(r'\(.*\)', '')


In [133]:
# Keep only the text before the first " /" in each provider entry
provider_parts = df['Service Provider'].str.split(' /', n=1)
df['Service Provider'] = provider_parts.str.get(0)

In [134]:
# Prefer the "Updated ..." value for each date/time field, falling back to the
# original value wherever the updated one is NaN
timestamp_fields = ['Check-In Date', 'Check-In Time', 'Check-Out Date', 'Check-Out Time']
for field in timestamp_fields:
    updated_field = 'Updated ' + field
    df[field] = df[updated_field].fillna(df[field])

# The "Updated ..." columns have been merged into the originals, so drop them
df.drop(columns=['Updated ' + field for field in timestamp_fields], inplace=True)

In [135]:
# Create check-in / check-out datetime series from the separate date and time
# text columns.
#
# pd.to_datetime parses each column in one vectorized call and turns missing
# values into NaT, whereas a per-row datetime.strptime raises TypeError on NaN.
# Values that are present but do not match the format still raise ValueError.
DATETIME_FORMAT = '%m/%d/%Y %I:%M %p'  # e.g. "04/16/2023 02:51 pm"
CIDT = pd.to_datetime(df['Check-In Date'].str.cat(df['Check-In Time'], sep=' '), format=DATETIME_FORMAT)
CODT = pd.to_datetime(df['Check-Out Date'].str.cat(df['Check-Out Time'], sep=' '), format=DATETIME_FORMAT)

In [136]:
def _hhmm_to_minutes(text):
    """Convert an 'H:MM' duration string to a whole number of minutes."""
    parts = text.split(':')
    return (int(parts[0]) * 60) + int(parts[1])


# Calculated time difference: minutes elapsed between check-in and check-out
elapsed = CODT - CIDT
CTD = elapsed.dt.total_seconds() / 60

# Staff Worked Duration converted from Hour:Minutes text to minutes
SWD_min = df['Staff Worked Duration'].apply(_hhmm_to_minutes)

In [137]:
# Sanity checks on the reported duration, row by row
reported_minutes = df['Staff Worked Duration (Minutes)']

# 1. The Hour:Minutes duration must agree with the reported minutes column
sanity1 = SWD_min.eq(reported_minutes)

# 2. The reported minutes must be within about one minute of the duration
#    calculated from the check-in/check-out datetimes
#    (threshold is 1.1 rather than 1 to avoid float precision issues)
sanity2 = reported_minutes.sub(CTD).abs().le(1.1)

# A row is "sane" only when both checks pass
df["Sanity Check"] = sanity1 & sanity2

In [138]:
# Display the final DataFrame (cleaned columns plus the "Sanity Check" flag) as this cell's output
df

Unnamed: 0,Service 1 Description (Code),Service Provider,Check-In Date,Check-In Time,Check-Out Date,Check-Out Time,Staff Worked Duration,Staff Worked Duration (Minutes),Sanity Check
0,HSSFXWKND,"Robles, Jessica",04/16/2023,02:51 pm,04/16/2023,11:25 pm,8:34,514,True
1,HSS2,"Hernandez, Elli",04/16/2023,02:54 pm,04/16/2023,11:41 pm,8:47,527,True
2,HSS1,"Duenas, Vick",04/16/2023,02:56 pm,04/16/2023,11:01 pm,8:05,485,True
3,HSS1,"Song, Michelle",04/16/2023,06:50 am,04/16/2023,02:58 pm,8:07,487,True
4,CCR,"Inocencio, Kenneth",04/16/2023,06:52 am,04/16/2023,03:07 pm,8:15,495,True
5,HSSFXW,"Martin, Sheena",04/16/2023,07:04 am,04/16/2023,03:10 pm,8:06,486,True
6,OA2,"Hernandez, Miriam",04/16/2023,10:54 pm,04/17/2023,07:04 am,8:10,490,True
7,IHSS Asleep,"Hernandez, Elli",04/16/2023,11:45 pm,04/17/2023,07:06 am,7:21,441,True
8,CCR,"Duarte, Desirae",04/17/2023,02:46 pm,04/17/2023,10:53 pm,8:06,486,True
9,HSSFXW,"Martin, Sheena",04/17/2023,02:51 pm,04/17/2023,11:01 pm,8:09,489,True
