In [3]:
import pandas as pd
import numpy as np 

# Load the raw visit log.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
raw_visits_csv = "/Users/Air/Documentss/LitteSteps-Analysis/data/visits.csv"
df = pd.read_csv(raw_visits_csv)

# First look at the data: (rows, columns), then the first ten records.
print(df.shape)
display(df.head(10))
# 1.3 Data Cleaning & Preprocessing

# Handling missing values: each free-text / categorical column gets an
# explicit placeholder so later string operations never see NaN.
missing_value_placeholders = {
    'service_type': "unknown",
    'visit_location': "unspecified",
    'nurse_notes': "No notes",
}
for column_name, placeholder in missing_value_placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Format time
from dateutil import parser

def safe_parse(date_str):
    """Parse one raw timestamp cell into a ``pd.Timestamp``, or ``pd.NaT``.

    Parsing is attempted in two stages:

    1. Strict parse against the canonical ``%Y-%m-%d %H:%M:%S`` layout.
    2. If that fails, a lenient ``dateutil`` parse with ``dayfirst=True``
       (so ``04-01-2024 23:17`` is read as 4 January 2024).

    Parameters
    ----------
    date_str : str or missing value
        The raw cell content. Anything ``pd.isna`` considers missing
        short-circuits to ``pd.NaT``.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed timestamp, or ``pd.NaT`` when the value is missing or
        cannot be parsed by either stage.
    """
    if pd.isna(date_str):
        return pd.NaT

    try:
        # Stage 1: exact canonical layout.
        return pd.to_datetime(date_str, format="%Y-%m-%d %H:%M:%S", errors="raise")
    except (ValueError, TypeError):
        # Only parse failures fall through; KeyboardInterrupt, SystemExit
        # and genuine programming errors are no longer swallowed.
        pass

    try:
        # Stage 2: lenient, day-first parse. Wrapped in pd.Timestamp so both
        # stages hand back the same type.
        # NOTE(review): dateutil with dayfirst=True may also read year-first
        # strings that miss the strict layout as year-day-month -- confirm
        # against the raw data if such values exist.
        return pd.Timestamp(parser.parse(date_str, dayfirst=True))
    except (ValueError, TypeError, OverflowError):
        return pd.NaT

# Parse both timestamp columns with the tolerant parser defined above.
timestamp_columns = ['visit_start_time', 'visit_end_time']
for ts_column in timestamp_columns:
    df[ts_column] = df[ts_column].apply(safe_parse)

# Sort by start time
df = df.sort_values(by='visit_start_time')

# Reformat to one standard style: render both columns back to text in a
# single canonical layout.
for ts_column in timestamp_columns:
    df[ts_column] = df[ts_column].dt.strftime('%Y-%m-%d %H:%M:%S')

# Remove duplicates: keep one row per visit_id (the first one in the
# sorted order above).
df = df.drop_duplicates(subset=['visit_id'])

# Standardise categorical variables.
# The lookups run AFTER .str.strip().str.title(), so every key must be spelled
# the way str.title() emits it. str.title() capitalises the letter following
# a hyphen, so a raw "check-up" arrives here as "Check-Up"; the key is spelled
# accordingly (a "Check-up" key could never match).
df['service_type'] = df['service_type'].str.strip().str.title().replace({
    "Med Admin": "Medication Administration",
    "Wund Care": "Wound Care",
    "Phisycal Therapy": "Physical Therapy",
    "Check Up": "General Check-up",
    "Check-Up": "General Check-up",
    "General Check-Up": "General Check-up"
})
# Same approach for locations: trim, title-case, then repair known misspellings.
df['visit_location'] = df['visit_location'].str.strip().str.title().replace({
    "Nrth": "North", "Suth": "South", "Esst": "East", "Weerst": "West"
})

# Replace filler text ("lorem ipsum", any casing) in the notes with "Nil".
is_filler_note = df['nurse_notes'].str.contains("lorem ipsum", case=False, na=False)
df.loc[is_filler_note, 'nurse_notes'] = "Nil"

# Flags for urgent/follow-up: 1 when the (cleaned) note mentions the keyword,
# case-insensitively, otherwise 0.
mentions_urgent = df['nurse_notes'].str.contains("urgent", case=False, na=False)
mentions_followup = df['nurse_notes'].str.contains("follow-up|revisit", case=False, na=False)
df['urgent_flag'] = mentions_urgent.astype(int)
df['followup_flag'] = mentions_followup.astype(int)

# Spot-check the timestamp, note and flag columns side by side.
preview_columns = ['visit_start_time', 'visit_end_time', 'nurse_notes', 'urgent_flag', 'followup_flag']
print(df[preview_columns].head(10))

# Check for row size, column size and peek into the cleaned dataset
print(f"Cleaned dataset shape: {df.shape}")
display(df.head(10))

# Persist the cleaned table without the pandas index.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory.
cleaned_visits_csv = "/Users/Air/Documentss/LitteSteps-Analysis/data/visits_cleaned.csv"
df.to_csv(cleaned_visits_csv, index=False)


(603, 8)


Unnamed: 0,visit_id,patient_id,nurse_id,visit_start_time,visit_end_time,service_type,visit_location,nurse_notes
0,1,P082,N04,04-01-2024 23:17,2024-01-05 00:02:00,Wound Care,Esst,Patient stable.
1,2,P065,N20,2024-01-04 17:12:00,2024-01-04 19:41:00,Check up,Nrth,Patient stable.
2,3,P055,N11,2024-02-05 04:13:00,2024-02-05 05:49:00,Phisycal Therapy,South,
3,4,P006,N15,09-03-2024 03:59,2024-03-09 06:30:00,Wund Care,West,Follow-up urgently required
4,5,P038,N03,2024-01-30 03:24:00,2024-01-30 06:16:00,Physical Therapy,Suth,
5,6,P088,N03,2024-03-18 20:10:00,2024-03-18 21:22:00,Phisycal Therapy,Nrth,Schedule revisit in 2 days
6,7,P099,N02,2024-01-30 01:51:00,2024-01-30 03:09:00,Wund Care,West,Urgent attention needed
7,8,P059,N05,2024-02-03 04:15:00,2024-02-03 06:42:00,Phisycal Therapy,Esst,
8,9,P066,N16,2024-01-12 01:55:00,2024-01-12 04:45:00,Phisycal Therapy,South,Nil
9,10,P068,N09,2024-03-11 00:43:00,2024-03-11 03:10:00,Wund Care,South,Wound will be dressed soon


        visit_start_time       visit_end_time                 nurse_notes  \
181  2024-01-01 00:19:00  2024-01-01 02:48:00                    No notes   
204  2024-01-01 09:28:00  2024-01-01 10:47:00                         Nil   
469  2024-01-01 09:59:00  2024-01-01 10:00:00  Schedule revisit in 2 days   
208  2024-01-01 12:21:00  2024-01-01 14:08:00                    No notes   
399  2024-01-01 17:23:00  2024-01-01 19:56:00                         Nil   
167  2024-01-01 17:59:00  2024-01-01 20:56:00                    No notes   
11   2024-01-01 19:20:00  2024-01-01 19:58:00             Patient stable.   
453  2024-01-01 19:56:00  2024-01-01 21:01:00                         Nil   
345  2024-01-01 20:46:00                  NaN  Schedule revisit in 2 days   
271  2024-01-01 22:36:00                  NaN                         Nil   

     urgent_flag  followup_flag  
181            0              0  
204            0              0  
469            0              1  
208            0

Unnamed: 0,visit_id,patient_id,nurse_id,visit_start_time,visit_end_time,service_type,visit_location,nurse_notes,urgent_flag,followup_flag
181,182,P017,N08,2024-01-01 00:19:00,2024-01-01 02:48:00,Physical Therapy,West,No notes,0,0
204,205,P014,N03,2024-01-01 09:28:00,2024-01-01 10:47:00,Wound Care,East,Nil,0,0
469,470,P067,N04,2024-01-01 09:59:00,2024-01-01 10:00:00,Medication Administration,South,Schedule revisit in 2 days,0,1
208,209,P018,N10,2024-01-01 12:21:00,2024-01-01 14:08:00,Wound Care,West,No notes,0,0
399,400,P077,N08,2024-01-01 17:23:00,2024-01-01 19:56:00,General Check-up,East,Nil,0,0
167,168,P099,N05,2024-01-01 17:59:00,2024-01-01 20:56:00,General Check-up,West,No notes,0,0
11,12,P100,N17,2024-01-01 19:20:00,2024-01-01 19:58:00,Medication Administration,West,Patient stable.,0,0
453,454,P064,N18,2024-01-01 19:56:00,2024-01-01 21:01:00,General Check-up,West,Nil,0,0
345,346,P016,N18,2024-01-01 20:46:00,,Wound Care,South,Schedule revisit in 2 days,0,1
271,272,P017,N06,2024-01-01 22:36:00,,Wound Care,West,Nil,0,0
