#### Feature Engineering
**Dataset:** KaggleV2-May-2016.zip  
**Author:** Luis Sergio Pastrana Lemus  
**Date:** 2025-05-24

## Libraries

In [1]:
import os
import sys
import pprint
from pathlib import Path
import pandas as pd

# Resolve the project root dynamically: take the directory the notebook is run from
# and move one level up.
project_root = Path.cwd().parent

# Make the project root importable (so the `src` package can be found) if it is not already.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import only the helpers this notebook actually calls instead of a wildcard import,
# so it stays obvious where each name comes from.
from src import format_notebook, load_dataset_from_csv

## Path to Data file

In [2]:
# Cleaned patients dataset stored under <project_root>/data/processed.
data_file_path_clean = project_root.joinpath("data", "processed", "patients_clean.csv")

# Load the CSV through the project helper; both date columns are requested as datetimes
# so that day differences can be computed on them later.
df_patients_clean = load_dataset_from_csv(
    data_file_path_clean,
    header='infer',
    parse_dates=['scheduled_day', 'appointment_day'],
)

In [3]:
# Call the project's notebook formatting helper (imported from `src`).
# NOTE(review): its exact effect is not visible in this notebook — presumably it sets
# display options; confirm in src.
format_notebook()

### Waiting time between scheduling and appointment

In [4]:
# Inspect the columns, dtypes and non-null counts of the loaded frame before deriving features.
df_patients_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype              
---  ------           --------------   -----              
 0   patient_id       110527 non-null  int64              
 1   appointment_id   110527 non-null  int64              
 2   gender           110527 non-null  object             
 3   scheduled_day    110527 non-null  datetime64[ns, UTC]
 4   appointment_day  110527 non-null  datetime64[ns, UTC]
 5   age              110527 non-null  int64              
 6   neighbourhood    110527 non-null  object             
 7   scholarship      110527 non-null  bool               
 8   hipertension     110527 non-null  bool               
 9   diabetes         110527 non-null  bool               
 10  alcoholism       110527 non-null  bool               
 11  handcap          110527 non-null  bool               
 12  sms_received     110527 non-null  bool               
 13 

In [5]:
# Count how many rows each patient_id has (most frequent first) to see how many
# patients appear more than once in the dataset.
print(df_patients_clean['patient_id'].value_counts())

patient_id
822145925426128    88
99637671331        84
26886125921145     70
33534783483176     65
871374938638855    62
                   ..
734536231958495     1
78124564369297      1
56394729949972      1
733688164476661     1
8841186448183       1
Name: count, Length: 62299, dtype: int64


In [6]:
# Build the working frame: identifiers, the outcome and both date columns,
# plus the waiting time expressed in whole calendar days.
df_patients_waiting_time = df_patients_clean.loc[
    :, ['patient_id', 'appointment_id', 'no_show', 'appointment_day', 'scheduled_day']
].assign(
    # Both timestamps are truncated to midnight before subtracting, so the time of day
    # at which the appointment was scheduled does not influence the day count.
    days_waiting=lambda frame: (
        frame['appointment_day'].dt.normalize() - frame['scheduled_day'].dt.normalize()
    ).dt.days
)

# Reproducible random sample (fixed seed) to eyeball the new column.
print(df_patients_waiting_time.sample(25, random_state=333))

             patient_id  appointment_id no_show           appointment_day             scheduled_day  days_waiting
46995    62566927247961         5607061     yes 2016-05-18 00:00:00+00:00 2016-04-20 11:02:15+00:00            28
82570     3996739777887         5645533      no 2016-05-02 00:00:00+00:00 2016-05-02 10:03:24+00:00             0
71800    74981215322154         5664902     yes 2016-05-05 00:00:00+00:00 2016-05-05 13:39:17+00:00             0
78525   746757565862383         5700616      no 2016-05-17 00:00:00+00:00 2016-05-16 09:37:32+00:00             1
63501    49375648372849         5704393      no 2016-05-19 00:00:00+00:00 2016-05-16 15:37:53+00:00             3
38007    85922122763292         5634129     yes 2016-05-25 00:00:00+00:00 2016-04-28 10:25:11+00:00            27
104172   58171584692296         5770192      no 2016-06-03 00:00:00+00:00 2016-06-03 09:53:02+00:00             0
10698     8284516281776         5687146      no 2016-05-11 00:00:00+00:00 2016-05-11 14:

In [7]:
# Encode the outcome as an integer flag: 'no' -> 0, 'yes' -> 1.
df_patients_waiting_time['no_show_bin'] = df_patients_waiting_time['no_show'].map({'no': 0, 'yes': 1})

# Series.map turns any label that is missing from the dict into NaN (and the column
# into float), so fail loudly here instead of silently exporting a partially encoded target.
assert df_patients_waiting_time['no_show_bin'].notna().all(), "Unexpected label found in 'no_show'"

# Same fixed-seed sample as before, now including the encoded column.
print(df_patients_waiting_time.sample(25, random_state=333))

             patient_id  appointment_id no_show           appointment_day             scheduled_day  days_waiting  no_show_bin
46995    62566927247961         5607061     yes 2016-05-18 00:00:00+00:00 2016-04-20 11:02:15+00:00            28            1
82570     3996739777887         5645533      no 2016-05-02 00:00:00+00:00 2016-05-02 10:03:24+00:00             0            0
71800    74981215322154         5664902     yes 2016-05-05 00:00:00+00:00 2016-05-05 13:39:17+00:00             0            1
78525   746757565862383         5700616      no 2016-05-17 00:00:00+00:00 2016-05-16 09:37:32+00:00             1            0
63501    49375648372849         5704393      no 2016-05-19 00:00:00+00:00 2016-05-16 15:37:53+00:00             3            0
38007    85922122763292         5634129     yes 2016-05-25 00:00:00+00:00 2016-04-28 10:25:11+00:00            27            1
104172   58171584692296         5770192      no 2016-06-03 00:00:00+00:00 2016-06-03 09:53:02+00:00            

In [9]:
# Re-derive the project root (one level above the working directory) and build
# the destination path for the engineered dataset.
project_root = Path.cwd().parent
processed_path = project_root.joinpath("data", "processed", "patients_dayswaiting_noshow.csv")

# Persist the frame without the pandas index so the CSV only contains the data columns.
df_patients_waiting_time.to_csv(processed_path, index=False)

##### `LSPL`

**Note:** Feature engineering for the relationship between no-shows and the waiting time between scheduling and appointment.

##### 🔍 🔗 Related notebook: see [EDA Notebook](./eda.ipynb) for detailed insights.