In [45]:
# Core data-analysis stack used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
# `plt` must alias the pyplot submodule: the top-level `matplotlib` package does
# not provide plotting functions such as `plt.subplots()` or `plt.show()`.
import matplotlib.pyplot as plt

# Healthcare No Show Appointment Data

### About Dataset
This dataset records whether a patient showed up for a medical appointment or not.

The dataset has about 107K rows and 15 columns, including one target variable (`showed_up`), which is substantial enough to train a machine learning model.

We can use this data to predict whether someone would show up for a medical appointment or not.

In [46]:
# Load the appointments CSV into a DataFrame and preview the first rows.
# The location is kept in one named variable so it is easy to spot and change.
csv_path = '/content/drive/MyDrive/Datasets/healthcare_noshows_appointments.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Showed_up,Date.diff
0,29872500000000.0,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,False,True,False,False,False,False,True,0
1,558997800000000.0,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,False,False,False,False,False,True,0
2,4262962000000.0,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,False,False,False,False,False,False,True,0
3,867951200000.0,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,False,False,False,False,False,False,True,0
4,8841186000000.0,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,True,True,False,False,False,True,0


In [47]:
# Dataset info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106987 entries, 0 to 106986
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       106987 non-null  float64
 1   AppointmentID   106987 non-null  int64  
 2   Gender          106987 non-null  object 
 3   ScheduledDay    106987 non-null  object 
 4   AppointmentDay  106987 non-null  object 
 5   Age             106987 non-null  int64  
 6   Neighbourhood   106987 non-null  object 
 7   Scholarship     106987 non-null  bool   
 8   Hipertension    106987 non-null  bool   
 9   Diabetes        106987 non-null  bool   
 10  Alcoholism      106987 non-null  bool   
 11  Handcap         106987 non-null  bool   
 12  SMS_received    106987 non-null  bool   
 13  Showed_up       106987 non-null  bool   
 14  Date.diff       106987 non-null  int64  
dtypes: bool(7), float64(1), int64(3), object(4)
memory usage: 7.2+ MB


In [48]:
# Count missing values per column (`isna` is the canonical name; `isnull` is its alias)
df.isna().sum()

Unnamed: 0,0
PatientId,0
AppointmentID,0
Gender,0
ScheduledDay,0
AppointmentDay,0
Age,0
Neighbourhood,0
Scholarship,0
Hipertension,0
Diabetes,0


### No null values present

In [49]:
# Count fully duplicated rows, i.e. rows whose every column matches an earlier row
df.duplicated(keep='first').sum()

np.int64(0)

### No duplicate rows found

In [50]:
# Normalise column names in one pass: lower-case them, then strip embedded spaces
df.columns = df.columns.str.lower().str.replace(' ', '')
df.columns

Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'showed_up',
       'date.diff'],
      dtype='object')

In [51]:
# Re-format 'scheduledday' and 'appointmentday' as dd-mm-yyyy text.
# The raw values previewed by df.head() are yyyy-mm-dd, so the parse format is
# stated explicitly: the result then never depends on pandas' format inference,
# and re-running this cell on the already re-formatted values fails loudly
# instead of risking a silent day/month mix-up.
# NOTE: strftime returns strings, so both columns end up as text (object dtype),
# not datetimes, and dd-mm-yyyy text does not sort chronologically.
for date_col in ('scheduledday', 'appointmentday'):
    parsed_dates = pd.to_datetime(df[date_col], format='%Y-%m-%d')
    df[date_col] = parsed_dates.dt.strftime('%d-%m-%Y')

In [52]:
# Preview the first rows after the column renaming and date re-formatting
df.head()

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,showed_up,date.diff
0,29872500000000.0,5642903,F,29-04-2016,29-04-2016,62,JARDIM DA PENHA,False,True,False,False,False,False,True,0
1,558997800000000.0,5642503,M,29-04-2016,29-04-2016,56,JARDIM DA PENHA,False,False,False,False,False,False,True,0
2,4262962000000.0,5642549,F,29-04-2016,29-04-2016,62,MATA DA PRAIA,False,False,False,False,False,False,True,0
3,867951200000.0,5642828,F,29-04-2016,29-04-2016,8,PONTAL DE CAMBURI,False,False,False,False,False,False,True,0
4,8841186000000.0,5642494,F,29-04-2016,29-04-2016,56,JARDIM DA PENHA,False,True,True,False,False,False,True,0


In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a minimum date.diff of -6; check whether
# negative values are expected before using this column.
df.describe()

Unnamed: 0,patientid,appointmentid,age,date.diff
count,106987.0,106987.0,106987.0,106987.0
mean,147281400000000.0,5675434.0,38.316085,10.166721
std,255826700000000.0,71332.74,22.466214,15.263508
min,39217.84,5030230.0,1.0,-6.0
25%,4173523000000.0,5640490.0,19.0,0.0
50%,31724630000000.0,5680744.0,38.0,4.0
75%,94336000000000.0,5725634.0,56.0,14.0
max,999981600000000.0,5790484.0,115.0,179.0
