In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns

# Read the three CSV inputs. For each frame, keep a five-row preview and a
# per-column count of missing values so the structure can be inspected below.

# Appointment records
appointments_data = pd.read_csv('data/appointments_data.csv')
appointments_data_head = appointments_data.head(5)
appointments_data_missing = appointments_data.isna().sum()

# A/B test event log
ab_test_data = pd.read_csv('data/ab_test_data.csv')
ab_test_data_head = ab_test_data.head(5)
ab_test_data_missing = ab_test_data.isna().sum()

# App data
app_data = pd.read_csv('data/app_data.csv')
app_data_head = app_data.head(5)
app_data_missing = app_data.isna().sum()

# Cell output: the three previews first, then the three missing-value counts.
(
    appointments_data_head,
    ab_test_data_head,
    app_data_head,
    appointments_data_missing,
    ab_test_data_missing,
    app_data_missing,
)


(   patient_id  age  gender  doctor_name   appointment_reason appointment_date  \
 0           1   38  Female    Dr. Smith  Screening for COVID       2023-02-05   
 1           2   53    Male    Dr. Brown           Ultrasound       2023-05-09   
 2           3   67  Female    Dr. Smith          Annual Exam       2023-01-30   
 3           4   63  Female  Dr. Johnson             Flu Shot       2023-04-08   
 4           5   77    Male    Dr. Jones    Allergy injection       2023-02-24   
 
   appointment_status  
 0           Attended  
 1           Attended  
 2           Attended  
 3       Not Attended  
 4       Not Attended  ,
    patient_id    group             event_name       event_datetime
 0           1  Control   attended_appointment  2023-02-05 15:40:21
 1           2     Test          reminder_sent  2023-05-07 09:19:13
 2           2     Test        reminder_viewed  2023-05-08 04:25:10
 3           2     Test  appointment_confirmed  2023-05-08 04:26:36
 4           2     Te

In [9]:
# Data-quality pass over the three frames: dtypes, date parsing, duplicates, age range.

# Snapshot the column dtypes as loaded, i.e. BEFORE the date conversion below,
# so the date columns are still reported with their original dtype.
appointments_data_info = appointments_data.dtypes
ab_test_data_info = ab_test_data.dtypes
app_data_info = app_data.dtypes

# Convert 'appointment_date' and 'event_datetime' to datetime for later analysis.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising,
# which is why the NaT check that follows is needed.
appointments_data['appointment_date'] = pd.to_datetime(appointments_data['appointment_date'], errors='coerce')
ab_test_data['event_datetime'] = pd.to_datetime(ab_test_data['event_datetime'], errors='coerce')

# Rows whose date is NaT after conversion (value was missing or could not be parsed)
invalid_appointments_dates = appointments_data[appointments_data['appointment_date'].isna()]
invalid_event_dates = ab_test_data[ab_test_data['event_datetime'].isna()]

# Number of fully duplicated rows in each dataset
appointments_duplicates = appointments_data.duplicated().sum()
ab_test_duplicates = ab_test_data.duplicated().sum()
app_data_duplicates = app_data.duplicated().sum()

# Age sanity check: flag negative ages AND implausibly large ones.
# A check on negatives alone would let data-entry errors such as 250 or 999 through.
MAX_PLAUSIBLE_AGE = 120  # upper bound for a believable patient age, in years
patient_age = appointments_data['age']
age_outliers = appointments_data[(patient_age < 0) | (patient_age > MAX_PLAUSIBLE_AGE)]

# Compile the findings
appointments_data_info, ab_test_data_info, app_data_info, invalid_appointments_dates, invalid_event_dates, appointments_duplicates, ab_test_duplicates, app_data_duplicates, age_outliers


(patient_id             int64
 age                    int64
 gender                object
 doctor_name           object
 appointment_reason    object
 appointment_date      object
 appointment_status    object
 dtype: object,
 patient_id         int64
 group             object
 event_name        object
 event_datetime    object
 dtype: object,
 patient_id         int64
 traffic_source    object
 device            object
 dtype: object,
 Empty DataFrame
 Columns: [patient_id, age, gender, doctor_name, appointment_reason, appointment_date, appointment_status]
 Index: [],
 Empty DataFrame
 Columns: [patient_id, group, event_name, event_datetime]
 Index: [],
 np.int64(0),
 np.int64(0),
 np.int64(0),
 Empty DataFrame
 Columns: [patient_id, age, gender, doctor_name, appointment_reason, appointment_date, appointment_status]
 Index: [])

### Conduct an A/B test analysis to determine whether the app reminder feature in the Test group leads to a higher rate of appointment confirmations compared to the Control group.

In [10]:
# Confirmation rate per experiment group, compared with a chi-square test.
# A patient counts as "confirmed" when they have at least one
# 'appointment_confirmed' event. Rates are fractions (1.0 means 100%).

is_confirmation = ab_test_data['event_name'].eq('appointment_confirmed')
confirmation_events = ab_test_data.loc[is_confirmation]

# Confirmation events split by group
control_group_confirmations = confirmation_events.loc[confirmation_events['group'].eq('Control')]
test_group_confirmations = confirmation_events.loc[confirmation_events['group'].eq('Test')]

# Group sizes: distinct patients with at least one row of any event type in the log.
# NOTE(review): a patient with no logged events would be missing from these
# denominators - confirm that every enrolled patient has at least one row.
total_control_patients = ab_test_data.loc[ab_test_data['group'].eq('Control'), 'patient_id'].nunique()
total_test_patients = ab_test_data.loc[ab_test_data['group'].eq('Test'), 'patient_id'].nunique()

# Distinct patients who confirmed, per group
control_confirmations_count = control_group_confirmations['patient_id'].nunique()
test_confirmations_count = test_group_confirmations['patient_id'].nunique()

# Share of each group's patients who confirmed
control_confirmation_rate = control_confirmations_count / total_control_patients
test_confirmation_rate = test_confirmations_count / total_test_patients

# 2x2 table for the test: one row per group, columns = [confirmed, not confirmed]
contingency_table_confirmations = [
    [confirmed, group_size - confirmed]
    for confirmed, group_size in (
        (control_confirmations_count, total_control_patients),
        (test_confirmations_count, total_test_patients),
    )
]

# Only the statistic and p-value are kept; the other two returned values are discarded.
confirmation_test_result = chi2_contingency(contingency_table_confirmations)
chi2_confirmations, p_value_confirmations, _, _ = confirmation_test_result

(
    control_confirmation_rate,
    test_confirmation_rate,
    chi2_confirmations,
    p_value_confirmations,
)


(0.0, 0.979560938682816, np.float64(2076.5438919116614), np.float64(0.0))

<!-- # Test Group Confirmation Rate: 97.96%

# Almost 98% of the patients in the Test group confirmed their appointments after receiving a reminder.
# Chi-Square Test:

# The chi-square statistic is 2076.54 with a p-value of 0.0. This indicates a statistically significant difference between the Control and Test groups, meaning that the reminder feature has a significant impact on the likelihood of confirming an appointment. -->


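### Test Group Confirmation Rate: 97.96% (Control Group: 0.0%)

### Almost 98% of the patients in the Test group confirmed their appointments after receiving a reminder, while no patient in the Control group has an `appointment_confirmed` event.
### Chi-Square Test:

### The chi-square statistic is 2076.54, and the p-value is reported as 0.0, which means it is too small to be represented in floating point rather than exactly zero. This indicates a statistically significant difference between the Control and Test groups in the likelihood of confirming an appointment. Note that if an appointment can only be confirmed through the reminder, a Control rate of 0% is expected by design, so this comparison should be read with that in mind.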


In [11]:
# Attendance rate per experiment group, compared with a chi-square test.
# A patient counts as "attended" when they have at least one
# 'attended_appointment' event. Rates are fractions in [0, 1], so 1.0 means 100%.

attended_mask = ab_test_data['event_name'].eq('attended_appointment')
attendance_events = ab_test_data.loc[attended_mask]

# Attendance events split by group
control_group_attendance = attendance_events.loc[attendance_events['group'].eq('Control')]
test_group_attendance = attendance_events.loc[attendance_events['group'].eq('Test')]

# Distinct patients who attended, per group
control_attendance_count = control_group_attendance['patient_id'].nunique()
test_attendance_count = test_group_attendance['patient_id'].nunique()

# Denominators are the group sizes computed in the confirmation analysis:
# distinct patients with at least one logged event in each group.
# NOTE(review): if Control patients who did not attend have no events in the log,
# they are absent from the denominator and the Control rate is 1.0 by construction.
# Verify against the full appointment list before interpreting the result.
control_attendance_rate = control_attendance_count / total_control_patients
test_attendance_rate = test_attendance_count / total_test_patients

# 2x2 table for the test: one row per group, columns = [attended, did not attend]
contingency_table_attendance = [
    [attended, group_size - attended]
    for attended, group_size in (
        (control_attendance_count, total_control_patients),
        (test_attendance_count, total_test_patients),
    )
]

# Only the statistic and p-value are kept; the other two returned values are discarded.
attendance_test_result = chi2_contingency(contingency_table_attendance)
chi2_attendance, p_value_attendance, _, _ = attendance_test_result

(
    control_attendance_rate,
    test_attendance_rate,
    chi2_attendance,
    p_value_attendance,
)


(1.0,
 0.6555639666919001,
 np.float64(375.7199009211618),
 np.float64(1.0637015664247734e-83))

### Control Group Attendance Rate: 100%

### The computed rate is 1.0, which is a fraction (100%), not 1%: every Control-group patient that appears in the A/B event log has an `attended_appointment` event.
### Test Group Attendance Rate: 65.56%

### About 65.56% of patients in the Test group attended their appointments after receiving a reminder.
### Chi-Square Test:

### The chi-square statistic is 375.72 with a p-value of 1.06e-83. This shows a highly statistically significant difference between the Control and Test groups; as computed, however, the Test-group rate is lower than the Control-group rate, not higher.
### Interpretation:
### These figures do not show that the reminder increased attendance. The group totals are counted from patients with at least one logged event, so a Control rate of exactly 100% may simply mean that Control patients who did not attend have no events in the log and are missing from the denominator. Before drawing any conclusion about the reminder's effect on attendance, the Control denominator should be checked against the full appointment list (for example, `appointment_status` in the appointments data).



<!-- Control Group Attendance Rate: 100%

The computed rate is 1.0, which is a fraction (100%), not 1%: every Control-group patient that appears in the A/B event log has an attended_appointment event.
Test Group Attendance Rate: 65.56%

About 65.56% of patients in the Test group attended their appointments after receiving a reminder.
Chi-Square Test:

The chi-square statistic is 375.72 with a p-value of 1.06e-83. This shows a highly statistically significant difference between the Control and Test groups; as computed, however, the Test-group rate is lower than the Control-group rate, not higher.
Interpretation:
These figures do not show that the reminder increased attendance. The group totals are counted from patients with at least one logged event, so a Control rate of exactly 100% may simply mean that Control patients who did not attend have no events in the log and are missing from the denominator. Before drawing any conclusion about the reminder's effect on attendance, the Control denominator should be checked against the full appointment list.
 -->
