In [None]:
import pandas as pd

# Load a pickled DataFrame into `df_encoded` (presumably the encoded, imputed
# feature table, judging by the file name — confirm).
# NOTE(review): `df_encoded` is reassigned further down from
# "df_encoded_imputed_final_removed_low_variance.pkl" before any visible cell
# reads it, so this load looks redundant — confirm before removing.
df_encoded = pd.read_pickle("df_encoded_files/df_encoded_imputed_final.pkl")

In [None]:
"""PROCESSING OUTCOME"""

import pandas as pd

file_path = "data/H-43413 Data Add On.xlsx"
sheet_name = "Outpatient Encounters"

# Cohort-definition parameters, gathered here so they can be tuned in one place.
PSYCHIATRY_DEPARTMENTS = ["DOW ADULT PSYCHIATRY", "YAW CHILD PSYCHE", "YAW PEDI PSYCH"]
FIRST_ENCOUNTER_AFTER = pd.Timestamp("2016-01-01")  # encounters must be strictly later than this
FOLLOW_UP_END = pd.Timestamp("2022-12-01")          # presumably the end of the data extract — confirm
MIN_FOLLOW_UP_DAYS = 180                            # required days between first encounter and FOLLOW_UP_END

df = pd.read_excel(file_path, sheet_name=sheet_name)

# Keep completed encounters in the psychiatry departments only.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning on a filtered slice.
df = df[
    df["DEPARTMENT NAME"].isin(PSYCHIATRY_DEPARTMENTS)
    & (df["APPOINTMENT STATUS"] == "Completed")
].copy()

# Values that do not match the format are coerced to NaT; NaT rows are then
# dropped by the date comparison below because NaT compares False.
df['ENCOUNTER DATE'] = pd.to_datetime(df['ENCOUNTER DATE'], errors='coerce', format="%m/%d/%Y %I:%M:%S %p")

df = df[df['ENCOUNTER DATE'] > FIRST_ENCOUNTER_AFTER]

# Per-patient first/last encounter date. 'max_interval' is the number of days
# from the patient's first encounter to FOLLOW_UP_END; patients need more than
# MIN_FOLLOW_UP_DAYS of it to be eligible.
filtered_patients = df.groupby("ID")['ENCOUNTER DATE'].agg(['min', 'max'])
filtered_patients['max_interval'] = (FOLLOW_UP_END - filtered_patients['min']).dt.days
valid_patients = filtered_patients[filtered_patients['max_interval'] > MIN_FOLLOW_UP_DAYS].index

df = df[df["ID"].isin(valid_patients)]

# NOTE(review): eligibility above is computed on all visit classes, while the
# gap analysis below uses OUTPATIENT visits only — confirm this is intended.
df_outpatient = df[df['VISIT CLASS'] == 'OUTPATIENT'].sort_values(by=['ID', 'ENCOUNTER DATE'])

# Time elapsed since the same patient's previous outpatient visit (NaT on each
# patient's first visit).
df_outpatient['Time Since Last'] = df_outpatient.groupby('ID')['ENCOUNTER DATE'].diff()

# Define dropout threshold: a gap between consecutive visits longer than this
# counts as a lapse in care.
dropout_threshold = pd.Timedelta(days=180)

def classify_dropout(group, threshold=None):
    """Classify one patient's visit history as Active, Re-engaged or Dropped Out.

    Parameters
    ----------
    group : pandas.DataFrame
        Visits of a single patient with a 'Time Since Last' column holding the
        time elapsed since the previous visit (NaT for the first visit).
        Rows are assumed to be in chronological order.
    threshold : pandas.Timedelta, optional
        Gap length above which a gap counts as "long". Defaults to the
        module-level ``dropout_threshold`` (read at call time).

    Returns
    -------
    str
        "Active"      - no gap exceeds the threshold.
        "Dropped Out" - the first long gap ends at the patient's last visit,
                        or another long gap follows it.
        "Re-engaged"  - exactly one long gap, followed by at least one further
                        visit and no further long gaps.

    Notes
    -----
    Only gaps *between* visits are examined; the time from the last visit to
    the end of the observation window is not considered here.
    """
    if threshold is None:
        threshold = dropout_threshold

    # NaT (first visit) compares False, so it never counts as a long gap.
    # Working on a plain boolean array keeps everything positional, which also
    # works when the group's index contains duplicate labels.
    is_long_gap = (group['Time Since Last'] > threshold).to_numpy()

    if not is_long_gap.any():
        return "Active"

    # Position of the visit that ends the first long gap.
    first_long_gap_position = int(is_long_gap.argmax())

    # Gap flags of every visit after that returning visit.
    later_gaps = is_long_gap[first_long_gap_position + 1:]

    # No visit after the return visit, or a second long gap later on.
    if later_gaps.size == 0 or later_gaps.any():
        return "Dropped Out"

    return "Re-engaged"

# Label each patient from their ordered outpatient visits, then flatten the
# ID-indexed result into a two-column frame: ID, Dropout Status.
dropout_by_patient = df_outpatient.groupby('ID').apply(classify_dropout)
df_dropout_status = dropout_by_patient.reset_index(name='Dropout Status')


In [10]:
# Tally how many patients fall into each dropout class
dropout_counts = df_dropout_status['Dropout Status'].value_counts()

# Express each tally as a share of all classified patients
total_patients = dropout_counts.sum()
dropout_percentages = dropout_counts / total_patients * 100

# Report the counts first, then the percentages rendered as "<value>%"
formatted_percentages = dropout_percentages.round(2).astype(str) + "%"

print("Dropout Status Frequency:", dropout_counts, sep="\n")
print("\nDropout Status Percentages:", formatted_percentages, sep="\n")

Dropout Status Frequency:
Dropout Status
Active         1488
Dropped Out     678
Re-engaged      462
Name: count, dtype: int64

Dropout Status Percentages:
Dropout Status
Active         56.62%
Dropped Out     25.8%
Re-engaged     17.58%
Name: count, dtype: object


In [11]:
import pandas as pd

# Pickled frame that the dropout outcome is attached to in the following cells
encoded_features_path = "df_encoded_files/df_encoded_imputed_final_removed_low_variance.pkl"
df_encoded = pd.read_pickle(encoded_features_path)

In [12]:
# Ensure both IDs are the same type (convert df_encoded['ID'] to int)
df_encoded['ID'] = df_encoded['ID'].astype(int)

# Make the cell safe to re-run: if an earlier run already attached the outcome,
# drop it first. Otherwise the merge below would produce 'Dropout Status_x' /
# 'Dropout Status_y' instead of a single outcome column.
df_encoded = df_encoded.drop(columns=['Dropout Status'], errors='ignore')

# Keep only rows in df_encoded where ID exists in df_dropout_status
df_encoded = df_encoded[df_encoded['ID'].isin(df_dropout_status['ID'])]

# Merge Dropout Status into df_encoded.
# validate='many_to_one' raises if df_dropout_status ever holds the same ID
# twice, instead of silently duplicating feature rows.
df_encoded = df_encoded.merge(df_dropout_status, on='ID', how='left', validate='many_to_one')

# Print updated dataframe info
print(df_encoded.shape)
print(df_encoded.head())


(2380, 102)
      ID  demo_age  RPL_THEME1  GENDER_F  GENDER_M  PRIMARY_RACE_Asian  \
0  10005        82    0.660000         0         1                   0   
1  10027        76    1.000000         0         1                   0   
2  10033        66    0.620000         0         1                   0   
3  10041        69    1.000000         1         0                   0   
4  10047        77    0.089335         1         0                   0   

   PRIMARY_RACE_Black / African American  PRIMARY_RACE_Hispanic or Latino  \
0                                      0                                0   
1                                      1                                0   
2                                      0                                0   
3                                      0                                0   
4                                      0                                0   

   PRIMARY_RACE_Other  PRIMARY_RACE_Unknown  ...  F11.90  F12.20  F43.21  \
0   

In [13]:
# Map dropout status to numerical values
dropout_status_codes = {
    'Active': 0,
    'Re-engaged': 1,
    'Dropped Out': 2
}
status_labels = df_encoded['Dropout Status']
status_encoded = status_labels.map(dropout_status_codes)

# Series.map turns every value that is not a key of the mapping into NaN.
# That includes the already-encoded 0/1/2 values if this cell is run twice,
# which would silently wipe the outcome column. Fail loudly instead.
unmapped_values = status_labels[status_encoded.isna()].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        "Unexpected 'Dropout Status' values "
        f"{list(unmapped_values)}; expected one of {list(dropout_status_codes)}. "
        "If the column is already encoded, re-run the merge cell before this one."
    )

df_encoded['Dropout Status'] = status_encoded.astype('int64')

# Print value counts to verify
print(df_encoded['Dropout Status'].value_counts())
print(df_encoded.head(20))

Dropout Status
0    1332
2     617
1     431
Name: count, dtype: int64
       ID  demo_age  RPL_THEME1  GENDER_F  GENDER_M  PRIMARY_RACE_Asian  \
0   10005        82    0.660000         0         1                   0   
1   10027        76    1.000000         0         1                   0   
2   10033        66    0.620000         0         1                   0   
3   10041        69    1.000000         1         0                   0   
4   10047        77    0.089335         1         0                   0   
5   10049        59    0.010000         0         1                   0   
6   10055        87    0.600000         1         0                   0   
7   10056        78    0.870000         1         0                   0   
8   10068        48    0.910000         0         1                   0   
9   10072        64    0.180000         1         0                   0   
10  10079        66    0.114930         1         0                   0   
11  10085        62    0.1900

In [14]:
# Persist the feature table together with its encoded outcome column
outcome_pickle_path = "df_encoded_files/df_encoded_final_with_outcome.pkl"
df_encoded.to_pickle(outcome_pickle_path)