### Paper 1 - eating behavior

#### Define the relevant directories used in this paper

Data is pulled from the standardized data folder; subsequently, it is stored and managed in the paper 1 folder. 

In [4]:
import os

# Define the source and output directories
source_directory = r"C:\Users\Felhasználó\Desktop\Projects\PNK_DB2\DB2_standard"
paper1_directory = r"C:\Users\Felhasználó\Desktop\Projects\PNK_DB2\paper1_emotional"

# Ensure the output directory exists
os.makedirs(paper1_directory, exist_ok=True)

#### Create a research question-specific SQL database subset 

Check those medical records where any/3+/all emotional values are available, and filter the database to contain only the specified patients and medical records. Save the data to 3 new SQL files - one with any, one with some, one with all values available. For research purposes, the last one is most likely to be used. The first two may be relevant if trying to increase the sample size for one or a few specific emotional values. 

In [5]:
import sqlite3
import pandas as pd
import os

# Use the above defined directories
db_path = os.path.join(source_directory, "pnk_db2_colclean.sqlite")
conn = sqlite3.connect(db_path)

# List all tables in the database
query_tables = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(query_tables, conn)
table_names = tables['name'].tolist()

# Define criteria for filtering for any/3+/all emotional values available
def create_filtered_database(criteria, output_filename):
    # Set up the appropriate query based on the criteria for the three scenarios
    if criteria == "any":
        # to select records where at least one emotional variable is not null
        query = """
        SELECT medical_record_id, patient_id
        FROM medical_records_colclean
        WHERE hunger IS NOT NULL
           OR satiety IS NOT NULL
           OR emotional_eating IS NOT NULL
           OR emotional_eating_value IS NOT NULL
           OR quantity_control IS NOT NULL
           OR impulse_control IS NOT NULL;
        """
    elif criteria == "3plus":
        # to select records where at least three emotional variables are not null
        query = """
        SELECT medical_record_id, patient_id
        FROM medical_records_colclean
        WHERE (CASE WHEN hunger IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN satiety IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN emotional_eating IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN emotional_eating_value IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN quantity_control IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN impulse_control IS NOT NULL THEN 1 ELSE 0 END) >= 3;
        """
    elif criteria == "all":
        # to select records where all emotional variables are not null 
        query = """
        SELECT medical_record_id, patient_id
        FROM medical_records_colclean
        WHERE hunger IS NOT NULL
          AND satiety IS NOT NULL
          AND emotional_eating IS NOT NULL
          AND emotional_eating_value IS NOT NULL
          AND quantity_control IS NOT NULL
          AND impulse_control IS NOT NULL;
        """
    
    # Get the relevant records, and extract their medical record and patient IDs
    relevant_records = pd.read_sql_query(query, conn)
    relevant_medical_record_ids = tuple(relevant_records['medical_record_id'])
    relevant_patient_ids = tuple(relevant_records['patient_id'])
    
    # Create a new database in the output directory
    output_db_path = os.path.join(paper1_directory, output_filename)
    filtered_conn = sqlite3.connect(output_db_path)
    
    # Filter each table in the source SQl to only contain the records that comply the criteria of the given scenario; 
    # ie. they have records where any/3+/all emotional variables are available
    for table_name in table_names:
        if table_name.startswith("sqlite_"):
            # Skip any SQLite system tables
            continue 
        # In case of tables that may contain several medical records from the same patient, 
        # filter by medical_record_id
        if table_name == "medical_records_colclean" or table_name == "prescriptions_colclean":
            query = f"""
            SELECT * 
            FROM {table_name}
            WHERE medical_record_id IN {relevant_medical_record_ids}
            """
        else:
            # For all other tables, filter by patient_id only, as medical record ID is not available in those
            query_check_column = f"PRAGMA table_info({table_name});"
            columns = pd.read_sql_query(query_check_column, conn)
            if 'patient_id' not in columns['name'].values:
                continue  # Skip tables without patient_id
            query = f"""
            SELECT *
            FROM {table_name}
            WHERE patient_id IN {relevant_patient_ids}
            """
        
        # Execute the given query and save the result in a new SQLite database
        filtered_data = pd.read_sql_query(query, conn)
        filtered_data.to_sql(table_name, filtered_conn, index=False, if_exists="replace")
    
    filtered_conn.close()
    return len(relevant_records), len(set(relevant_records['patient_id']))

# Create and save the three databases for the three scenarios - any/3+/all emotional variables available
any_count, any_patients = create_filtered_database("any", "emotional_any_notna.sqlite")
three_plus_count, three_plus_patients = create_filtered_database("3plus", "emotional_3plus_notna.sqlite")
all_count, all_patients = create_filtered_database("all", "emotional_all_notna.sqlite")

# Close the connection
conn.close()

# Print summary
print(f"Any emotional data points are available in {any_count} records from {any_patients} patients")
print(f"At least 3 emotional data points are available in {three_plus_count} records from {three_plus_patients} patients")
print(f"All emotional data points are available in {all_count} records from {all_patients} patients")

Any emotional data points are available in 2482 records from 2437 patients
At least 3 emotional data points are available in 2169 records from 2132 patients
All emotional data points are available in 1853 records from 1826 patients


#### Clean and link measurements to prescriptions and medical records

Based on the patient ID and the date of a given measurements, look for prescriptions with the same patient ID that cover the range of time in which the measurement was taken. This way, measurements can be linked to important metadata, such as the prescription and medical record they belong to, the step of the programme they were taken in, etc. 

In summary, this is a key step in the research, without which data on any measurement's identity would be insufficient, and measurements from different prescriptions of the same individual could be mixed, for example. In a previous attempt, I tried identifying blocks of measurements as those that are taken within two months of each other, but I consider this a much more solid approach. 

It is important to note that some patients may take repeated measurements on the same occasion. These duplicates need to be removed, as they inflate the dataset. 

After removing the duplicates, measurements and prescriptions are linked in a two-step process. 

First, measurements are linked to all possible prescriptions that can belong to them based on the shared patient ID (this scenario where every option is linked to every option is called a Cartesian product). 

After, these possible links are filtered by date: a measurement belongs to a prescription if it is within its validity period, or is 5 days within its start or end dates. In the latter case, a measurement may be assigned to multiple prescriptions; if this happens, it is assigned to the one it is closer to in time. 

If a measurement is not succesfully linked to any prescription, it is lost. 

In [6]:
"""
Remove duplicate measurements before doing any data frame merging. 
Any measurement from the same patient on the same day (ignoring time) with the same weight should be considered a duplicate.
"""

import pandas as pd
import sqlite3
import os

# Connect to the database, load the measurements table
conn = sqlite3.connect(os.path.join(paper1_directory, "emotional_all_notna.sqlite"))
measurements = pd.read_sql_query("SELECT * FROM measurements_colclean", conn)

# Convert measurement_date to datetime, if not already in that format. 
# Add a temporary column with the measurement date only; time is ignored, 
# as repeated measurements are at least a few seconds or minutes apart. 
measurements['measurement_date'] = pd.to_datetime(measurements['measurement_date'])
measurements['measurement_date_date'] = measurements['measurement_date'].dt.date
# After, remove duplicates based on patient id, date, and weight. 
# Drop the temporary column. 
measurements_rowclean = measurements.drop_duplicates(subset=['patient_id', 'measurement_date_date', 'weight_kg'])
measurements_rowclean = measurements_rowclean.drop(columns=['measurement_date_date'])
# Save the cleaned measurements back to the database with the _rowclean name code
measurements_rowclean.to_sql("measurements_rowclean", conn, if_exists="replace", index=False)

print(f"Duplicate measurements removed. There are {len(measurements_rowclean)} measurements from {measurements_rowclean['patient_id'].nunique()} patients.")

Duplicate measurements removed. There are 35709 measurements from 1826 patients.


In [7]:
"""
Link metadata from the prescriptions table to measurements. 

The two dataframes are merged based on patient_id, creating a Cartesian product of the two tables, 
where every measurement from one patient is linked to every possible prescription from that patient.

This Cartesian product is then filtered based on the dates of both the measurement and the prescription, 
in order to, preferably, only consider a prescription being linked to a given measurement
if the measurement date is between the prescription's start and end dates. 

If a measurement is not within any prescription's validity period, 
there is a permissivity of 5 days, meaning that a measurement can be linked to a prescription if
it is within 5 days from the start or end date of the prescription.
If this allows a measurement to be linked to multiple prescriptions,
it is linked to the one it is closest to in date. 

If a measurement is not linked to any valid prescription, 
it is excluded from the outuput. 
"""

# Connect to the paper-specific database, load the prescriptions table, and make sure its date values are in datetime format
conn = sqlite3.connect(os.path.join(paper1_directory, "emotional_all_notna.sqlite"))
prescriptions = pd.read_sql_query("SELECT * FROM prescriptions_colclean", conn)
prescriptions['prescription_creation_date'] = pd.to_datetime(prescriptions['prescription_creation_date'])
prescriptions['prescription_validity_end_date'] = pd.to_datetime(prescriptions['prescription_validity_end_date'])

# Merge the measurements and prescriptions data frames on patient ID,
# creating the Cartesian product that needs further date-based filtering
merged = pd.merge(measurements_rowclean, prescriptions, on="patient_id", how="left", suffixes=('_meas', '_presc'))

# To execute date-based filtering: 
# First, define those measurements that are within the range of a prescription. 
# If any measurement can be assigned to a prescription based on this criteria, it will be. 
merged['measurement_in_prescription_range'] = (
    (merged['measurement_date'] >= merged['prescription_creation_date']) &
    (merged['measurement_date'] <= merged['prescription_validity_end_date'])
)
# If after this, a measurement is still not linked to any prescription due to not being in the range of any, 
# it will be linked to the prescription it is closest to, within a 5-day permissivity range. 
# For these out-of-range measurements, first, the distance from the start/end dates of any prescription is calculated. 
merged['days_before_prescription_start'] = (merged['prescription_creation_date'] - merged['measurement_date']).dt.days
merged['days_after_prescription_end'] = (merged['measurement_date'] - merged['prescription_validity_end_date']).dt.days
# After, near-range measurements are defined, 
# as measurements that are NOT within the range of any prescription, 
# AND they are at within 5 days before the start/after the end of any prescription. 
merged['measurement_near_prescription_range'] = (
    (~merged['measurement_in_prescription_range']) &
    (
        ((merged['days_before_prescription_start'] <= 5) & (merged['days_before_prescription_start'] > 0)) |
        ((merged['days_after_prescription_end'] <= 5) & (merged['days_after_prescription_end'] > 0))
    )
)
# After, a distance metric calculation determines how far a given measurement is from a prescription. 
# In-range measurements get a distance metric of 0,
# while out-of-range measurements get the minimum distance to any boundary they are close to. 
merged['measurement_distance_from_prescription_range'] = merged.apply(
    lambda row: 0 if row['measurement_in_prescription_range'] else min(max(row['days_before_prescription_start'], 0), max(row['days_after_prescription_end'], 0)),
    axis=1
)
# After defining the in-range and near-range logics, the database (currently containing Cartesian products) 
# is filtered to keep only in-or near-range measurements. 
# Any measurements not assigned to a prescription is lost. 
measurements_with_metadata = merged[merged['measurement_in_prescription_range'] | merged['measurement_near_prescription_range']].copy()
# In edge cases where multiple prescriptions are linked to a single measurement, only the closest match is kept. 
# This is done by sorting the data frame by patient id, measurement date and distance from range, 
# and if multiple measurement-prescription pairs from the same patient on the same date are found, 
# duplicates are removed and only the row with the smallest distance from range is kept. 
measurements_with_metadata = measurements_with_metadata.sort_values(['patient_id', 'measurement_date', 'measurement_distance_from_prescription_range'])
measurements_with_metadata = measurements_with_metadata.drop_duplicates(['patient_id', 'measurement_date'])

# After filtering the data frame, columns are reordered, and any irrelevant ones, like prescribed supplements, are dropped. 
column_order = [
    'patient_id',
    'medical_record_id',
    'prescription_id',
    'measurement_date',
    'prescription_creation_date',
    'prescription_validity_end_date',
    'prescription_validity_days',
    'method',
    'step',
    'weight_kg',
    'bmi',
    'bmr_kcal',
    'fat_%',
    'vat_%',
    'muscle_%',
    'water_%',
    'measurement_in_prescription_range',
    'days_before_prescription_start',
    'days_after_prescription_end',
    'measurement_near_prescription_range',
    'measurement_distance_from_prescription_range'
]
measurements_with_metadata = measurements_with_metadata[column_order]

# The measurements_with_metadata data frame is saved within the SQL database, and some summary info is printed. 
measurements_with_metadata.to_sql("measurements_with_metadata", conn, if_exists="replace", index=False)

print(f"Measurements are linked to their corresponding prescriptions and medical records. \n"
    f"There are a total of {measurements_with_metadata.shape[0]} measurements "
    f"from {measurements_with_metadata['medical_record_id'].nunique()} medical records "
    f"of {measurements_with_metadata['patient_id'].nunique()} patients.")

Measurements are linked to their corresponding prescriptions and medical records. 
There are a total of 20976 measurements from 1678 medical records of 1664 patients.


#### Add sex and baseline/final weight data to medical records

In an effort to create data frames containing the most possible information in one place, the medical records data frame is completed with the sex (originally stored in Patients) as well as the baseline and final weight data (measurements linked to medical records stored in measurements_with_metadata) of patients. 

Besides executing these merge operations, the code checks the time passed between baseline and final measurements in each medical record, along with whether the measurements are close to the beginning/end date of the medical record they belong to or not. This helps checking whether the length of the actual followup is similar to that of the medical record or not. 

Any medical record that has no associated measurements is lost here. 

In [8]:
"""
Complete Medical records by adding sex and baseline/final weight data to it. 

Sex is fetched from the Patients table, based on the patient_id.

Baseline and final weight measurements are obtained from the measurements_with_metadata table created in the previous step. 
The logic is the following: 
Measurements are grouped by patient and medical record ID, and the first and last measurements of each group are assigned
to the medical records table as baseline and final measurements, respectively.
Delta weight is calculated as the difference between final and baseline weights, to obtain negative results. 

Measurement dates are added and it is checked if they are within the medical record creation and closing dates.

If a medical record has no measurements linked to it, it is dropped. 

Additionally, the 'days_between_measurements' column is added to calculate the number of days between the baseline and final measurements.

Finally, the columns are reordered to match the desired order.
"""

import pandas as pd
import sqlite3

# Connect to the database, load relevant tables
conn = sqlite3.connect(os.path.join(paper1_directory, "emotional_all_notna.sqlite"))
medical_records = pd.read_sql_query("SELECT * FROM medical_records_colclean", conn)
patients = pd.read_sql_query("SELECT * FROM patients_colclean", conn)
measurements_with_metadata = pd.read_sql_query("SELECT * FROM measurements_with_metadata", conn)

# The following functions complete the original medical records data frame with research-relevant variables.
# First, add the sex variable to medical_records_complete by merging patients' sex into medical_records_complete based on patient_id
"""
Adding sex data to medical records
"""
medical_records_complete = pd.merge(
    medical_records,
    patients[['patient_id', 'sex']],
    on='patient_id',
    how='left'
)

# After, add baseline and final measurements to medical_records_complete
# Treat measurements coming from a given medical record as units
# by grouping measurements_with_metadata by patient_id and medical_record_id 
"""
Adding weight data to medical records
"""
grouped_measurements = measurements_with_metadata.groupby(['patient_id', 'medical_record_id'])
# Extract the first (baseline) and last (final) measurement for each group
baseline = grouped_measurements.first().reset_index()
final = grouped_measurements.last().reset_index()
# Insert baseline and final measurements into medical_records_complete
medical_records_complete = pd.merge(
    medical_records_complete,
    baseline[['patient_id', 'medical_record_id', 'measurement_date', 'weight_kg', 'bmi']],
    on=['patient_id', 'medical_record_id'],
    how='left'
)
medical_records_complete = pd.merge(
    medical_records_complete,
    final[['patient_id', 'medical_record_id', 'measurement_date', 'weight_kg', 'bmi']],
    on=['patient_id', 'medical_record_id'],
    how='left',
    suffixes=('_baseline', '_final')
)
# Make sure all dates are in datetime format for further operations, 
# and calculate delta weight and delta BMI values (final - baseline, so the resulting weight loss value is negative)
medical_records_complete['medical_record_creation_date'] = pd.to_datetime(medical_records_complete['medical_record_creation_date'])
medical_records_complete['medical_record_closing_date'] = pd.to_datetime(medical_records_complete['medical_record_closing_date'])
medical_records_complete['measurement_date_baseline'] = pd.to_datetime(medical_records_complete['measurement_date_baseline'])
medical_records_complete['measurement_date_final'] = pd.to_datetime(medical_records_complete['measurement_date_final'])
medical_records_complete['delta_weight_kg'] = medical_records_complete['weight_kg_final'] - medical_records_complete['weight_kg_baseline']
medical_records_complete['delta_bmi'] = medical_records_complete['bmi_final'] - medical_records_complete['bmi_baseline']
# Check if the baseline and final measurements are close to the starting/closing date of the medical record they belong to or not (within a 10-day window). 
# In some cases, the first measurement is recorded weeks after opening the medical record, or the last one is taken long before closing it. 
# In other cases, the medical record's closing date is absent, if this happens, the last measurement will be considered as out of range. 
# This is supposed to help identify cases where the followup has some imperfections. 
window_days = 10
medical_records_complete['baseline_measurement_inrange'] = (
    (medical_records_complete['measurement_date_baseline'] >= 
     medical_records_complete['medical_record_creation_date'] - pd.Timedelta(days=window_days)) &
    (medical_records_complete['measurement_date_baseline'] <=
     medical_records_complete['measurement_date_baseline'] + pd.Timedelta(days=window_days))
)
medical_records_complete['final_measurement_inrange'] = (
    (medical_records_complete['measurement_date_final'] >= 
     medical_records_complete['medical_record_closing_date'] - pd.Timedelta(days=window_days)) &
    (medical_records_complete['measurement_date_final'] <= 
     medical_records_complete['medical_record_closing_date'] + pd.Timedelta(days=window_days))
)
# Add a column that calculates the days passed between baseline and final measurements
# This also helps identify cases where the medical record's duration and the actual followup time are very different
medical_records_complete['days_between_measurements'] = (
    (medical_records_complete['measurement_date_final'] - medical_records_complete['measurement_date_baseline']).dt.days
)

"""
Removing medical records with no associated measurements
"""
# As for some reason (unidentified as of 16Apr25) many medical records have no available measurements associated to them, 
# any such instances are dropped from the data frame. 
medical_records_complete = medical_records_complete.dropna(subset=['weight_kg_baseline', 'weight_kg_final'])

"""
Presenting and saving the output
"""
# Rename and reorder columns for better clarity and interpretability
medical_records_complete = medical_records_complete.rename(columns={
    'measurement_date_baseline': 'baseline_measurement_date',
    'measurement_date_final': 'final_measurement_date',
    'weight_kg_baseline': 'baseline_weight_kg',
    'weight_kg_final': 'final_weight_kg', 
    'bmi_baseline': 'baseline_bmi',
    'bmi_final': 'final_bmi'
})
desired_column_order = [
    'patient_id',
    'medical_record_id',
    'medical_record_creation_date',
    'medical_record_closing_date',
    'intervention_duration_days',
    'baseline_measurement_date',
    'final_measurement_date',
    'days_between_measurements',
    'baseline_measurement_inrange',
    'final_measurement_inrange',
    'birth_date',
    'age',
    'age_when_creating_record',
    'sex',
    'height_m',
    'baseline_weight_kg',
    'final_weight_kg',
    'delta_weight_kg',
    'baseline_bmi',
    'final_bmi',
    'delta_bmi',
    'wc_cm_confirm_time',
    'pnk_method',
    'orders_in_medical_record',
    'dietitian_visits',
    'physical_activity',
    'physical_activity_frequency',
    'physical_inactivity_cause',
    'weight_gain_cause',
    'smoking',
    'medications',
    'hunger',
    'satiety',
    'emotional_eating',
    'emotional_eating_value',
    'quantity_control',
    'impulse_control'
]
medical_records_complete = medical_records_complete[desired_column_order]
# Save the complete medical records to the SQL database, and print a summary statement
medical_records_complete.to_sql("medical_records_complete", conn, if_exists="replace", index=False)
print(f"Medical records table completed with sex and baseline/final weight data. \n" 
      f"There are {len(medical_records_complete)} records available from {medical_records_complete['patient_id'].nunique()} patients.")

Medical records table completed with sex and baseline/final weight data. 
There are 1678 records available from 1664 patients.


#### Create the base input for survival analysis

Here, data frames specifically prepared for survival analysis are created. Time-to-event (days) of achieving 3 different weight loss targets (5-10-15%) in 3 different time frames (40-60-80 days) is analyzed. Relevant demographic, anthropometric and eating behavior variables are added to each analyzed medical record. 

In [15]:
import pandas as pd
import os
import sqlite3
from datetime import timedelta
# Removed: import logging

"""
CONFIGURATION
"""
# Define directories and database paths - paper1_directory should be defined 
# in the first cell of this notebook chapter
input_db_path = os.path.join(paper1_directory, 'emotional_all_notna.sqlite')
input_measurements = "measurements_with_metadata"
input_medical_records = "medical_records_complete"
output_db_path = os.path.join(paper1_directory, 'survival_analysis.sqlite')

# Define analysis parameters
weight_loss_targets = [5, 10, 15]     # Weight loss target percentages
time_windows = [40, 60, 80]       # Time windows (centers) in days
window_span = 10                   # Permissible span around windows (+/- days)

# Define the variables stored in medical_records_complete that are relevant for the analysis. 
# These include basic metadata like patient and record ID,
# basic factors such as age and sex, 
# as well as the emotional and eating behavior variables pivotal to the research question. 
# The list can be amended on demand - 
# for example, right now it does not include medical record creating and closing dates. 
relevant_medical_values = ['patient_id', 'medical_record_id', 'sex', 'age',
                             'height_m', 'baseline_bmi', 'hunger', 'satiety', 'emotional_eating',
                             'emotional_eating_value', 'quantity_control', 'impulse_control']

"""
DATA LOADING & PREPARATION
"""

def load_measurements(connection):
    """
    Load measurements from the measurement_with_metadata table; 
    make sure key values are in the correct format. 
    """
    query = f"SELECT * FROM {input_measurements}"
    measurements = pd.read_sql_query(query, connection)
    measurements['measurement_date'] = pd.to_datetime(measurements['measurement_date'], errors='coerce')
    measurements['weight_kg'] = pd.to_numeric(measurements['weight_kg'], errors='coerce')
    return measurements

def load_medical_records(connection):
    """
    Load medical records from the medical_records_complete table;
    make sure date values are in datetime format. 
    The exact columns to be used are defined in the prepare_patient_data function.
    """
    query = f"SELECT * FROM {input_medical_records}"
    medical_records = pd.read_sql_query(query, connection)
    medical_records['medical_record_creation_date'] = pd.to_datetime(medical_records['medical_record_creation_date'], errors='coerce')
    return medical_records

def prepare_patient_data(measurements, medical_records):
    """
    Filter measurements to only include those from the earliest medical record for each patient.
    Merge measurements with relevant medical record data, including the pivotal eating behavior scores. 
    """
    # Filter measurements to only include those from the first treatment record of each patient
    earliest_records_with_data = measurements.sort_values('measurement_date')\
        .groupby('patient_id')['medical_record_id']\
        .first()\
        .reset_index()
    filtered_measurements = pd.merge(
        measurements,
        earliest_records_with_data,
        on=['patient_id', 'medical_record_id'],
        how='inner'
    )
    # Identify the baseline measurement in each record
    baseline_data = filtered_measurements.sort_values('measurement_date')\
                                       .groupby(['patient_id', 'medical_record_id'])\
                                       .first()\
                                       .reset_index()

    cols_to_select = [col for col in relevant_medical_values if col in medical_records.columns]
    medical_record_data = medical_records[cols_to_select]
    # Merge baseline measurements with relevant medical record data
    prepared_data = pd.merge(
        baseline_data,
        medical_record_data,
        on=['patient_id', 'medical_record_id'],
        how='left' # Keep all baseline data
    )
    return prepared_data, filtered_measurements

"""
CALCULATE WEIGHT LOSS OUTCOMES
"""

def _get_patient_baseline(patient_data, patient_id, medical_record_id):
    """
    Get the baseline data for each patient's corresponding medical record.
    """
    patient_baseline = patient_data[
        (patient_data['patient_id'] == patient_id) &
        (patient_data['medical_record_id'] == medical_record_id)
    ]
    if len(patient_baseline) == 0:
        print(f"WARN: No baseline data found for patient {patient_id}, record {medical_record_id}. Skipping.")
        return None
    return patient_baseline.iloc[0]

def _check_target_achievement(measurements_within_window, baseline_weight, weight_loss_target):
    """
    Check if the weight loss target was achieved in some of the given measurements.
    """
    # Set default to False/None
    target_achieved = False
    first_success_measurement = None
    # Calculate weight loss percentage for each measurement in the window, 
    # and check if it meets the target
    for _, row in measurements_within_window.iterrows():
        current_weight = row['weight_kg']
        if baseline_weight is not None and baseline_weight > 0:
            current_weight_loss = ((baseline_weight - current_weight) / baseline_weight) * 100
            if round(current_weight_loss, 2) >= weight_loss_target:
                target_achieved = True
                first_success_measurement = row
                break # Stop at the first success; if that is not identified, target_achieved remains False as by default
    return target_achieved, first_success_measurement

def _determine_final_measurement(target_achieved, first_success_row, measurements_around_cutoff,
                                measurements_within_window, baseline_date, window_center):
    """
    Determine the final measurement based on success or censoring (ie. completion without success) rules.
    """
    # Set final measurement to None by default
    final_measurement = None
    # Set target final date based on the given time window
    target_date = baseline_date + timedelta(days=window_center)
    # If weight loss target was achieved at any point of the followup time window,
    # use the first success measurement as the final measurement.  
    if target_achieved:
        final_measurement = first_success_row
    # In case of no success, the date closest to the target date is used as the final measurement. 
    elif not measurements_around_cutoff.empty:
        measurements_around_cutoff = measurements_around_cutoff.copy()
        measurements_around_cutoff['distance_to_center'] = abs(
            (measurements_around_cutoff['measurement_date'] - target_date).dt.days
        )
        closest_measurement_idx = measurements_around_cutoff['distance_to_center'].idxmin()
        final_measurement = measurements_around_cutoff.loc[closest_measurement_idx]
    # In case of no success nor completion (delayed dropout), use the last available measurement as the final measurement
    elif not measurements_within_window.empty:
        final_measurement = measurements_within_window.sort_values('measurement_date').iloc[-1]
    # Else: Instant dropout, final_measurement remains None, 
    # and is set to the baseline measurementin the calculate_outcome_metrics function.
    return final_measurement

def _calculate_outcome_metrics(baseline_row, final_measurement_row):
    """
    Calculate follow-up lenght and weight loss percentage based on baseline and final measurement.
    """
    # Identify the baseline measurement
    baseline_date = baseline_row['measurement_date']
    baseline_weight = baseline_row['weight_kg']
    # In patients that have at least one followup measurement, identify the end date and final weight, 
    # to calculate followup length and weight loss in kg and %
    if final_measurement_row is not None:
        end_date = final_measurement_row['measurement_date']
        final_weight = final_measurement_row['weight_kg']
        followup_period = (end_date - baseline_date).days
        weight_loss_kg = baseline_weight - final_weight
        weight_loss_pct = ((baseline_weight - final_weight) / baseline_weight) * 100
    # In patients that have no followup measurement (instant dropouts), 
    # the end date and final weight are set to the baseline values, 
    # and followup length and weight loss are set to 0. 
    else: 
        end_date = baseline_date
        final_weight = baseline_weight
        followup_period = 0
        weight_loss_kg = 0
        weight_loss_pct = 0
    return {
        'end_date': end_date,
        'final_weight': final_weight,
        'followup_period': followup_period,
        'weight_loss_kg': weight_loss_kg,
        'weight_loss_pct': round(weight_loss_pct, 2)
    }

"""
CORE ANALYSIS FUNCTION
"""

def calculate_weight_loss_outcome(patient_data, filtered_measurements, weight_loss_target, window_center, window_span):
    """
    Calculate weight loss outcomes for each patient in a survival analysis-ready format. 
    """
    # Initialize an empty list to store results, and group measurements by patient and medical record ID
    results = []
    grouped_measurements = filtered_measurements.groupby(['patient_id', 'medical_record_id'])
    # Iterate through each group within measurements. 
    for (patient_id, medical_record_id), group in grouped_measurements:
        # 1. Identify baseline measurement date and weight
        baseline_row = _get_patient_baseline(patient_data, patient_id, medical_record_id)
        if baseline_row is None: continue
        baseline_date = baseline_row['measurement_date']
        baseline_weight = baseline_row['weight_kg']
        # 2. Define observation time windows and group measurements within the defined window
        # Calculations are done for both the complete observation period, 
        # as well as the period strictry around the cutoff date, within the defined permissivity window. 
        min_window_date = baseline_date + timedelta(days=(window_center - window_span))
        max_window_date = baseline_date + timedelta(days=(window_center + window_span))
        measurements_within_window = group[
            (group['measurement_date'] > baseline_date) &
            (group['measurement_date'] <= max_window_date)
        ].sort_values('measurement_date')
        measurements_around_cutoff = group[
            (group['measurement_date'] >= min_window_date) &
            (group['measurement_date'] <= max_window_date)
        ]
        # 3. Check whether target weight loss was achieved in the defined time window
        target_achieved, first_success_row = _check_target_achievement(
            measurements_within_window, baseline_weight, weight_loss_target
        )
        # 4. Identify the last measurement date within the time window,
        # whether based on target achievment or followup completion
        final_measurement_row = _determine_final_measurement(
            target_achieved, first_success_row, measurements_around_cutoff,
            measurements_within_window, baseline_date, window_center
        )
        # 5. Check for dropout status - instant dropouts are those who have no second measurement, 
        # while delayed dropouts are those who have not reached target, 
        # and their final measurement is before the cutoff window. 
        is_instant_dropout = final_measurement_row is None
        is_delayed_dropout = (not target_achieved and
                              final_measurement_row is not None and
                              final_measurement_row['measurement_date'] < min_window_date)
        dropout = is_instant_dropout or is_delayed_dropout
        success = target_achieved
        # 6. Calculate metrics like final date and weight, followup length and weight lost. 
        outcome_metrics = _calculate_outcome_metrics(baseline_row, final_measurement_row)

        """ARE WE GOING TO MODIFY AVG CALCS?"""

        # # 7. 
        # # --- NEW: Calculate metrics based *always* on the last measurement within the window ---
        # actual_last_measurement_row = None
        # if not measurements_within_window.empty:
        #     actual_last_measurement_row = measurements_within_window.iloc[-1]

        # # Use the same helper, but pass the actual last measurement row
        # actual_end_metrics = _calculate_outcome_metrics(baseline_row, actual_last_measurement_row)
        # actual_wl_pct_at_window_end = actual_end_metrics['weight_loss_pct']
        # # --- End NEW ---



        # 8. Assemble the result - this is where the output tables' columns are defined. 
        # If additional variables are inserted at an earlier part of the code, 
        # they need to be mentioned here as well. 
        result = {
            'patient_id': patient_id,
            'medical_record_id': medical_record_id,
            'baseline_date': baseline_date,
            'end_date': outcome_metrics['end_date'],
            'followup_period': outcome_metrics['followup_period'],
            'baseline_weight': baseline_weight,
            'final_weight': outcome_metrics['final_weight'],
            'weight_loss_kg': outcome_metrics['weight_loss_kg'],
            'weight_loss_pct': outcome_metrics['weight_loss_pct'],
            # 
            f'{weight_loss_target}pct_achieved': success,
            'dropout': dropout,
            # Add baseline characteristics safely using .get()
            'sex': baseline_row.get('sex'),
            'age': baseline_row.get('age'),
            'height_m': baseline_row.get('height_m'),
            'baseline_bmi': baseline_row.get('bmi'),
            'hunger': baseline_row.get('hunger'),
            'satiety': baseline_row.get('satiety'),
            'emotional_eating': baseline_row.get('emotional_eating'),
            'emotional_eating_value': baseline_row.get('emotional_eating_value'),
            'quantity_control': baseline_row.get('quantity_control'),
            'impulse_control': baseline_row.get('impulse_control')
        }
        results.append(result)
    return pd.DataFrame(results)


"""
MAIN ORCHESTRATION FUNCTION
"""

def generate_survival_analysis_datasets(input_connection, output_connection, weight_loss_targets, time_windows, window_span=10):
    """
    The main function to orchestrate the survival analysis process, calling all previously defined functions in an organized manner. 
    Generate survival analysis datasets for multiple weight loss targets and observation time windows.
    Targets and timeframes are defined in the configuration section at the beginning of the code module.
    Save data to a separate SQLite database. 
    """
    # 1. Load and prepare input data
    measurements = load_measurements(input_connection)
    medical_records = load_medical_records(input_connection)
    patient_data, filtered_measurements = prepare_patient_data(measurements, medical_records)
    if patient_data.empty:
        print("ERROR: Prepared patient data is empty. Cannot proceed.")
        return {}, pd.DataFrame()
    # 2. Calculate weight loss outcomes for each target-timeframe combination. 
    # Targets and timeframes are defined in the config section of the script. 
    # Initialize a results dictionary and a list for summary statistics. 
    results = {}
    summary_list = []
    for window in sorted(time_windows):
        for target in sorted(weight_loss_targets):
            # Name each instance accordingly, where sa stands for survival analysis, 
            # and the numbers indicate the time window and target percentage.
            name = f"sa_{window}d_{target}p"
            print(f"--- Processing: {name} ---") # Minimal progress indication
            result_df = calculate_weight_loss_outcome(
                patient_data,
                filtered_measurements,
                target,
                window,
                window_span # Defined in config - the permissivity window around the followup cutoff time
            )
            results[name] = result_df
            # Add the calculated instances to the summary statistics list. 
            if not result_df.empty:
                summary_row = {
                    'analysis_name': name,
                    'weight_loss_target': target,
                    'time_window': window,
                    'total_patients': len(result_df),
                    'achieved_target': int(result_df[f'{target}pct_achieved'].sum()),
                    'dropout_count': int(result_df['dropout'].sum()),
                    'avg_weight_loss_pct': result_df['weight_loss_pct'].mean() if not result_df['weight_loss_pct'].isnull().all() else 0
                }
                summary_list.append(summary_row)
            else:
                 print(f"WARN: No results generated for {name}. Skipping summary entry.")
    # Turn the summary statistics list into a data frame
    summary = pd.DataFrame(summary_list)

    # 3. Save the analysis results (9 tables by default) to the SQLite database defined in the config section
    print(f"--- Saving results to output database: {output_db_path} ---")
    # Save individual tables
    for name, df in results.items():
        print(f"Saving table: {name} ({len(df)} rows)")
        df.to_sql(name, output_connection, if_exists='replace', index=False)
    # Save the summary stats table in the database as well
    print(f"Saving summary table: survival_analysis_summary ({len(summary)} rows)")
    summary.to_sql('survival_analysis_summary', output_connection, if_exists='replace', index=False)
    output_connection.commit() # Ensure changes are saved
    print("--- All results saved successfully ---")
    return results, summary

"""
EXECUTION BLOCK
"""

"""
This part of the code calls all the functions and executes the code. 
Currently it has a lot of debug messages and error handling, which might be an overkill, 
but overall, it should not affect transparency of the code.
"""

if __name__ == "__main__":
    print("========== Starting Survival Analysis Script ==========")
    # By default, connections are set to None, and will be established in the try block.
    input_conn = None
    output_conn = None
    try:
        # Connect to in-and output databases
        print(f"Connecting to input database: {input_db_path}")
        if not os.path.exists(input_db_path):
             raise FileNotFoundError(f"Input database not found at {input_db_path}")
        input_conn = sqlite3.connect(input_db_path)
        print(f"Connecting to output database: {output_db_path}")
        output_conn = sqlite3.connect(output_db_path)
        # Run the main analysis function
        results, summary = generate_survival_analysis_datasets(
            input_conn,
            output_conn,
            weight_loss_targets,
            time_windows,
            window_span
        )
        # Display summary if successful
        if not summary.empty:
            print("\n--- Survival Analysis Summary ---")
            print(summary.to_string()) # Use print for console display
            print("--- End Summary ---")
        else:
            print("WARN: Analysis completed, but the summary table is empty.")
        print(f"Analysis data successfully generated and saved to {output_db_path}")

    # Minimal error handling for critical failures
    except FileNotFoundError as e:
        print(f"ERROR: Database file not found - {e}")
    except sqlite3.Error as e:
        print(f"ERROR: SQLite database error - {e}")
    except ValueError as e:
        print(f"ERROR: Data processing error - {e}")
    except Exception as e:
        print(f"ERROR: An unexpected error occurred - {e}")
        # Consider adding traceback for debugging complex errors:
        # import traceback
        # print(traceback.format_exc())
    finally:
        # Ensure connections are closed
        print("Closing database connections...")
        if input_conn:
            input_conn.close()
        if output_conn:
            output_conn.close()
        print("========== Survival Analysis Script Finished ==========")


Connecting to input database: C:\Users\Felhasználó\Desktop\Projects\PNK_DB2\paper1_emotional\emotional_all_notna.sqlite
Connecting to output database: C:\Users\Felhasználó\Desktop\Projects\PNK_DB2\paper1_emotional\survival_analysis.sqlite
--- Processing: sa_40d_5p ---
--- Processing: sa_40d_10p ---
--- Processing: sa_40d_15p ---
--- Processing: sa_60d_5p ---
--- Processing: sa_60d_10p ---
--- Processing: sa_60d_15p ---
--- Processing: sa_80d_5p ---
--- Processing: sa_80d_10p ---
--- Processing: sa_80d_15p ---
--- Saving results to output database: C:\Users\Felhasználó\Desktop\Projects\PNK_DB2\paper1_emotional\survival_analysis.sqlite ---
Saving table: sa_40d_5p (1664 rows)
Saving table: sa_40d_10p (1664 rows)
Saving table: sa_40d_15p (1664 rows)
Saving table: sa_60d_5p (1664 rows)
Saving table: sa_60d_10p (1664 rows)
Saving table: sa_60d_15p (1664 rows)
Saving table: sa_80d_5p (1664 rows)
Saving table: sa_80d_10p (1664 rows)
Saving table: sa_80d_15p (1664 rows)
Saving summary table: su