In [None]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load variables from the project's .env file
load_dotenv(dotenv_path='/home/jovyan/work/Medicina/DuETT - DP/DialysisDuETT/.env')

# Names of the environment variables that hold each dataset's pickle file path
target_keys = ['ANALITICAS', 'PACIENTES']

# Map every key that is set to its path; a key that is not set only produces
# a warning and is left out of the mapping
target_paths = {}
for key in target_keys:
    path = os.getenv(key)
    if path is not None:
        target_paths[key] = path
        continue
    print(f"Warning: Environment variable {key} not found in the .env file.")

# Read each configured pickle into a DataFrame, keyed by dataset name.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
dataframes = {}
for key, pkl_path in target_paths.items():
    # Guard clause: a path that does not exist is reported and skipped
    if not os.path.exists(pkl_path):
        print(f"Pickle file for '{key}' not found at '{pkl_path}'.")
        continue
    try:
        df = pd.read_pickle(pkl_path)
        dataframes[key] = df
        print(f"Loaded DataFrame for '{key}' from '{pkl_path}'.")
    except Exception as err:
        # A failed load is reported; the loop carries on with the remaining datasets
        print(f"Error loading pickle file for '{key}' from '{pkl_path}': {err}")

# Extract the two required DataFrames (None when the corresponding load did not succeed)
ANALITICAS = dataframes.get('ANALITICAS')
PACIENTES = dataframes.get('PACIENTES')

# Abort by raising instead of calling `exit(1)`: `exit` is an interactive helper
# that should not be used in programs, and inside a notebook kernel it is not a
# dependable way to stop the running cell -- execution could fall through to the
# FECHA conversion below and fail with an unrelated TypeError on None.
missing_frames = [
    name
    for name, frame in (('ANALITICAS', ANALITICAS), ('PACIENTES', PACIENTES))
    if frame is None
]
if missing_frames:
    raise RuntimeError(
        "Error: Required dataframes not loaded: " + ", ".join(missing_frames)
    )

# Ensure the FECHA column is in datetime format
ANALITICAS['FECHA'] = pd.to_datetime(ANALITICAS['FECHA'])

## Filter out patients with avg_diff_days > 80 (keep only those with an average interval <= 80 days)

In [None]:

# Largest average gap, in days, between consecutive measurements that a
# patient may have and still be kept
MAX_AVG_INTERVAL_DAYS = 80

# Sort by patient and date so that diff() below compares consecutive measurements
ANALITICAS.sort_values(['REGISTRO', 'FECHA'], inplace=True)

# Whole days elapsed since the same patient's previous measurement
# (NaN on each patient's first row, which has no predecessor)
ANALITICAS['diff_days'] = ANALITICAS.groupby('REGISTRO')['FECHA'].diff().dt.days

# Mean interval per patient; NaN gaps are skipped by mean(). A patient with a
# single measurement has no gaps at all, so their mean is NaN.
patient_avg_interval = ANALITICAS.groupby('REGISTRO')['diff_days'].mean()

# Keep patients whose mean interval is at most the threshold. A NaN mean
# compares False, so single-measurement patients are dropped here as well.
keep_mask = patient_avg_interval <= MAX_AVG_INTERVAL_DAYS
patients_to_keep = patient_avg_interval[keep_mask].index.tolist()

# Create filtered dataframes as independent copies of the matching rows
PACIENTES_f = PACIENTES[PACIENTES['REGISTRO'].isin(patients_to_keep)].copy()
ANALITICAS_f = ANALITICAS[ANALITICAS['REGISTRO'].isin(patients_to_keep)].copy()

# Save the filtered dataframes as pickle files in the same directory as the
# original PACIENTES pickle
output_dir = os.path.dirname(target_paths['PACIENTES'])
pacientes_f_path = os.path.join(output_dir, 'PACIENTES_f.pkl')
analiticas_f_path = os.path.join(output_dir, 'ANALITICAS_f.pkl')

PACIENTES_f.to_pickle(pacientes_f_path)
ANALITICAS_f.to_pickle(analiticas_f_path)


def _kept_pct(kept, total):
    """Return kept/total as a percentage, or 0.0 when total is zero.

    The zero guard keeps the summary from raising ZeroDivisionError when an
    input DataFrame is empty.
    """
    return kept / total * 100 if total else 0.0


print("\nSummary:")
print(f"Original PACIENTES: {len(PACIENTES)} rows")
print(f"Filtered PACIENTES_f: {len(PACIENTES_f)} rows ({_kept_pct(len(PACIENTES_f), len(PACIENTES)):.2f}%)")
print(f"Original ANALITICAS: {len(ANALITICAS)} rows")
print(f"Filtered ANALITICAS_f: {len(ANALITICAS_f)} rows ({_kept_pct(len(ANALITICAS_f), len(ANALITICAS)):.2f}%)")