## Load data from pkl

In [None]:
import os
import pandas as pd
from IPython.display import display, HTML
from dotenv import load_dotenv

# ---------------------------
# Step 1: Load environment variables
# ---------------------------
# The .env file maps each dataset key below to the path of its pickle file.
load_dotenv(dotenv_path='/home/jovyan/work/Medicina/DuETT - DP/DialysisDuETT/.env')

# ---------------------------
# Step 2: Define target keys and retrieve pickle file paths from .env
# ---------------------------
target_keys = ['ANALITICAS', 'ANEMIA', 'CATETER', 'CINETICA', 
               'EXTEMP', 'INGRESOS', 'MOM', 'PACIENTES', 'PERITONITIS']

# Retrieve paths from the environment variables; keys without a value are
# reported and skipped rather than aborting the whole cell.
target_paths = {}
for key in target_keys:
    path = os.getenv(key)
    if path is None:
        print(f"Warning: Environment variable {key} not found in the .env file.")
    else:
        target_paths[key] = path

# ---------------------------
# Step 3: Load the pickle files into DataFrames
# ---------------------------
levantedp = {}  # Dictionary to hold the loaded DataFrames

# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file.
# Only point the .env entries at pickle files from a trusted source.
for key, pkl_path in target_paths.items():
    if os.path.exists(pkl_path):
        try:
            df = pd.read_pickle(pkl_path)
            levantedp[key] = df
            print(f"Loaded DataFrame for '{key}' from '{pkl_path}'.")
        except Exception as e:
            # Best-effort load: report the failure and continue with the other files.
            print(f"Error loading pickle file for '{key}' from '{pkl_path}': {e}")
    else:
        print(f"Pickle file for '{key}' not found at '{pkl_path}'.")

# ---------------------------
# Step 4: Unpack DataFrames into global variables
# ---------------------------
# Each successfully loaded object becomes a notebook-level variable named
# after its key (e.g. ANALITICAS, PACIENTES). Keys that failed to load get
# no variable at all.
for key, df in levantedp.items():
    globals()[key] = df

# ---------------------------
# Step 5: General adjustments
# ---------------------------

# Ensure the FECHA column is in datetime format.
# Guarded because ANALITICAS only exists when its pickle was loaded above;
# without the guard a missing file surfaces here as an opaque NameError.
if 'ANALITICAS' in levantedp:
    ANALITICAS['FECHA'] = pd.to_datetime(ANALITICAS['FECHA'])
else:
    print("Warning: 'ANALITICAS' was not loaded; skipping FECHA datetime conversion.")


In [None]:
# Pandas display configuration: never truncate rows, columns, or cell
# contents, and allow output lines up to 1000 characters wide.
_display_settings = (
    ('display.max_rows', None),      # show all rows
    ('display.max_columns', None),   # show all columns
    ('display.width', 1000),         # output width
    ('display.max_colwidth', None),  # do not truncate column content
)
for _option, _value in _display_settings:
    pd.set_option(_option, _value)


## Missing values by column

In [None]:
# Define a function to report missing values
def missing_values_report(df, df_name):
    """
    Display a per-column summary of missing data for a DataFrame.

    Cells holding an empty or whitespace-only string are counted as missing,
    alongside regular null values. The input DataFrame is not modified.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the printed report header.

    Returns:
        None. Prints a header line, renders the summary table as HTML, and
        prints a trailing blank line.
    """
    # Work on a copy in which blank strings have been turned into NA
    blanks_as_na = df.replace(r'^\s*$', pd.NA, regex=True)
    total_rows = len(blanks_as_na)

    # Absolute count of missing cells per column, then its share of all rows
    na_counts = blanks_as_na.isnull().sum()
    na_share = (na_counts / total_rows * 100).round(2)

    summary = na_counts.to_frame('Missing Values')
    summary['Percentage (%)'] = na_share

    header = f"Missing Values Report for '{df_name}' (Total Rows: {total_rows}):"
    print(header)
    display(HTML(summary.to_html()))
    print("\n")

# Print the missing values report for ANALITICAS and PACIENTES, in that order;
# a table that was not loaded into levantedp is reported instead of analysed.
for report_name in ("ANALITICAS", "PACIENTES"):
    if report_name not in levantedp:
        print(f"DataFrame for '{report_name}' not found in levantedp.")
        continue
    missing_values_report(levantedp[report_name], report_name)

## Average interval (days) between measurements

In [None]:
# Order the measurements chronologically within each patient. The sort is done
# in place, so the ANALITICAS object itself is reordered.
ANALITICAS.sort_values(by=['REGISTRO', 'FECHA'], inplace=True)

# Whole days elapsed since the same patient's previous measurement
# (a patient's first measurement has no predecessor, so its value is NaN)
fecha_by_patient = ANALITICAS.groupby('REGISTRO')['FECHA']
ANALITICAS['diff_days'] = fecha_by_patient.diff().dt.days

# Attach each patient's CENTRO from PACIENTES; the left join keeps every measurement
centro_lookup = PACIENTES[['REGISTRO', 'CENTRO']]
ANALITICAS_merged = pd.merge(ANALITICAS, centro_lookup, how='left', on='REGISTRO')

# 1. Mean interval per patient (mean() skips NaN values)
patient_avg = ANALITICAS_merged.groupby('REGISTRO')['diff_days'].mean()

# 2. Mean interval over all valid differences
general_avg = ANALITICAS_merged['diff_days'].mean()

# 3-5. Per-CENTRO statistics: mean interval, number of distinct patients,
#      and total number of measurements
by_centro = ANALITICAS_merged.groupby('CENTRO')
centro_avg = by_centro['diff_days'].mean()
unique_patients_per_centro = by_centro['REGISTRO'].nunique()
measurements_per_centro = by_centro['REGISTRO'].count()

# Gather the per-CENTRO statistics into a single table for readability
centro_columns = {
    'Average Interval (days)': centro_avg,
    'Unique Patients': unique_patients_per_centro,
    'Total Measurements': measurements_per_centro,
}
centro_results = pd.DataFrame(centro_columns).reset_index()

# Output the results
print("General average (days):", general_avg)
print("\nAverage for every CENTRO (days), Unique Patients, and Total Measurements:")
print(centro_results)