# View Plots

This notebook loads the PNG plots generated by `Work life balance vs longevity.py` from the `plots/` folder, displays them, and provides short, automated interpretations where possible.

If you moved the `plots/` directory or the dataset, update the `plots_dir` and `data_path` variables in the first code cell below.

In [None]:
# Import required libraries
from IPython.display import Image, display, HTML
import glob
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image as PILImage

# Configuration and helper functions
plots_dir = "plots"  # folder that holds the generated PNG files; change it if yours lives elsewhere

def safe_fname(s: str) -> str:
    """Turn an arbitrary label into a filename-friendly token.

    Every run of characters other than ASCII letters, digits, ``-`` and ``_``
    is collapsed into a single underscore, and underscores at either end of
    the result are removed.
    """
    collapsed = re.sub(r"[^0-9a-zA-Z-_]+", '_', s)
    return collapsed.strip('_')

def display_image(path, width=700):
    """Show the image stored at ``path`` inline in the notebook.

    ``width`` is forwarded to IPython's ``Image`` as the display width.
    When nothing exists at ``path`` a short HTML "Missing file" notice is
    rendered instead of raising.
    """
    if not os.path.exists(path):
        # Nothing to show: report the missing path and stop here.
        display(HTML(f"<b>Missing file:</b> {path}"))
        return
    display(Image(path, width=width))

# Optional dataset: when it loads, later cells add numeric summaries next to the plots.
# Point `data_path` at your own copy of the CSV if it is stored somewhere else.
data_path = '/Users/netah/Library/CloudStorage/OneDrive-weizmann.ac.il/PhD- personal/courses/python/work life balance-longevity dataset.csv'
try:
    df = pd.read_csv(data_path)
except Exception as err:
    # Best effort only: with df set to None the later cells skip their numeric summaries.
    df = None
    print('Could not load dataset for numeric summaries:', err)
else:
    print('Loaded dataset for numeric summaries:', data_path)


In [None]:
# Section: Load plot files and list available images
# Collect every PNG directly inside the plots folder, in alphabetical order.
plot_files = glob.glob(os.path.join(plots_dir, '*.png'))
plot_files.sort()
print(f'Found {len(plot_files)} plot files in "{plots_dir}"')
for p in plot_files:
    # Only the bare file name is listed; the folder is already shown above.
    name = os.path.basename(p)
    print('-', name)


In [None]:
# Section: Display and Interpret Scatter (correlation) plots

def _match_corr_columns(stem, columns_by_token):
    """Resolve a correlation-plot filename stem to a pair of dataframe columns.

    Parameters
    ----------
    stem : str
        File name without directory or extension, e.g. ``"a_vs_b_corr"``.
    columns_by_token : dict
        Maps ``safe_fname(column)`` to the original column label.

    Returns
    -------
    tuple
        ``(left_column, right_column)`` or ``(None, None)`` when no split of
        the stem maps onto two known columns.

    Every ``_vs_`` position is tried as the separator, so a column whose own
    token contains ``_vs_`` no longer breaks the lookup. Only a trailing
    ``_corr`` is stripped from the right-hand token; the unstripped token is
    still tried afterwards as a fallback.
    """
    pieces = stem.split('_vs_')
    for cut in range(1, len(pieces)):
        left_tok = '_vs_'.join(pieces[:cut])
        right_tok = '_vs_'.join(pieces[cut:])
        if left_tok not in columns_by_token:
            continue
        right_candidates = [right_tok]
        if right_tok.endswith('_corr'):
            # Prefer the token with the plot-type suffix removed.
            right_candidates.insert(0, right_tok[:-len('_corr')])
        for candidate in right_candidates:
            if candidate in columns_by_token:
                return columns_by_token[left_tok], columns_by_token[candidate]
    return None, None


corr_files = sorted(glob.glob(os.path.join(plots_dir, '*_vs_*_corr.png')))
if not corr_files:
    print('No correlation plot files found (pattern "*_vs_*_corr.png").')
else:
    print(f'Displaying {len(corr_files)} correlation plots')
    # Build the token -> column lookup once; the first column wins on a token
    # collision, matching the histogram cell.
    corr_columns = {}
    if df is not None:
        for c in df.columns:
            corr_columns.setdefault(safe_fname(c), c)
    for p in corr_files:
        display_image(p)
        # Try to infer numeric columns from filename and compute Pearson r if possible
        if df is not None:
            # splitext removes only the extension, unlike str.replace('.png', '').
            fname = os.path.splitext(os.path.basename(p))[0]
            mapped_left, mapped_right = _match_corr_columns(fname, corr_columns)
            if (
                mapped_left is not None
                and mapped_right is not None
                and pd.api.types.is_numeric_dtype(df[mapped_left])
                and pd.api.types.is_numeric_dtype(df[mapped_right])
            ):
                r = df[mapped_left].corr(df[mapped_right])
                print(f'  -> Pearson r between "{mapped_left}" and "{mapped_right}": {r:.3f}')
            else:
                print('  -> Could not compute numeric correlation (missing columns or non-numeric).')
        # Short interpretation guidance
        print('  Interpretation: examine slope, scatter spread, and r (if printed). Tight line = strong correlation; scattered points = weak correlation; slope sign indicates positive/negative relationship.')


In [None]:
# Section: Display and Interpret Histograms

hist_files = sorted(glob.glob(os.path.join(plots_dir, '*_hist.png')))
if not hist_files:
    print('No histogram files found (pattern "*_hist.png").')
else:
    print(f'Displaying {len(hist_files)} histogram plots')
    # Build the token -> column lookup once; the first column wins on a token collision.
    hist_columns = {}
    if df is not None:
        for c in df.columns:
            hist_columns.setdefault(safe_fname(c), c)
    for p in hist_files:
        display_image(p)
        # If dataframe available, compute basic stats for the column inferred from filename
        fname = os.path.splitext(os.path.basename(p))[0]
        # Strip only the trailing "_hist". Replacing every occurrence would mangle
        # columns such as "work_history" ("work_history_hist" -> "workory").
        tok = fname[:-len('_hist')] if fname.endswith('_hist') else fname
        if df is not None:
            mapped = hist_columns.get(tok)
            if mapped is not None and pd.api.types.is_numeric_dtype(df[mapped]):
                col = df[mapped].dropna()
                mean = col.mean()
                median = col.median()
                std = col.std()
                print(f'  -> {mapped}: n={len(col)}, mean={mean:.2f}, median={median:.2f}, std={std:.2f}')
            else:
                print('  -> Could not compute stats for this histogram (column not found or non-numeric).')
        print('  Interpretation: check skewness, modality, and spread. Narrow distrib = low variance; skew indicates tail to look for outliers.')


In [None]:
# Section: Display and Interpret Line Charts

# This script does not generate time-series line charts by default. If you have line charts saved
# in the plots folder (e.g., filenames containing "line" or "timeseries"), we'll display them here.
line_patterns = ('*line*.png', '*timeseries*.png')
# A set removes duplicates: a file whose name matches both patterns
# would otherwise be displayed twice.
line_files = sorted({
    path
    for pattern in line_patterns
    for path in glob.glob(os.path.join(plots_dir, pattern))
})
if not line_files:
    print('No line/time-series plots detected in the plots folder.')
else:
    for p in line_files:
        display_image(p)
        print('  Interpretation: examine trends, seasonality, and abrupt changes. Context and axis labels matter.')


In [None]:
# Section: Work-life balance plots (by occupation) preview
# Gather the balance-index plots in alphabetical order and preview the first dozen.
wl_pattern = os.path.join(plots_dir, 'work_life_balance_index_vs_*_by_*.png')
wl_files = sorted(glob.glob(wl_pattern))
if wl_files:
    print(f'Found {len(wl_files)} work-life-balance plots; displaying up to 12 samples')
    for p in wl_files[:12]:
        display_image(p, width=700)
        # Reading hint printed under each figure.
        print('  Interpretation: points toward the right indicate higher balance index; examine relationship with longevity/age at death and look for positive or negative trends.')
else:
    print('No work-life-balance index plots found.')


In [None]:
# Section: Summary of key findings (automated)

# One dict per numeric column; turned into a DataFrame in a single step below.
summary_rows = []
if df is not None:
    # The correlation target is only usable when the column exists and is numeric.
    has_age_at_death = (
        'age_at_death' in df.columns
        and pd.api.types.is_numeric_dtype(df['age_at_death'])
    )
    # numeric columns summary (identifier column excluded); str() keeps the
    # check safe for non-string column labels.
    numeric_cols = [
        c for c in df.columns
        if pd.api.types.is_numeric_dtype(df[c]) and str(c).lower() != 'id'
    ]
    for c in numeric_cols:
        col = df[c].dropna()
        if len(col) == 0:
            continue
        row = {
            'column': c,
            'n': int(len(col)),
            'mean': float(col.mean()),
            'median': float(col.median()),
            'std': float(col.std())
        }
        # correlation with age_at_death if present
        if has_age_at_death:
            try:
                row['corr_with_age_at_death'] = float(col.corr(df['age_at_death']))
            except Exception:
                # Best effort: keep the row and leave the correlation blank.
                row['corr_with_age_at_death'] = None
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)
if not summary_df.empty:
    # The correlation column exists only when the dataset has a numeric
    # 'age_at_death'; sorting by a missing column would raise KeyError.
    if 'corr_with_age_at_death' in summary_df.columns:
        display(summary_df.sort_values(by='corr_with_age_at_death', na_position='last'))
    else:
        display(summary_df)

    # Save summary table for convenience
    os.makedirs(plots_dir, exist_ok=True)  # to_csv does not create missing folders
    summary_csv = os.path.join(plots_dir, 'plots_summary.csv')
    summary_df.to_csv(summary_csv, index=False)
    print('Saved numeric summary to', summary_csv)
else:
    print('No numeric summary available (dataset not loaded or no numeric columns).')
