In [10]:
"""
solve_problem_from_ppt.py

Usage:
    python solve_problem_from_ppt.py /path/to/presentation.pptx
    OR just run: python solve_problem_from_ppt.py
    (if no PPTX file is provided, it will run the fallback demo problem)
"""

import sys
import os
import re
import pandas as pd
import numpy as np
#from pptx import Presentation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, classification_report

def extract_problem_sentences(pptx_path):
    """Pull problem-statement-like sentences out of a PowerPoint deck.

    Each slide's shape texts are stripped, joined with newlines and checked
    for the phrases "Problem Statement", "no-show"/"noshow" or "reduce"
    (case-insensitive). Matching slides are split on newlines and periods;
    fragments longer than 10 characters that mention "no-show", "reduce" or
    "problem" are kept.

    Args:
        pptx_path: Path of the .pptx file to read.

    Returns:
        A list of ``(slide_number, sentence)`` tuples in slide order, with
        slide numbers starting at 1.

    Raises:
        ImportError: If the python-pptx package is not installed.
    """
    # The module-level python-pptx import is commented out so the fallback
    # demo can run without the package; import it here, where it is actually
    # needed, instead of failing with a NameError on ``Presentation``.
    try:
        from pptx import Presentation
    except ImportError as err:
        raise ImportError(
            "python-pptx is required to read PPTX files "
            "(install it with 'pip install python-pptx')"
        ) from err

    # Compiled once instead of re-specifying the patterns per slide/fragment.
    slide_trigger = re.compile(r'Problem Statement|no-?show|reduce', re.I)
    sentence_trigger = re.compile(r'no-?show|reduce|problem', re.I)

    extracted = []
    for i, slide in enumerate(Presentation(pptx_path).slides, start=1):
        stripped = [
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        combined = "\n".join(t for t in stripped if t)
        if not slide_trigger.search(combined):
            continue
        for part in re.split(r'[\n\.]\s*', combined):
            part = part.strip()
            # The length filter drops headings and stray fragments.
            if len(part) > 10 and sentence_trigger.search(part):
                extracted.append((i, part))
    return extracted

def build_and_evaluate_model():
    """Train and score a logistic-regression no-show model on synthetic data.

    Generates 4000 synthetic appointment rows from a generator seeded with
    42, holds out a stratified 25% test split, standardises the six numeric
    columns (the two 0/1 flags are passed through unscaled) and fits a
    ``LogisticRegression``.

    Returns:
        A ``(model, scaler, results)`` tuple. ``results`` is a dict with the
        keys ``roc_auc``, ``accuracy``, ``precision``, ``recall``, ``report``
        (text classification report), ``X_test`` and ``y_test`` (both with a
        fresh 0-based index) and ``y_proba`` (predicted probability of the
        positive class for each test row). Hard predictions use a 0.5
        probability threshold.
    """
    n_rows = 4000
    rng = np.random.default_rng(42)

    # Every draw advances the same generator, so the order of these calls
    # determines the data set; do not reorder them.
    age = rng.integers(18, 85, n_rows)
    lead_days = rng.integers(0, 60, n_rows)
    prev_no_shows = rng.integers(0, 5, n_rows)
    appointment_hour = rng.integers(8, 18, n_rows)
    weekday = rng.integers(0, 6, n_rows)
    distance_km = np.round(rng.normal(10, 7, n_rows).clip(0.5, 150), 1)
    reminder_sent = rng.choice([0, 1], size=n_rows, p=[0.6, 0.4])
    socio_flag = rng.choice([0, 1], size=n_rows, p=[0.7, 0.3])

    # Appointments before 10 or after 16 carry a small extra risk.
    off_peak = ((appointment_hour < 10) | (appointment_hour > 16)).astype(float)

    # Linear risk score; terms are summed in a fixed order so the floating
    # point result is reproducible.
    risk = (
        0.12
        + 0.02 * prev_no_shows
        + 0.002 * lead_days
        + 0.001 * distance_km
        + 0.05 * socio_flag
        - 0.05 * reminder_sent
        + 0.01 * off_peak
    )
    # Add Gaussian noise and keep the result a valid probability.
    risk = np.clip(risk + rng.normal(0, 0.03, n_rows), 0.01, 0.9)
    no_show = rng.binomial(1, risk)

    df = pd.DataFrame(
        {
            "age": age,
            "lead_days": lead_days,
            "prev_no_shows": prev_no_shows,
            "appointment_hour": appointment_hour,
            "weekday": weekday,
            "distance_km": distance_km,
            "reminder_sent": reminder_sent,
            "socio_flag": socio_flag,
            "no_show": no_show,
        }
    )

    features = df.drop(columns=["no_show"])
    target = df["no_show"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, stratify=target, random_state=42
    )

    numeric_cols = ["age", "lead_days", "prev_no_shows", "appointment_hour", "weekday", "distance_km"]
    flag_cols = ["reminder_sent", "socio_flag"]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train[numeric_cols])
    test_scaled = scaler.transform(X_test[numeric_cols])

    X_train_pre = np.concatenate([train_scaled, X_train[flag_cols].values], axis=1)
    X_test_pre = np.concatenate([test_scaled, X_test[flag_cols].values], axis=1)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_pre, y_train)

    # Column 1 of predict_proba is the probability of class 1 (no-show).
    y_proba = model.predict_proba(X_test_pre)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    results = {}
    results["roc_auc"] = roc_auc_score(y_test, y_proba)
    results["accuracy"] = accuracy_score(y_test, y_pred)
    results["precision"] = precision_score(y_test, y_pred, zero_division=0)
    results["recall"] = recall_score(y_test, y_pred, zero_division=0)
    results["report"] = classification_report(y_test, y_pred, zero_division=0)
    results["X_test"] = X_test.reset_index(drop=True)
    results["y_test"] = y_test.reset_index(drop=True)
    results["y_proba"] = y_proba
    return model, scaler, results

def recommend_interventions(X_df, proba, top_k_percent=20):
    """Flag the highest-risk rows for a reminder.

    Args:
        X_df: Feature rows as a DataFrame. It is copied, not modified, and
            its index is discarded in the result.
        proba: Predicted no-show probabilities, one per row of ``X_df`` and
            in the same row order. Any array-like is accepted; values are
            matched to rows by position, never by index label.
        top_k_percent: Share of rows (0-100) to flag, taken from the top of
            the probability distribution. Ties at the cutoff are all flagged,
            so slightly more than this share can be selected.

    Returns:
        A new DataFrame with a 0-based index and three added columns:
        ``pred_proba``, ``reminder_recommendation`` (1 if flagged, else 0)
        and ``recommendation_reason`` (``"Top <k>%"`` or ``"No action"``).

    Raises:
        ValueError: If ``proba`` and ``X_df`` differ in length.
    """
    # Coerce to a plain array so a pandas Series with a non-default index is
    # not re-aligned by label against the freshly reset index (which would
    # silently fill pred_proba with NaN).
    scores = np.asarray(proba, dtype=float).ravel()
    out = X_df.copy().reset_index(drop=True)
    if len(scores) != len(out):
        raise ValueError(
            f"proba has {len(scores)} values but X_df has {len(out)} rows"
        )

    if scores.size == 0:
        # Nothing to rank; a percentile of an empty array is undefined.
        flagged = np.zeros(0, dtype=bool)
    else:
        cutoff = np.percentile(scores, 100 - top_k_percent)
        flagged = scores >= cutoff

    out["pred_proba"] = scores
    out["reminder_recommendation"] = flagged.astype(int)
    out["recommendation_reason"] = np.where(
        flagged, f"Top {top_k_percent}%", "No action"
    )
    return out

def main():
    """Run the demo: optional PPTX scan, model training and reporting.

    If a first command-line argument is given and names an existing path,
    problem sentences are extracted from it, the first five are printed and
    all of them are written to ``extracted_problem_statements.csv`` in the
    working directory. In every case the synthetic no-show model is then
    trained and evaluated, the highest-risk share of the test set is flagged
    for reminders, and the estimated effect of those reminders is printed.
    """
    # Share of test rows flagged for a reminder, and the assumed relative
    # drop in no-show probability for a flagged patient. Both feed the
    # computation and the printed report, so they are defined once here
    # rather than repeated as literals inside the messages.
    top_k_percent = 20
    reduction = 0.40

    pptx_path = sys.argv[1] if len(sys.argv) > 1 else None
    problems = []

    if pptx_path and os.path.exists(pptx_path):
        problems = extract_problem_sentences(pptx_path)
        if problems:
            print("Extracted problem statements (first 5):")
            for i, (slide_idx, sentence) in enumerate(problems[:5], start=1):
                print(f"{i}. (Slide {slide_idx}) {sentence}")
            pd.DataFrame(problems, columns=["slide_index", "sentence"]).to_csv("extracted_problem_statements.csv", index=False)
            print("Saved extracted_problem_statements.csv")
        else:
            print("No clear problem phrase found in PPT. Using fallback problem.")
    else:
        print("⚠️ No PPT file provided. Using fallback problem: appointment no-shows.")

    # Build and evaluate model (for the fallback problem)
    print("\nTraining a sample predictive model for appointment no-shows (synthetic data)...")
    model, scaler, res = build_and_evaluate_model()
    print(f"ROC AUC: {res['roc_auc']:.3f}, Accuracy: {res['accuracy']:.3f}, Precision: {res['precision']:.3f}, Recall: {res['recall']:.3f}")
    print("\nClassification report:\n", res["report"])

    rec = recommend_interventions(res["X_test"], res["y_proba"], top_k_percent=top_k_percent)
    rec["actual_no_show"] = res["y_test"]

    # Compare the observed no-show rate of the flagged group with the whole test set.
    baseline = res["y_test"].mean()
    selected = rec[rec["reminder_recommendation"] == 1]
    sel_rate = selected["actual_no_show"].mean() if len(selected) > 0 else 0.0
    print(f"\nBaseline no-show rate (test set): {baseline:.3f}")
    print(f"No-show rate among top {top_k_percent}%: {sel_rate:.3f} (N={len(selected)})")

    # Expected no-shows avoided = sum over flagged rows of (predicted
    # probability * reduction), expressed as a fraction of the test set.
    expected_reduction = (selected["pred_proba"] * reduction).sum() / len(res["y_test"])
    expected_new_rate = baseline - expected_reduction
    # ':.0%' rounds to the nearest percent; int(reduction*100) would truncate
    # values such as 0.29 down to 28.
    print(f"\nIf reminders cut no-shows by {reduction:.0%} for recipients:")
    print(f"Expected new no-show rate ≈ {expected_new_rate:.3f}")
    print(f"Absolute reduction ≈ {baseline - expected_new_rate:.3f}")

    print("\nSample recommendations (top 10):")
    print(rec.sort_values("pred_proba", ascending=False).head(10).to_string(index=False))

# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()


⚠️ No PPT file provided. Using fallback problem: appointment no-shows.

Training a sample predictive model for appointment no-shows (synthetic data)...
ROC AUC: 0.584, Accuracy: 0.771, Precision: 0.000, Recall: 0.000

Classification report:
               precision    recall  f1-score   support

           0       0.77      1.00      0.87       771
           1       0.00      0.00      0.00       229

    accuracy                           0.77      1000
   macro avg       0.39      0.50      0.44      1000
weighted avg       0.59      0.77      0.67      1000


Baseline no-show rate (test set): 0.229
No-show rate among top 20%: 0.290 (N=200)

If reminders cut no-shows by 40% for recipients:
Expected new no-show rate ≈ 0.202
Absolute reduction ≈ 0.027

Sample recommendations (top 10):
 age  lead_days  prev_no_shows  appointment_hour  weekday  distance_km  reminder_sent  socio_flag  pred_proba  reminder_recommendation recommendation_reason  actual_no_show
  30         59              4