In [28]:
# %% Cell 1: Import libraries and set constants
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Constants
TODAY = datetime(2025, 7, 3).date()
SCREENING_WINDOW = 30  # days to look ahead for reminders

In [29]:
# %% Cell 2: Data Loading and Cleaning
def load_and_clean_data(filepath):
    """Load patient data and perform cleaning"""
    df = pd.read_csv(filepath)
    
    # Convert dates
    date_cols = ['last_breast_screen', 'last_cervical_screen', 'last_colorectal_screen']
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce').dt.date
    
    # Handle missing values
    df['family_history'].fillna('None', inplace=True)
    df['lifestyle'].fillna('Unknown', inplace=True)
    
    # Create binary flags
    df['is_smoker'] = df['lifestyle'].apply(lambda x: 1 if 'smoker' in x.lower() else 0)
    df['alcohol_consumer'] = df['lifestyle'].apply(lambda x: 1 if 'alcohol' in x.lower() else 0)
    df['obese'] = df['lifestyle'].apply(lambda x: 1 if 'obese' in x.lower() else 0)
    
    # Create family history features
    df['fh_Breast'] = df['family_history'].apply(lambda x: 1 if 'breast' in x.lower() else 0)
    df['fh_Cervical'] = df['family_history'].apply(lambda x: 1 if 'cervical' in x.lower() else 0)
    df['fh_Colorectal'] = df['family_history'].apply(
        lambda x: 1 if any(word in x.lower() for word in ['colorectal', 'colon']) else 0
    )
    
    return df

In [30]:
# %% Cell 3: Risk Stratification Models
def train_risk_model(cancer_type, df):
    """Train ML model for specific cancer risk stratification"""
    # Feature engineering
    df = df.copy()
    df['age'] = pd.to_numeric(df['age'])
    
    # Prepare features and target
    if cancer_type == "Breast":
        features = ['age', 'gender', 'fh_Breast', 'is_smoker', 'obese']
        target = 'risk_breast'
    elif cancer_type == "Cervical":
        features = ['age', 'gender', 'fh_Cervical', 'is_smoker', 'obese']
        target = 'risk_cervical'
    elif cancer_type == "Colorectal":
        features = ['age', 'gender', 'fh_Colorectal', 'alcohol_consumer', 'obese']
        target = 'risk_colorectal'
    else:
        raise ValueError(f"Unknown cancer type: {cancer_type}")
    
    # Check if target exists
    if target not in df.columns:
        raise KeyError(f"Required column '{target}' not found in data. Please add risk labels.")
    
    X = pd.get_dummies(df[features], columns=['gender'])
    y = df[target]  # Use existing risk column
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    print(f"{cancer_type} Model - Train Acc: {train_acc:.2f}, Test Acc: {test_acc:.2f}")
    
    return model

def calculate_risk_levels(df, breast_model, cervical_model, colorectal_model):
    """Calculate risk levels for all patients"""
    # Breast cancer risk
    breast_features = pd.get_dummies(df[['age', 'gender', 'fh_Breast', 'is_smoker', 'obese']], columns=['gender'])
    df['breast_risk'] = breast_model.predict(breast_features)
    
    # Cervical cancer risk
    cervical_features = pd.get_dummies(df[['age', 'gender', 'fh_Cervical', 'is_smoker', 'obese']], columns=['gender'])
    df['cervical_risk'] = cervical_model.predict(cervical_features)
    
    # Colorectal cancer risk
    colorectal_features = pd.get_dummies(df[['age', 'gender', 'fh_Colorectal', 'alcohol_consumer', 'obese']], columns=['gender'])
    df['colorectal_risk'] = colorectal_model.predict(colorectal_features)
    
    return df

In [31]:
# %% Cell 4: Screening Guidelines Engine
def get_screening_interval(cancer_type, risk_level, gender, age):
    """Determine screening interval based on guidelines"""
    if cancer_type == "Breast":
        if gender != "Female":
            return None  # Only females are screened for breast cancer
        if risk_level == "High":
            return 1  # years
        elif risk_level == "Medium":
            return 2
        else:  # Low risk
            return 3
    
    elif cancer_type == "Cervical":
        if gender != "Female":
            return None  # Only females are screened for cervical cancer
        if risk_level == "High":
            return 1
        elif risk_level == "Medium":
            return 3
        else:  # Low risk
            return 5
    
    elif cancer_type == "Colorectal":
        if age < 45:
            return None  # Not eligible
        if risk_level == "High":
            return 5
        else:  # Medium/Low risk
            return 10
    
    return None

def next_screening_due(last_screen_date, interval_years):
    """Calculate next due date"""
    if pd.isna(last_screen_date) or last_screen_date is None:
        return TODAY  # Immediately due if never screened
    
    # Calculate next due date and adjust if it's in the past
    due_date = last_screen_date + timedelta(days=interval_years*365)
    return max(due_date, TODAY)  # Ensure due date isn't in the past

In [32]:
# %% Cell 5: Reminder Generation System
def generate_reminders(df):
    """Generate screening reminders for all patients"""
    reminders = []
    
    for _, row in df.iterrows():
        # Breast cancer screening
        interval = get_screening_interval(
            "Breast", row['breast_risk'], row['gender'], row['age']
        )
        if interval:
            due_date = next_screening_due(row['last_breast_screen'], interval)
            if due_date <= TODAY + timedelta(days=SCREENING_WINDOW):
                reminders.append({
                    'patient_id': row['patient_id'],
                    'name': row['name'],
                    'type': 'Breast Cancer Screening',
                    'due_date': due_date,
                    'risk': row['breast_risk']
                })
        
        # Cervical cancer screening
        interval = get_screening_interval(
            "Cervical", row['cervical_risk'], row['gender'], row['age']
        )
        if interval:
            due_date = next_screening_due(row['last_cervical_screen'], interval)
            if due_date <= TODAY + timedelta(days=SCREENING_WINDOW):
                reminders.append({
                    'patient_id': row['patient_id'],
                    'name': row['name'],
                    'type': 'Cervical Cancer Screening',
                    'due_date': due_date,
                    'risk': row['cervical_risk']
                })
        
        # Colorectal cancer screening
        interval = get_screening_interval(
            "Colorectal", row['colorectal_risk'], row['gender'], row['age']
        )
        if interval:
            due_date = next_screening_due(row['last_colorectal_screen'], interval)
            if due_date <= TODAY + timedelta(days=SCREENING_WINDOW):
                reminders.append({
                    'patient_id': row['patient_id'],
                    'name': row['name'],
                    'type': 'Colorectal Cancer Screening',
                    'due_date': due_date,
                    'risk': row['colorectal_risk']
                })
    
    return pd.DataFrame(reminders)

In [None]:
# %% Cell 6: Main Execution Flow
if __name__ == "__main__":
    # Load and clean data
    df = load_and_clean_data("patientsData.csv")
    
    # Train real models using existing risk labels
    print("Training models...")
    breast_model = train_risk_model("Breast", df)
    cervical_model = train_risk_model("Cervical", df)
    colorectal_model = train_risk_model("Colorectal", df)
    
    # Calculate risks using the trained models
    df = calculate_risk_levels(df, breast_model, cervical_model, colorectal_model)
    
    # Generate reminders
    reminders_df = generate_reminders(df)
    
    # Print results
    print(f"\nScreening Reminders as of {TODAY}:\n")
    if reminders_df.empty:
        print("No reminders to send at this time.")
    else:
        for _, reminder in reminders_df.iterrows():
            print(f"Reminder sent to {reminder['name']} ({reminder['patient_id']}):")
            print(f"  - Type: {reminder['type']}")
            print(f"  - Due Date: {reminder['due_date'].strftime('%Y-%m-%d')}")
            print(f"  - Priority: {reminder['risk']} Risk\n")

Training models...
Breast Model - Train Acc: 1.00, Test Acc: 0.00


ValueError: Input contains NaN