# UTI Prediction Model Inference

This notebook implements the inference pipeline for UTI prediction:
1. Loading trained models and parameters
2. Data preprocessing functions
3. Prediction pipeline with both models
4. Confidence scoring and threshold application
5. Example usage with sample data

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## 1. Load Trained Models and Parameters

In [None]:
def load_models(model_dir='../models'):
    """Load the trained models and their associated inference artifacts.

    Args:
        model_dir (str): Directory that contains ``rf_model.joblib``,
            ``xgb_model.joblib``, ``scaler.joblib`` and
            ``optimal_thresholds.joblib``. Defaults to ``'../models'``.

    Returns:
        dict or None: Mapping with the keys ``'rf_model'``, ``'xgb_model'``,
        ``'scaler'`` and ``'thresholds'``. ``None`` is returned (after the
        error has been printed) when any artifact cannot be loaded.
    """
    import os

    # Result key -> artifact file name, listed in the order they are loaded.
    artifact_files = {
        'rf_model': 'rf_model.joblib',
        'xgb_model': 'xgb_model.joblib',
        'scaler': 'scaler.joblib',
        'thresholds': 'optimal_thresholds.joblib',
    }

    try:
        # NOTE: joblib.load unpickles its input, so only point model_dir at
        # artifacts that come from a trusted source.
        return {
            key: joblib.load(os.path.join(model_dir, filename))
            for key, filename in artifact_files.items()
        }
    except Exception as e:
        # Kept broad on purpose: callers test the return value for truthiness
        # and skip the pipeline when loading fails for any reason.
        print(f"Error loading models: {str(e)}")
        return None

## 2. Data Preprocessing Functions

In [None]:
def preprocess_data(data, scaler):
    """Preprocess input data for prediction.

    The input frame is left untouched; all work happens on a copy.

    Args:
        data (pd.DataFrame): Input data with required features. Extra
            columns are ignored.
        scaler (StandardScaler): Fitted scaler from training; its
            ``transform`` is applied to the four numerical columns.

    Returns:
        np.ndarray: Preprocessed data ready for prediction, with columns in
        the order given by ``feature_order`` below.

    Raises:
        ValueError: If a required feature column is missing, or if the
            ``gender`` column holds a value other than 'M' or 'F'
            (case and surrounding whitespace are ignored).
    """
    numerical_features = ['age', 'urine_ph', 'wbc', 'rbc']

    categorical_features_binary = [
        'frequent_urination', 'painful_urination', 'lower_abdominal_pain',
        'cloudy_urine', 'blood_in_urine', 'fever', 'urgent_urination',
        'foul_smelling_urine', 'nitrites', 'leukocyte_esterase',
        'diabetes', 'hypertension', 'bacteria'
    ]

    # Define the exact feature order as used in training
    feature_order = [
        # Numerical features first
        'age', 'urine_ph', 'wbc', 'rbc',

        # Categorical features
        'frequent_urination', 'painful_urination', 'lower_abdominal_pain',
        'cloudy_urine', 'blood_in_urine', 'fever', 'urgent_urination',
        'foul_smelling_urine', 'nitrites', 'leukocyte_esterase',
        'gender', 'diabetes', 'hypertension', 'bacteria'
    ]

    # Fail early with an informative error if any feature is missing
    missing_features = set(feature_order) - set(data.columns)
    if missing_features:
        raise ValueError(f"Missing features: {missing_features}. Please ensure all required features are present.")

    # Select and reorder the features; the explicit copy keeps the caller's
    # frame unmodified and avoids assigning into a slice of it.
    processed_data = data[feature_order].copy()

    for feature in categorical_features_binary:
        processed_data[feature] = processed_data[feature].astype(int)

    # Encode gender as binary (assuming 'M' is 0, 'F' is 1 — confirm against
    # the training pipeline). Unrecognised values are rejected instead of
    # silently becoming NaN and flowing into the models.
    gender_codes = {'M': 0, 'F': 1}
    raw_gender = processed_data['gender']
    encoded_gender = raw_gender.astype(str).str.strip().str.upper().map(gender_codes)
    unknown_mask = encoded_gender.isna()
    if unknown_mask.any():
        unknown_values = sorted(set(raw_gender[unknown_mask].astype(str)))
        raise ValueError(f"Unrecognized gender values: {unknown_values}. Expected 'M' or 'F'.")
    processed_data['gender'] = encoded_gender.astype(int)

    # Scale numerical features
    processed_data[numerical_features] = scaler.transform(processed_data[numerical_features])

    # Convert to numpy array to avoid feature name issues
    return processed_data.values

## 3. Sample Data Generation

In [None]:
def stratified_sample(data, n_samples=15):
    """
    Perform stratified sampling across different UTI risk scenarios.

    Each row is assigned to one of four rule-based risk categories and the
    requested sample is split 30% / 30% / 20% / 20% across them (high,
    moderate, low, no risk). Any rounding shortfall goes to the no-risk
    category. The input DataFrame is not modified.

    Args:
        data (pd.DataFrame): Original dataset. Must contain the columns
            'bacteria', 'frequent_urination', 'painful_urination', 'fever',
            'leukocyte_esterase', 'nitrites', 'lower_abdominal_pain' and
            'cloudy_urine'.
        n_samples (int): Total number of samples to generate.

    Returns:
        pd.DataFrame: Diverse sample representing different UTI scenarios,
        with a fresh 0..n-1 index. A category with fewer rows than its quota
        is sampled with replacement; a category with no rows at all is
        skipped, so the result can then hold fewer than ``n_samples`` rows.
    """
    # Define more nuanced risk categorization
    def categorize_risk(row):
        # High-risk criteria: multiple strong indicators of UTI
        if (row['bacteria'] == 1 and
            ((row['frequent_urination'] == 1) or (row['painful_urination'] == 1)) and
            (row['fever'] == 1 or row['leukocyte_esterase'] == 1)):
            return 'high_risk_uti'

        # Moderate risk: potential UTI indicators
        elif ((row['leukocyte_esterase'] == 1 or row['nitrites'] == 1) and
              (row['frequent_urination'] == 1 or row['painful_urination'] == 1)):
            return 'moderate_risk_potential_uti'

        # Low risk with some symptoms
        elif sum([row['frequent_urination'], row['painful_urination'],
                  row['lower_abdominal_pain'], row['cloudy_urine']]) >= 2:
            return 'low_risk_some_symptoms'

        # Very low risk or no UTI indicators
        else:
            return 'no_risk_low_symptoms'

    # Nothing to categorise or sample from
    if data.empty:
        return data.copy()

    # Keep the categories in a separate Series rather than a temporary column
    # so the caller's DataFrame is never mutated.
    risk_category = data.apply(categorize_risk, axis=1)

    # Calculate sample distribution
    # Ensure representation of different risk scenarios
    category_samples = {
        'high_risk_uti': int(n_samples * 0.3),  # 30% high-risk cases
        'moderate_risk_potential_uti': int(n_samples * 0.3),  # 30% moderate-risk cases
        'low_risk_some_symptoms': int(n_samples * 0.2),  # 20% low-risk cases
        'no_risk_low_symptoms': int(n_samples * 0.2)  # 20% no-risk cases
    }

    # Adjust for any rounding discrepancies
    total_samples = sum(category_samples.values())
    if total_samples < n_samples:
        category_samples['no_risk_low_symptoms'] += (n_samples - total_samples)

    # Perform stratified sampling
    samples = []
    for category, count in category_samples.items():
        category_data = data[risk_category == category]

        # An empty category cannot be sampled, even with replacement
        if count == 0 or category_data.empty:
            continue

        # If not enough samples, sample with replacement
        samples.append(
            category_data.sample(
                n=count,
                replace=len(category_data) < count,
                random_state=42,
            )
        )

    if not samples:
        return data.iloc[0:0].reset_index(drop=True)

    # Combine samples
    return pd.concat(samples, ignore_index=True)

## 4. Prediction Pipeline

In [None]:
def get_prediction(data, model_artifacts):
    """Score the input with both models and derive an ensemble confidence.

    Args:
        data (pd.DataFrame): Raw input rows; they are passed through
            ``preprocess_data`` before scoring.
        model_artifacts (dict): Must provide 'scaler', 'rf_model',
            'xgb_model' and 'thresholds' (the latter with the keys
            'rf_threshold' and 'xgb_threshold').

    Returns:
        pd.DataFrame: One row per input row with each model's thresholded
        prediction and positive-class probability, the averaged ensemble
        probability, a confidence score and a 0/1 agreement flag.
    """
    features = preprocess_data(data, model_artifacts['scaler'])

    # Positive-class probability from each model
    forest_proba = model_artifacts['rf_model'].predict_proba(features)[:, 1]
    boosted_proba = model_artifacts['xgb_model'].predict_proba(features)[:, 1]

    # Turn probabilities into labels using each model's own cut-off
    cutoffs = model_artifacts['thresholds']
    forest_label = (forest_proba >= cutoffs['rf_threshold']).astype(int)
    boosted_label = (boosted_proba >= cutoffs['xgb_threshold']).astype(int)

    # Simple average of the two probabilities
    mean_proba = (forest_proba + boosted_proba) / 2
    agreement_flag = (forest_label == boosted_label).astype(int)

    # When the models agree the confidence is the distance-from-zero of the
    # more likely class; when they disagree it is that of the less likely one.
    complement = 1 - mean_proba
    stronger_side = np.maximum(mean_proba, complement)
    weaker_side = np.minimum(mean_proba, complement)
    confidence = np.where(agreement_flag, stronger_side, weaker_side)

    return pd.DataFrame({
        'rf_prediction': forest_label,
        'rf_probability': forest_proba,
        'xgb_prediction': boosted_label,
        'xgb_probability': boosted_proba,
        'ensemble_probability': mean_proba,
        'confidence_score': confidence,
        'models_agree': agreement_flag
    })

## 5. Results Interpretation

In [None]:
def interpret_prediction(prediction_results):
    """Interpret a prediction result and provide a recommendation.

    Only the FIRST row of ``prediction_results`` is interpreted, so pass a
    single-row frame (e.g. ``predictions.iloc[[i]]``) for each sample.

    Args:
        prediction_results (pd.DataFrame): Results from get_prediction
            function; the columns 'ensemble_probability',
            'confidence_score' and 'models_agree' are read.

    Returns:
        dict: Dictionary with the keys 'final_prediction' (1 when the
        ensemble probability is at least 0.5, else 0), 'confidence_level'
        ('high' from 0.8, 'medium' from 0.6, otherwise 'low'),
        'model_agreement' ('agreed' / 'disagreed'), 'probability' and
        'recommendation'.

    Raises:
        ValueError: If ``prediction_results`` has no rows.
    """
    if len(prediction_results) == 0:
        raise ValueError("prediction_results is empty; nothing to interpret.")

    row = prediction_results.iloc[0]
    probability = row['ensemble_probability']
    confidence_score = row['confidence_score']

    final_prediction = 1 if probability >= 0.5 else 0

    if confidence_score >= 0.8:
        confidence_level = 'high'
    elif confidence_score >= 0.6:
        confidence_level = 'medium'
    else:
        confidence_level = 'low'

    # Generate recommendation based on prediction and confidence
    if final_prediction == 1:
        positive_recommendations = {
            'high': "High probability of UTI. Recommend immediate clinical evaluation.",
            'medium': "Moderate probability of UTI. Recommend clinical evaluation within 24 hours.",
            'low': "Possible UTI. Monitor symptoms and recommend clinical evaluation if symptoms persist.",
        }
        recommendation = positive_recommendations[confidence_level]
    elif confidence_level == 'high':
        recommendation = "Low probability of UTI. Monitor for symptom changes."
    else:
        recommendation = "UTI unlikely but cannot be ruled out. Monitor symptoms and seek evaluation if they worsen."

    return {
        'final_prediction': final_prediction,
        'confidence_level': confidence_level,
        'model_agreement': 'agreed' if row['models_agree'] else 'disagreed',
        'probability': probability,
        'recommendation': recommendation
    }

## 6. Example Usage

In [None]:
# Read the synthetic dataset from disk
data = pd.read_csv('../data/uti_synthetic_data.csv')

# Fetch the trained models together with their scaler and thresholds
model_artifacts = load_models()

# load_models() returns None on failure, in which case the demo is skipped
if model_artifacts:
    # Draw a small sample that covers the different risk scenarios
    sample_data = stratified_sample(data, n_samples=5)
    print("\nStratified Sample Data:")
    print(sample_data)

    # Score the sample with both models
    predictions = get_prediction(sample_data, model_artifacts)
    print("\nPrediction Results:")
    print(predictions)

    # Interpret one row at a time; each call receives a single-row frame
    print("\nInterpretations:")
    for sample_number in range(1, len(predictions) + 1):
        single_row = predictions.iloc[[sample_number - 1]]
        sample_interpretation = interpret_prediction(single_row)
        verdict = 'Positive' if sample_interpretation['final_prediction'] == 1 else 'Negative'
        print(f"\nSample {sample_number}:")
        print(f"Prediction: {verdict}")
        print(f"Confidence Level: {sample_interpretation['confidence_level']}")
        print(f"Model Agreement: {sample_interpretation['model_agreement']}")
        print(f"Probability: {sample_interpretation['probability']:.2f}")
        print(f"Recommendation: {sample_interpretation['recommendation']}")

## 7. Interactive Patient Input

In [None]:
def _prompt_until_valid(prompt, parse, error_message):
    """Ask *prompt* repeatedly until *parse* accepts the reply; return its result."""
    while True:
        try:
            return parse(input(prompt))
        except ValueError:
            print(error_message)


def _parse_binary(text):
    """Parse a 0/1 answer; raise ValueError for anything else."""
    value = int(text)
    if value not in (0, 1):
        raise ValueError(f"expected 0 or 1, got {value}")
    return value


def _parse_age(text):
    """Parse a non-negative whole-number age; raise ValueError otherwise."""
    value = int(text)
    if value < 0:
        raise ValueError(f"age cannot be negative, got {value}")
    return value


def _parse_gender(text):
    """Parse a gender answer into 'M' or 'F'; raise ValueError otherwise."""
    gender = text.strip().upper()
    if gender not in ('M', 'F'):
        raise ValueError(f"expected M or F, got {text!r}")
    return gender


def collect_patient_input():
    """Collect patient information interactively for UTI prediction.

    Every answer is validated as it is entered; an invalid answer prints a
    short hint and the same question is asked again instead of aborting
    the whole form.

    Returns:
        pd.DataFrame: Single-row DataFrame with patient input data. Yes/no
        answers are stored as 0/1 integers, age as an integer, the three
        measurements as floats and gender as 'M' or 'F'.
    """
    print("UTI Prediction Input Form")
    print("Please enter patient information (0 for No, 1 for Yes)")

    binary_hint = "Invalid input. Please enter 0 or 1."
    number_hint = "Invalid input. Please enter a number."

    def ask_binary(prompt):
        return _prompt_until_valid(prompt, _parse_binary, binary_hint)

    # Symptom and dipstick questions, asked in this order
    symptom_prompts = [
        ('frequent_urination', "Frequent urination (0/1): "),
        ('painful_urination', "Painful urination (0/1): "),
        ('lower_abdominal_pain', "Lower abdominal pain (0/1): "),
        ('cloudy_urine', "Cloudy urine (0/1): "),
        ('blood_in_urine', "Blood in urine (0/1): "),
        ('fever', "Fever (0/1): "),
        ('urgent_urination', "Urgent urination (0/1): "),
        ('foul_smelling_urine', "Foul-smelling urine (0/1): "),
        ('nitrites', "Nitrites present (0/1): "),
        ('leukocyte_esterase', "Leukocyte esterase present (0/1): "),
    ]
    patient_data = {key: ask_binary(prompt) for key, prompt in symptom_prompts}

    # Additional numerical inputs
    patient_data['age'] = _prompt_until_valid(
        "Patient age: ", _parse_age,
        "Invalid input. Please enter a non-negative whole number.")
    patient_data['urine_ph'] = _prompt_until_valid("Urine pH level: ", float, number_hint)
    patient_data['wbc'] = _prompt_until_valid("White Blood Cell count: ", float, number_hint)
    patient_data['rbc'] = _prompt_until_valid("Red Blood Cell count: ", float, number_hint)

    # Categorical inputs
    patient_data['gender'] = _prompt_until_valid(
        "Gender (M/F): ", _parse_gender, "Invalid input. Please enter M or F.")
    patient_data['diabetes'] = ask_binary("Diabetes (0/1): ")
    patient_data['hypertension'] = ask_binary("Hypertension (0/1): ")
    patient_data['bacteria'] = ask_binary("Bacteria present (0/1): ")

    # Convert to DataFrame
    return pd.DataFrame([patient_data])

def run_patient_prediction():
    """Prompt for one patient, score them, and print the interpreted result.

    Nothing is collected or printed here when the model artifacts cannot be
    loaded (``load_models`` reports that failure itself).
    """
    artifacts = load_models()
    if not artifacts:
        return

    # Gather the form answers, score them, then interpret the single row
    patient_frame = collect_patient_input()
    prediction_frame = get_prediction(patient_frame, artifacts)
    summary = interpret_prediction(prediction_frame)

    verdict = 'Positive' if summary['final_prediction'] == 1 else 'Negative'
    print("\n--- UTI Prediction Result ---")
    print(f"Prediction: {verdict}")
    print(f"Confidence Level: {summary['confidence_level']}")
    print(f"Probability: {summary['probability']:.2f}")
    print(f"Recommendation: {summary['recommendation']}")

# Run the interactive prediction (comment out the following line to skip it)
run_patient_prediction()