# UTI Prediction Model Inference

This notebook implements an inference pipeline for UTI prediction:
1. Loading trained models and parameters
2. Data preprocessing functions
3. Stratified sample data generation
4. Prediction pipeline with both models, including confidence scoring and threshold application
5. Interpretation of prediction results
6. Example usage with sample data

In [1]:
import pandas as pd
import numpy as np
import joblib
# NOTE(review): StandardScaler is not referenced by name in the visible cells
# (the scaler is loaded from disk via joblib) — confirm whether this import is
# still needed before removing it.
from sklearn.preprocessing import StandardScaler
import warnings
# Blanket suppression: every warning category is silenced for the rest of the
# session, including deprecation and numerical warnings from pandas/numpy.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's legacy global RNG. The sampling calls in this notebook pass
# random_state=42 explicitly, so they do not depend on this global seed.
np.random.seed(42)

## 1. Load Trained Models and Parameters

In [2]:
def load_models():
    """Read the persisted model artifacts from ``../models`` and bundle them.

    Returns:
        dict | None: On success, a dict with the keys ``'rf_model'``,
        ``'xgb_model'``, ``'scaler'`` (each loaded with ``joblib.load``) and
        ``'thresholds'`` (hard-coded decision thresholds). If anything raises
        while loading, the error message is printed and ``None`` is returned.
    """
    # Artifacts are loaded in this order: random forest, XGBoost, scaler.
    artifact_paths = {
        'rf_model': '../models/rf_model.joblib',
        'xgb_model': '../models/xgb_model.joblib',
        'scaler': '../models/scaler.joblib',
    }

    try:
        artifacts = {key: joblib.load(path) for key, path in artifact_paths.items()}
    except Exception as err:
        print(f"Error loading models: {str(err)}")
        return None

    # Thresholds are set below 0.5 on purpose: for a medical screen the
    # notebook favours sensitivity over specificity.
    artifacts['thresholds'] = {
        'rf_threshold': 0.3,
        'xgb_threshold': 0.3,
    }
    return artifacts

## 2. Data Preprocessing Functions

In [3]:
def preprocess_data(data, scaler):
    """Turn raw patient records into the numeric feature matrix fed to the models.

    Steps, in order:
      1. Copy ``data`` (the caller's frame is never modified).
      2. Create any expected feature column that is absent, filled with 0.
      3. Coerce the binary symptom/lab flags to integers clipped to {0, 1};
         missing flag values are treated as 0, matching the absent-column default.
      4. Encode ``gender`` as 'M' -> 0, 'F' -> 1, anything else -> 0.
      5. Coerce the numerical columns to float and impute unparseable/missing
         entries with the mean of the valid entries in the same column
         (0.0 when the column has no valid entry at all).
      6. Reorder the columns to ``feature_order`` and apply ``scaler.transform``
         to the numerical columns only.

    Args:
        data (pd.DataFrame): Raw records, one row per patient.
        scaler: Object exposing ``transform``; it is called on the four
            numerical columns (``age``, ``urine_ph``, ``wbc``, ``rbc``).

    Returns:
        np.ndarray: 2-D array with one row per input record and 18 columns in
        ``feature_order``.

    Raises:
        ValueError: If a binary flag column contains a value that cannot be
            parsed as a number (raised by ``pd.to_numeric``).
    """
    feature_order = [
        'age', 'urine_ph', 'wbc', 'rbc',
        'frequent_urination', 'painful_urination', 'lower_abdominal_pain',
        'cloudy_urine', 'blood_in_urine', 'fever', 'urgent_urination',
        'foul_smelling_urine', 'nitrites', 'leukocyte_esterase',
        'gender', 'diabetes', 'hypertension', 'bacteria'
    ]
    categorical_features = [
        'frequent_urination', 'painful_urination', 'lower_abdominal_pain',
        'cloudy_urine', 'blood_in_urine', 'fever', 'urgent_urination',
        'foul_smelling_urine', 'nitrites', 'leukocyte_esterase',
        'diabetes', 'hypertension', 'bacteria'
    ]
    numerical_features = ['age', 'urine_ph', 'wbc', 'rbc']

    processed_data = data.copy()

    # Absent columns are created with a neutral 0 so the matrix shape is fixed.
    for feature in feature_order:
        if feature not in processed_data.columns:
            processed_data[feature] = 0

    for feature in categorical_features:
        # Parse first, then fill: a NaN flag would make a direct astype(int)
        # raise. Unparseable strings still raise rather than being hidden.
        flags = pd.to_numeric(processed_data[feature]).fillna(0)
        processed_data[feature] = flags.astype(int).clip(0, 1)

    # NOTE(review): any value other than the strings 'M'/'F' (including an
    # already-encoded 0/1) maps to 0 here — confirm this matches the encoding
    # used at training time.
    processed_data['gender'] = (
        processed_data['gender'].map({'M': 0, 'F': 1}).fillna(0).astype(int)
    )

    for feature in numerical_features:
        numeric = pd.to_numeric(processed_data[feature], errors='coerce').astype(float)
        # The mean must come from the coerced values: taking it from the raw
        # column fails as soon as that column contains a non-numeric string.
        fill_value = numeric.mean()
        if pd.isna(fill_value):
            # No valid entry in this batch; fall back to the absent-column default.
            fill_value = 0.0
        processed_data[feature] = numeric.fillna(fill_value)

    processed_data = processed_data[feature_order]

    processed_data[numerical_features] = scaler.transform(processed_data[numerical_features])

    return processed_data.values

## 3. Sample Data Generation

In [4]:
def stratified_sample(data, n_samples=15):
    """
    Draw a sample that covers the different rule-based UTI risk scenarios.

    Each row is assigned one of four risk categories by ``categorize_risk``
    and the requested sample size is split 30/30/20/20 across
    high / moderate / low / no risk. Any shortfall caused by integer
    truncation is added to the no-risk quota. Rows are drawn per category
    with ``random_state=42``; a category with fewer rows than its quota is
    sampled with replacement.

    The input frame is not modified. A category that has no rows at all is
    skipped, so the result can contain fewer than ``n_samples`` rows.

    Args:
        data (pd.DataFrame): Original dataset. Must contain the columns read by
            ``categorize_risk`` (bacteria, frequent_urination,
            painful_urination, fever, leukocyte_esterase, nitrites,
            lower_abdominal_pain, cloudy_urine).
        n_samples (int): Total number of samples to generate

    Returns:
        pd.DataFrame: Sampled rows with a fresh 0..k-1 index and no
        ``risk_category`` column.
    """
    def categorize_risk(row):
        # High-risk criteria: multiple strong indicators of UTI
        if (row['bacteria'] == 1 and
            ((row['frequent_urination'] == 1) or (row['painful_urination'] == 1)) and
            (row['fever'] == 1 or row['leukocyte_esterase'] == 1)):
            return 'high_risk_uti'

        # Moderate risk: potential UTI indicators
        elif ((row['leukocyte_esterase'] == 1 or row['nitrites'] == 1) and
              (row['frequent_urination'] == 1 or row['painful_urination'] == 1)):
            return 'moderate_risk_potential_uti'

        # Low risk with some symptoms
        elif sum([row['frequent_urination'], row['painful_urination'],
                  row['lower_abdominal_pain'], row['cloudy_urine']]) >= 2:
            return 'low_risk_some_symptoms'

        # Very low risk or no UTI indicators
        else:
            return 'no_risk_low_symptoms'

    # Nothing to categorise or sample from.
    if data.empty:
        return data.drop(columns=['risk_category'], errors='ignore').copy()

    # Keep the labels in a separate Series instead of writing a temporary
    # column into ``data``: that would leak the column into the caller's frame.
    risk_category = data.apply(categorize_risk, axis=1)

    # Quotas per category; int() truncates, so the total may fall short.
    category_samples = {
        'high_risk_uti': int(n_samples * 0.3),  # 30% high-risk cases
        'moderate_risk_potential_uti': int(n_samples * 0.3),  # 30% moderate-risk cases
        'low_risk_some_symptoms': int(n_samples * 0.2),  # 20% low-risk cases
        'no_risk_low_symptoms': int(n_samples * 0.2)  # 20% no-risk cases
    }

    # Give the truncation remainder to the no-risk bucket.
    total_samples = sum(category_samples.values())
    if total_samples < n_samples:
        category_samples['no_risk_low_symptoms'] += (n_samples - total_samples)

    samples = []
    for category, count in category_samples.items():
        category_data = data[risk_category == category]

        # Sampling from an empty frame raises even with replace=True, so a
        # category with no rows (or a zero quota) contributes nothing.
        if count <= 0 or category_data.empty:
            continue

        # Too few rows for the quota: draw with replacement.
        needs_replacement = len(category_data) < count
        samples.append(
            category_data.sample(n=count, replace=needs_replacement, random_state=42)
        )

    if not samples:
        return data.iloc[0:0].drop(columns=['risk_category'], errors='ignore')

    diverse_sample = pd.concat(samples, ignore_index=True)

    # A pre-existing 'risk_category' column in the input is not part of the output.
    return diverse_sample.drop(columns=['risk_category'], errors='ignore')

## 4. Prediction Pipeline

In [5]:
def get_prediction(data, model_artifacts):
    """Score records with both models and combine them into an ensemble result.

    Args:
        data (pd.DataFrame): Raw records, one row per patient. Passed through
            ``preprocess_data`` for the models and also read directly for the
            six risk-indicator flags.
        model_artifacts (dict): Must provide ``'rf_model'`` and ``'xgb_model'``
            (each exposing ``predict_proba``), ``'scaler'`` and ``'thresholds'``
            (a dict; ``'rf_threshold'`` / ``'xgb_threshold'`` default to 0.2
            when absent).

    Returns:
        pd.DataFrame: Indexed like ``data``, with the columns
        ``rf_prediction``, ``rf_probability``, ``xgb_prediction``,
        ``xgb_probability``, ``ensemble_probability``, ``confidence_score``,
        ``models_agree`` and ``risk_indicators``. ``ensemble_probability`` and
        ``confidence_score`` are bounded to [0, 1].
    """
    processed_data = preprocess_data(data, model_artifacts['scaler'])

    rf_proba = model_artifacts['rf_model'].predict_proba(processed_data)[:, 1]
    xgb_proba = model_artifacts['xgb_model'].predict_proba(processed_data)[:, 1]

    # Per-model decision thresholds (low values favour sensitivity).
    thresholds = model_artifacts['thresholds']
    rf_threshold = thresholds.get('rf_threshold', 0.2)
    xgb_threshold = thresholds.get('xgb_threshold', 0.2)

    rf_pred = (rf_proba >= rf_threshold).astype(int)
    xgb_pred = (xgb_proba >= xgb_threshold).astype(int)

    # Count of positive risk flags per record. Absent columns and missing
    # values count as 0 so this step tolerates the same inputs that
    # preprocess_data does instead of raising KeyError / propagating NaN.
    risk_columns = ['bacteria', 'nitrites', 'leukocyte_esterase',
                    'painful_urination', 'frequent_urination', 'fever']
    risk_indicators = (
        data.reindex(columns=risk_columns, fill_value=0)
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .sum(axis=1)
        .to_numpy()
    )

    # Each positive flag boosts the model probabilities by one third.
    risk_weights = 1 + (risk_indicators / 3)

    # The boost can push the weighted mean past 1 (weights reach 3 with all
    # six flags set), so clip to keep the value a valid probability.
    ensemble_proba = np.clip(
        np.mean([rf_proba * risk_weights, xgb_proba * risk_weights], axis=0),
        0.0,
        1.0,
    )

    prediction_agreement = (rf_pred == xgb_pred).astype(int)

    # Agreeing models: distance from 0.5, amplified by the risk flags.
    # Disagreeing models: the smaller side of the probability.
    # Clipped because the amplification factor can reach 4.
    confidence_score = np.clip(
        np.where(
            prediction_agreement,
            np.maximum(ensemble_proba, 1 - ensemble_proba) * (1 + risk_indicators / 2),
            np.minimum(ensemble_proba, 1 - ensemble_proba),
        ),
        0.0,
        1.0,
    )

    # Index explicitly by the input's index so rows can be matched back to ``data``.
    results = pd.DataFrame({
        'rf_prediction': rf_pred,
        'rf_probability': rf_proba,
        'xgb_prediction': xgb_pred,
        'xgb_probability': xgb_proba,
        'ensemble_probability': ensemble_proba,
        'confidence_score': confidence_score,
        'models_agree': prediction_agreement,
        'risk_indicators': risk_indicators
    }, index=data.index)

    return results

## 5. Results Interpretation

In [6]:
def interpret_prediction(prediction_results):
    """Translate each row of prediction output into a labelled recommendation.

    Args:
        prediction_results (pd.DataFrame): Must contain the columns
            ``ensemble_probability``, ``confidence_score``, ``models_agree``
            and ``risk_indicators``.

    Returns:
        list[dict]: One dict per row, in row order, with the keys
        ``final_prediction`` (1 when the ensemble probability is at least 0.2,
        else 0), ``confidence_level`` (derived from the risk-indicator count),
        ``model_agreement``, ``probability``, ``confidence_score``,
        ``risk_indicators`` and ``recommendation``.
    """
    # Advice for positive predictions, keyed by confidence level; levels not
    # listed here share the generic positive message.
    positive_advice = {
        'very high': "High risk of UTI. Immediate medical consultation strongly recommended.",
        'high': "Moderate to high probability of UTI. Seek medical evaluation within 12 hours.",
    }
    generic_positive = "Possible UTI. Consult healthcare provider and monitor symptoms closely."
    negative_advice = "Low UTI risk, but do not ignore persistent or worsening symptoms."

    def level_for(indicator_count):
        # Map the number of positive risk flags to a confidence label.
        if indicator_count >= 3:
            return 'very high'
        if indicator_count == 2:
            return 'high'
        if indicator_count == 1:
            return 'medium'
        return 'low'

    summaries = []
    for _, record in prediction_results.iterrows():
        probability = record['ensemble_probability']
        indicator_count = record['risk_indicators']

        # Deliberately low cut-off: favour flagging a possible UTI.
        predicted_positive = int(probability >= 0.2)
        level = level_for(indicator_count)

        if predicted_positive == 1:
            advice = positive_advice.get(level, generic_positive)
        else:
            advice = negative_advice

        agreement_label = 'agreed' if record['models_agree'] else 'disagreed'

        summaries.append({
            'final_prediction': predicted_positive,
            'confidence_level': level,
            'model_agreement': agreement_label,
            'probability': probability,
            'confidence_score': record['confidence_score'],
            'risk_indicators': indicator_count,
            'recommendation': advice,
        })

    return summaries

## 6. Example Usage

In [7]:
# End-to-end demo: load the dataset and model artifacts, score a small sample,
# and print a human-readable interpretation for each sampled record.
try:
    # Load the synthetic dataset
    data = pd.read_csv('../data/uti_synthetic_data.csv')

    # Load models and artifacts (returns None when loading fails)
    model_artifacts = load_models()

    if model_artifacts:
        # Draw up to five rows uniformly at random (seeded for reproducibility).
        # This is a plain random draw, not a stratified one, so the label
        # printed below says so.
        sample_data = data.sample(n=min(5, len(data)), random_state=42)
        print("\nRandom Sample Data:")
        print(sample_data)

        # Get predictions
        predictions = get_prediction(sample_data, model_artifacts)
        print("\nPrediction Results:")
        print(predictions)

        # Interpret results for each sample
        print("\nInterpretations:")
        interpretations = interpret_prediction(predictions)

        for i, interpretation in enumerate(interpretations, 1):
            print(f"\nSample {i}:")
            print(f"Prediction: {'Positive' if interpretation['final_prediction'] == 1 else 'Negative'}")
            print(f"Confidence Level: {interpretation['confidence_level']}")
            print(f"Model Agreement: {interpretation['model_agreement']}")
            print(f"Probability: {interpretation['probability']:.2f}")
            print(f"Confidence Score: {interpretation['confidence_score']:.2f}")
            print(f"Recommendation: {interpretation['recommendation']}")
    else:
        # Make the skipped run visible instead of ending the cell silently.
        print("Models could not be loaded; skipping predictions.")

except Exception as e:
    print(f"An error occurred during execution: {str(e)}")


Stratified Sample Data:
       frequent_urination  painful_urination  lower_abdominal_pain  \
75721                   0                  0                     0   
80184                   0                  0                     0   
19864                   0                  1                     1   
76699                   0                  0                     0   
92991                   0                  0                     1   

       cloudy_urine  blood_in_urine  fever  urgent_urination  \
75721             0               0      1                 0   
80184             0               0      0                 0   
19864             0               1      0                 1   
76699             0               0      0                 0   
92991             0               0      0                 0   

       foul_smelling_urine  nitrites  leukocyte_esterase  urine_ph        age  \
75721                    0         0                   0  5.468306  28.962950   
80184  