In [None]:
# Note: 1. Make sure the 4 required files are downloaded into the environment.
#       2. Run the cell that imports all the needed libraries.
#       3. Don't enter negative values or values less than 1 for (Glucose, BloodPressure, SkinThickness, Insulin, BMI): such values are not physiologically plausible. The function can still handle them, but realistic values give better predictions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
import lightgbm as lgb
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, confusion_matrix, accuracy_score, roc_curve

In [2]:
def _load_artifact(path, label, consequence):
    """
    Load one joblib artifact, reporting failures instead of raising.

    Args:
        path (str): Location of the .joblib file.
        label (str): Human-readable artifact name used in the messages (e.g. 'Scaler').
        consequence (str): Sentence appended to the "not found" message.

    Returns:
        The unpickled object, or None if the file is missing or cannot be loaded
        (an error message is printed in that case).
    """
    # NOTE: joblib.load unpickles arbitrary Python objects and can execute code
    # while doing so -- only point this at artifacts from a trusted source.
    try:
        return joblib.load(path)
    except FileNotFoundError:
        print(f"Error: {label} file '{path}' not found. {consequence}")
    except Exception as e:
        print(f"Error loading {label.lower()}: {e}")
    return None


def _classify_bmi(bmi):
    """Map a BMI value to its category label ('Unknown' when the value is NaN)."""
    if pd.isna(bmi):
        return 'Unknown'
    # Each test only runs when the previous bound was not met, so explicit
    # lower bounds are unnecessary.
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    if bmi < 35:
        return 'Obese_Class_I'
    if bmi < 40:
        return 'Obese_Class_II'
    return 'Obese_Class_III'


def _impute_missing(processed_data, imputer):
    """
    Flag zeros in the physiological columns as missing and fill them with the fitted imputer.

    Modifies `processed_data` in place (callers pass a copy) and returns it.
    """
    target_columns_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    for col in target_columns_impute:
        flag_col_name = f'Is_{col}_Missing'
        if col in processed_data.columns:
            # The flag must be computed before the zeros are turned into NaN.
            processed_data[flag_col_name] = (processed_data[col] == 0).astype(int)
            processed_data[col] = processed_data[col].replace(0, np.nan)
        else:
            print(f"Warning: Imputation column '{col}' not found in input data. Adding '{flag_col_name}' flag with zeros.")
            processed_data[flag_col_name] = 0

    imputer_features = list(imputer.feature_names_in_)
    numeric_columns = set(processed_data.select_dtypes(include=np.number).columns)

    if not any(col in numeric_columns for col in imputer_features):
        print("Warning: No columns found in input data that were targeted for imputation during training.")
        return processed_data

    present = [col for col in imputer_features if col in processed_data.columns]
    absent = [col for col in imputer_features if col not in processed_data.columns]
    if absent:
        print(f"Warning: Column(s) {absent} expected by the imputer are missing from the input data; "
              "treating them as entirely missing during imputation.")

    # reindex() hands the imputer exactly the columns (and order) it was fitted on.
    # Columns absent from the input become all-NaN instead of raising KeyError,
    # which is what indexing with the full feature list used to do.
    imputer_input = processed_data.reindex(columns=imputer_features)
    imputed = pd.DataFrame(
        imputer.transform(imputer_input),
        columns=imputer_features,
        index=processed_data.index,
    )

    # Only write back columns the input really had, so the "column not found"
    # fallbacks in feature engineering still apply to the absent ones.
    processed_data[present] = imputed[present]
    return processed_data


def _engineer_features(processed_data):
    """
    Add the derived features (and the BMI_Category label) used by the model.

    Modifies `processed_data` in place and returns it. When a source column is
    absent, the derived column is added with a default value and a warning is printed.
    """
    EPSILON = 1e-6
    GLUCOSE_CRITICAL_CUTOFF = 126
    MIN_PHYSIOLOGICAL_INSULIN = 1.0

    def has(*names):
        return all(name in processed_data.columns for name in names)

    # Clamp insulin (imputation or raw input may yield values below the minimum).
    # clip() leaves NaN untouched, like the previous row-wise max().
    if has('Insulin'):
        processed_data['Insulin'] = processed_data['Insulin'].clip(lower=MIN_PHYSIOLOGICAL_INSULIN)

    # A. Log_DPF (zeros are replaced by EPSILON so the log is defined)
    if has('DiabetesPedigreeFunction'):
        processed_data['Log_DPF'] = np.log(processed_data['DiabetesPedigreeFunction'].replace(0, EPSILON))
    else:
        print("Warning: 'DiabetesPedigreeFunction' column not found for Log_DPF. Adding column with default.")
        processed_data['Log_DPF'] = 0

    # B. Glucose_to_Insulin_Ratio
    if has('Glucose', 'Insulin'):
        processed_data['Glucose_to_Insulin_Ratio'] = processed_data['Glucose'] / (processed_data['Insulin'] + EPSILON)
    else:
        print("Warning: 'Glucose' or 'Insulin' column not found for Glucose_to_Insulin_Ratio. Adding column with default.")
        processed_data['Glucose_to_Insulin_Ratio'] = 0

    # C. Age_BMI_Interaction
    if has('Age', 'BMI'):
        processed_data['Age_BMI_Interaction'] = processed_data['Age'] * processed_data['BMI']
    else:
        print("Warning: 'Age' or 'BMI' column not found for Age_BMI_Interaction. Adding column with default.")
        processed_data['Age_BMI_Interaction'] = 0

    # D. Sqrt_Insulin (clipped at 0 so the square root is defined)
    if has('Insulin'):
        processed_data['Sqrt_Insulin'] = np.sqrt(processed_data['Insulin'].clip(lower=0))
    else:
        print("Warning: 'Insulin' column not found for Sqrt_Insulin. Adding column with default.")
        processed_data['Sqrt_Insulin'] = 0

    # E. Sqrt_Pregnancies
    if has('Pregnancies'):
        processed_data['Sqrt_Pregnancies'] = np.sqrt(processed_data['Pregnancies'].clip(lower=0))
    else:
        print("Warning: 'Pregnancies' column not found for Sqrt_Pregnancies. Adding column with default.")
        processed_data['Sqrt_Pregnancies'] = 0

    # F. BP_Age_Index
    if has('BloodPressure', 'Age'):
        processed_data['BP_Age_Index'] = processed_data['BloodPressure'] / (processed_data['Age'] + EPSILON)
    else:
        print("Warning: 'BloodPressure' or 'Age' column not found for BP_Age_Index. Adding column with default.")
        processed_data['BP_Age_Index'] = 0

    # G. Skin_BMI_Ratio
    if has('SkinThickness', 'BMI'):
        processed_data['Skin_BMI_Ratio'] = processed_data['SkinThickness'] / (processed_data['BMI'] + EPSILON)
    else:
        print("Warning: 'SkinThickness' or 'BMI' column not found for Skin_BMI_Ratio. Adding column with default.")
        processed_data['Skin_BMI_Ratio'] = 0

    # H. Is_Glucose_Critical
    if has('Glucose'):
        processed_data['Is_Glucose_Critical'] = (processed_data['Glucose'] >= GLUCOSE_CRITICAL_CUTOFF).astype(int)
    else:
        print("Warning: 'Glucose' column not found for Is_Glucose_Critical. Adding column with default.")
        processed_data['Is_Glucose_Critical'] = 0

    # I. BMI_Category
    if has('BMI'):
        processed_data['BMI_Category'] = processed_data['BMI'].apply(_classify_bmi)
    else:
        print("Warning: 'BMI' column not found for BMI_Category. Adding column with default.")
        processed_data['BMI_Category'] = 'Unknown'

    return processed_data


def _align_and_scale(processed_data, scaler, training_columns):
    """One-hot encode BMI_Category, align columns to the training layout, and scale the numeric ones."""
    training_columns = list(training_columns)

    encoded = pd.get_dummies(processed_data, columns=['BMI_Category'], drop_first=False)

    # One step instead of filter / add-missing / reorder: extra columns are dropped,
    # columns the input lacks are created filled with 0, order follows training.
    aligned = encoded.reindex(columns=training_columns, fill_value=0)

    # Binary flags ('Is_*') and one-hot columns ('BMI_*') are passed through unscaled.
    numerical_cols_for_scaling = [
        col for col in training_columns if not col.startswith(('Is_', 'BMI_'))
    ]
    scaled_numerical_df = pd.DataFrame(
        scaler.transform(aligned[numerical_cols_for_scaling]),
        columns=numerical_cols_for_scaling,
        index=aligned.index,
    )
    passthrough_df = aligned.drop(columns=numerical_cols_for_scaling)

    return pd.concat([scaled_numerical_df, passthrough_df], axis=1)[training_columns]


def preprocess_input_data(file_path, model_path='random_forest_model.joblib', threshold=0.40):
    """
    Preprocesses raw input data from a CSV file and predicts diabetes for each row.

    Loads the fitted scaler ('standard_scaler.joblib'), imputer
    ('iterative_imputer.joblib') and training column list
    ('training_columns.joblib') from the working directory, then performs
    imputation (zeros in the physiological columns are treated as missing),
    feature engineering, one-hot encoding of the BMI category, column alignment
    and scaling, and finally applies the model with the given threshold.

    Args:
        file_path (str): The path to the CSV file containing the input data.
                         Expected columns: 'Pregnancies', 'Glucose', 'BloodPressure',
                                           'SkinThickness', 'Insulin', 'BMI', 'Age',
                                           'DiabetesPedigreeFunction'.
                         Values of 0 in Glucose, BloodPressure, SkinThickness,
                                   Insulin, BMI are treated as missing and imputed.
                         A missing expected column does not abort the run: a warning
                         is printed and default values are used for what depends on it.
        model_path (str, optional): The path to the saved joblib file of the trained model.
                                  Defaults to 'random_forest_model.joblib'.
        threshold (float, optional): The prediction probability threshold (0 to 1).
                                     Defaults to 0.40.

    Returns:
        tuple: A tuple containing:
               - numpy.ndarray: The binary prediction (0 or 1) for each row.
               - numpy.ndarray: The prediction probability (0 to 1) for the positive class (1) for each row.
               Returns (None, None), after printing an error message, if the input file or
               any required artifact cannot be loaded, if the model has no predict_proba
               method, or if preprocessing or prediction fails.
    """
    # Load data from the CSV file
    try:
        input_data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at {file_path}")
        return None, None
    except Exception as e:
        print(f"Error loading input file {file_path}: {e}")
        return None, None

    # Load every artifact up front so a missing or unusable one is reported
    # before any preprocessing work is done.
    scaler = _load_artifact('standard_scaler.joblib', 'Scaler', 'Cannot preprocess data.')
    if scaler is None:
        return None, None

    imputer = _load_artifact('iterative_imputer.joblib', 'Imputer', 'Cannot preprocess data.')
    if imputer is None:
        return None, None

    training_columns = _load_artifact('training_columns.joblib', 'Training columns', 'Cannot preprocess data.')
    if training_columns is None:
        return None, None

    model = _load_artifact(model_path, 'Model', 'Cannot make predictions.')
    if model is None:
        return None, None

    if not hasattr(model, 'predict_proba'):
        print("Error: Loaded model does not have a predict_proba method for thresholding.")
        if hasattr(model, 'decision_function'):
            # decision_function scores are not probabilities, so the probability
            # threshold cannot be applied to them; nothing is computed here.
            print("Note: the model exposes decision_function, but its scores are not "
                  "probabilities, so the probability threshold cannot be applied.")
        return None, None

    # Work on a copy so the frame read from disk is never modified.
    try:
        processed_data = _impute_missing(input_data.copy(), imputer)
        processed_data = _engineer_features(processed_data)
        X_processed = _align_and_scale(processed_data, scaler, training_columns)
    except Exception as e:
        # Honour the documented contract: processing failures are reported, not raised.
        print(f"Error during preprocessing: {type(e).__name__}: {e}")
        return None, None

    try:
        prediction_proba = model.predict_proba(X_processed)[:, 1]
    except Exception as e:
        print(f"Error during prediction: {type(e).__name__}: {e}")
        return None, None

    # Apply the threshold to get the binary prediction
    binary_prediction = (prediction_proba >= threshold).astype(int)

    return binary_prediction, prediction_proba

In [3]:
# Example usage (requires the saved .joblib artifacts in the working directory).
# The settings are defined once so the printed labels always match what is
# actually passed to preprocess_input_data.
EXAMPLE_MODEL_PATH = 'random_forest_model.joblib'
EXAMPLE_THRESHOLD = 0.40

# --- Enter the required input data here ---
# Zeros in Glucose, BloodPressure, SkinThickness, Insulin and BMI are treated as
# missing by preprocess_input_data; the third row is all zeros in those columns
# to exercise imputation.
dummy_data = {
    'Pregnancies': [2, 5, 0],
    'Glucose': [100, 150, 0],
    'BloodPressure': [70, 80, 0],
    'SkinThickness': [25, 30, 0],
    'Insulin': [50, 0, 0],  # Include a zero for imputation test
    'BMI': [30, 35, 0],
    'DiabetesPedigreeFunction': [0.5, 0.6, 0.2],
    'Age': [40, 50, 22]
}

# Write the sample rows to a CSV, since the function reads its input from a file
dummy_df = pd.DataFrame(dummy_data)
dummy_csv_path = 'sample_input_data.csv'
dummy_df.to_csv(dummy_csv_path, index=False)

# The function loads the scaler, imputer, and model internally
predictions, probabilities = preprocess_input_data(
    dummy_csv_path, model_path=EXAMPLE_MODEL_PATH, threshold=EXAMPLE_THRESHOLD
)

# Display the results
if predictions is not None and probabilities is not None:
    print(f"\nPredictions for data from '{dummy_csv_path}':")
    print(f"Binary Predictions (Threshold={EXAMPLE_THRESHOLD:.2f}): {predictions}")
    print(f"Prediction Probabilities (Diabetic): {probabilities}")
else:
    print("\nPreprocessing or prediction failed.")


# Test with the logically healthy and sick individuals.
# NOTE: combined_csv_path is only defined by a later cell, so this branch depends on
# kernel state: after a fresh "Restart & Run All" it is skipped when this cell runs.
existing_combined_csv_path = globals().get('combined_csv_path')
if existing_combined_csv_path:
    print(f"\nTesting with combined sample data from '{existing_combined_csv_path}':")
    predictions_combined, probabilities_combined = preprocess_input_data(
        existing_combined_csv_path, model_path=EXAMPLE_MODEL_PATH, threshold=EXAMPLE_THRESHOLD
    )

    if predictions_combined is not None and probabilities_combined is not None:
        print(f"Binary Predictions (Threshold={EXAMPLE_THRESHOLD:.2f}): {predictions_combined}")
        print(f"Prediction Probabilities (Diabetic): {probabilities_combined}")
    else:
        print("Preprocessing or prediction for combined data failed.")
else:
    print("\nCombined sample data CSV path not found. Skipping test with combined data.")


Predictions for data from 'sample_input_data.csv':
Binary Predictions (Threshold=0.40): [0 1 1]
Prediction Probabilities (Diabetic): [0.21554671 0.7885325  0.41635334]

Combined sample data CSV path not found. Skipping test with combined data.


In [5]:
import pandas as pd

# Threshold used for this check, defined once so the printed label always matches
# the value actually passed to preprocess_input_data.
COMBINED_THRESHOLD = 0.40

# Define data for a logically healthy individual
healthy_data = {
    'Pregnancies': 0,
    'Glucose': 80,
    'BloodPressure': 60,
    'SkinThickness': 20,
    'Insulin': 50,
    'BMI': 22,
    'DiabetesPedigreeFunction': 0.1,
    'Age': 25
}

# Define data for a logically sick individual
sick_data = {
    'Pregnancies': 5,
    'Glucose': 180,
    'BloodPressure': 90,
    'SkinThickness': 40,
    'Insulin': 250,
    'BMI': 40,
    'DiabetesPedigreeFunction': 0.8,
    'Age': 55
}

# Combine the data for both individuals into a list (one dict per row)
combined_data = [healthy_data, sick_data]

# Create a pandas DataFrame from the combined data
combined_df = pd.DataFrame(combined_data)

print("Created a DataFrame with one healthy and one sick individual:")
display(combined_df)

# preprocess_input_data reads its input from a CSV file, so save the frame first
combined_csv_path = 'combined_sample_data.csv'
combined_df.to_csv(combined_csv_path, index=False)

# Run the whole pipeline (preprocessing + prediction) on the two individuals
predictions_combined, probabilities_combined = preprocess_input_data(
    combined_csv_path, threshold=COMBINED_THRESHOLD
)

print("\nPredictions Combined Data:")
if predictions_combined is not None and probabilities_combined is not None:
    print(f"Binary Predictions (Threshold={COMBINED_THRESHOLD:.2f}): {predictions_combined}")
    print(f"Prediction Probabilities (Diabetic): {probabilities_combined}")
else:
    print("Preprocessing or prediction for combined data failed.")

# The model is loaded here only so it is available for direct inspection; the
# predictions above already come from preprocess_input_data, which loads the model
# itself. To call the model directly you would need the processed feature frame,
# which preprocess_input_data does not return.
# NOTE: unlike preprocess_input_data, this load is unguarded and raises if the
# model file is missing.
loaded_model = joblib.load('random_forest_model.joblib')

Created a DataFrame with one healthy and one sick individual:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,80,60,20,50,22,0.1,25
1,5,180,90,40,250,40,0.8,55



Predictions Combined Data:
Binary Predictions (Threshold=0.40): [0 1]
Prediction Probabilities (Diabetic): [0.02392885 0.84824353]


In [None]:
# Here we wanted to confirm that the model also predicts correctly on clear-cut cases, where the first individual is healthy and the second is diabetic.
# The model did predict both correctly.
# 0 means healthy and 1 means unhealthy (diabetic).