In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/1651277648862_healthinsurance.csv')

In [3]:
df

Unnamed: 0,age,sex,weight,bmi,hereditary_diseases,no_of_dependents,smoker,city,bloodpressure,diabetes,regular_ex,job_title,claim
0,60.0,male,64,24.3,NoDisease,1,0,NewYork,72,0,0,Actor,13112.6
1,49.0,female,75,22.6,NoDisease,1,0,Boston,78,1,1,Engineer,9567.0
2,32.0,female,64,17.8,Epilepsy,2,1,Phildelphia,88,1,1,Academician,32734.2
3,61.0,female,53,36.4,NoDisease,1,1,Pittsburg,72,1,0,Chef,48517.6
4,19.0,female,50,20.6,NoDisease,0,0,Buffalo,82,1,0,HomeMakers,1731.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,39.0,male,49,28.3,NoDisease,1,1,Florence,54,1,0,FilmMaker,21082.2
14996,39.0,male,74,29.6,NoDisease,4,0,Miami,64,1,0,Student,7512.3
14997,20.0,male,62,33.3,NoDisease,0,0,Tampa,52,1,0,FashionDesigner,1391.5
14998,52.0,male,88,36.7,NoDisease,0,0,PanamaCity,70,1,0,Farmer,9144.6


**Preprocessing**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  14604 non-null  float64
 1   sex                  15000 non-null  object 
 2   weight               15000 non-null  int64  
 3   bmi                  14044 non-null  float64
 4   hereditary_diseases  15000 non-null  object 
 5   no_of_dependents     15000 non-null  int64  
 6   smoker               15000 non-null  int64  
 7   city                 15000 non-null  object 
 8   bloodpressure        15000 non-null  int64  
 9   diabetes             15000 non-null  int64  
 10  regular_ex           15000 non-null  int64  
 11  job_title            15000 non-null  object 
 12  claim                15000 non-null  float64
dtypes: float64(3), int64(6), object(4)
memory usage: 1.5+ MB


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [6]:
class AdvancedClaimPredictor:
    def __init__(self, data_path=None, df=None):
        """
        Initialize the claim predictor with advanced preprocessing and modeling capabilities
        """
        if df is not None:
            self.df = df.copy()
        else:
            self.df = pd.read_csv(data_path)

        self.scaler = RobustScaler()  # More robust to outliers than StandardScaler
        self.label_encoders = {}
        self.feature_columns = None
        self.model = None
        self.ensemble_models = []

    def advanced_preprocessing(self):
        """
        Advanced preprocessing with missing value handling, outlier detection, and feature engineering
        """
        print("Starting advanced preprocessing...")

        # Handle missing values intelligently
        # Age: Fill with median based on job_title and city
        for job in self.df['job_title'].unique():
            for city in self.df['city'].unique():
                mask = (self.df['job_title'] == job) & (self.df['city'] == city)
                if mask.sum() > 0:
                    median_age = self.df.loc[mask, 'age'].median()
                    self.df.loc[mask & self.df['age'].isna(), 'age'] = median_age

        # Fill remaining age nulls with overall median
        self.df['age'].fillna(self.df['age'].median(), inplace=True)

        # BMI: Calculate from weight if possible, otherwise use median by age group
        # Assuming average height for BMI calculation where missing
        self.df['age_group'] = pd.cut(self.df['age'], bins=[0, 25, 35, 45, 55, 100],
                                     labels=['Young', 'Adult', 'Middle', 'Senior', 'Elder'])

        for age_group in self.df['age_group'].unique():
            mask = self.df['age_group'] == age_group
            median_bmi = self.df.loc[mask, 'bmi'].median()
            self.df.loc[mask & self.df['bmi'].isna(), 'bmi'] = median_bmi

        # Advanced Feature Engineering
        self.df['bmi_category'] = pd.cut(self.df['bmi'],
                                        bins=[0, 18.5, 25, 30, 40, 100],
                                        labels=['Underweight', 'Normal', 'Overweight', 'Obese', 'Extremely_Obese'])

        self.df['age_bmi_interaction'] = self.df['age'] * self.df['bmi']
        self.df['risk_score'] = (self.df['smoker'] * 2 +
                                self.df['diabetes'] * 3 +
                                self.df['bloodpressure'] / 100 +
                                (self.df['bmi'] > 30).astype(int) * 2)

        self.df['health_index'] = (self.df['regular_ex'] * 2 -
                                  self.df['smoker'] * 3 -
                                  self.df['diabetes'] * 2)

        self.df['weight_height_ratio'] = self.df['weight'] / (self.df['bmi'] / 25)  # Estimated height factor

        # Encode categorical variables with proper handling
        categorical_cols = ['sex', 'hereditary_diseases', 'city', 'job_title', 'bmi_category', 'age_group']

        for col in categorical_cols:
            le = LabelEncoder()
            self.df[f'{col}_encoded'] = le.fit_transform(self.df[col].astype(str))
            self.label_encoders[col] = le

        # Outlier detection and handling for claim amounts
        isolation_forest = IsolationForest(contamination=0.1, random_state=42)
        outliers = isolation_forest.fit_predict(self.df[['claim']])
        self.df['is_outlier'] = outliers == -1

        # Cap extreme outliers instead of removing them
        claim_99 = self.df['claim'].quantile(0.99)
        claim_1 = self.df['claim'].quantile(0.01)
        self.df['claim_capped'] = np.clip(self.df['claim'], claim_1, claim_99)

        print(f"Preprocessing complete. Dataset shape: {self.df.shape}")
        return self.df

    def prepare_features(self):
        """
        Prepare feature matrix for training
        """
        feature_cols = [
            'age', 'weight', 'bmi', 'no_of_dependents', 'smoker',
            'bloodpressure', 'diabetes', 'regular_ex', 'age_bmi_interaction',
            'risk_score', 'health_index', 'weight_height_ratio',
            'sex_encoded', 'hereditary_diseases_encoded', 'city_encoded',
            'job_title_encoded', 'bmi_category_encoded', 'age_group_encoded'
        ]

        X = self.df[feature_cols].copy()
        y = self.df['claim_capped'].values

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        self.feature_columns = feature_cols

        return X_scaled, y

    def create_advanced_neural_network(self, input_dim):
        """
        Create an advanced neural network with attention mechanism and residual connections
        """
        inputs = layers.Input(shape=(input_dim,))

        # First residual block
        x1 = layers.Dense(256, activation='relu')(inputs)
        x1 = layers.BatchNormalization()(x1)
        x1 = layers.Dropout(0.3)(x1)

        x2 = layers.Dense(256, activation='relu')(x1)
        x2 = layers.BatchNormalization()(x2)
        x2 = layers.Dropout(0.3)(x2)

        # Residual connection
        x_res1 = layers.Add()([x1, x2])

        # Attention mechanism
        attention_weights = layers.Dense(256, activation='softmax')(x_res1)
        x_attention = layers.Multiply()([x_res1, attention_weights])

        # Second residual block
        x3 = layers.Dense(128, activation='relu')(x_attention)
        x3 = layers.BatchNormalization()(x3)
        x3 = layers.Dropout(0.2)(x3)

        x4 = layers.Dense(128, activation='relu')(x3)
        x4 = layers.BatchNormalization()(x4)
        x4 = layers.Dropout(0.2)(x4)

        # Skip connection
        x_res2 = layers.Add()([x3, x4])

        # Final layers
        x5 = layers.Dense(64, activation='relu')(x_res2)
        x5 = layers.BatchNormalization()(x5)
        x5 = layers.Dropout(0.1)(x5)

        # Multi-head output for ensemble-like behavior
        head1 = layers.Dense(32, activation='relu')(x5)
        head2 = layers.Dense(32, activation='relu')(x5)

        concat_heads = layers.Concatenate()([head1, head2])

        # Final prediction
        outputs = layers.Dense(1, activation='linear')(concat_heads)

        model = Model(inputs=inputs, outputs=outputs)

        # Custom optimizer with learning rate scheduling
        optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

        model.compile(
            optimizer=optimizer,
            loss='huber',  # More robust to outliers than MSE
            metrics=['mae', 'mse']
        )

        return model

    def train_with_cross_validation(self, X, y, n_folds=5):
        """
        Train model with cross-validation for robust performance estimation
        """
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

        # Create stratification bins for continuous target
        y_bins = pd.qcut(y, q=5, labels=False, duplicates='drop')

        cv_scores = []
        fold_models = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_bins)):
            print(f"Training fold {fold + 1}/{n_folds}")

            X_train_fold, X_val_fold = X[train_idx], X[val_idx]
            y_train_fold, y_val_fold = y[train_idx], y[val_idx]

            # Create and train model for this fold
            model = self.create_advanced_neural_network(X.shape[1])

            # Callbacks
            early_stopping = keras.callbacks.EarlyStopping(
                monitor='val_loss', patience=15, restore_best_weights=True
            )

            reduce_lr = keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6
            )

            # Train model
            history = model.fit(
                X_train_fold, y_train_fold,
                validation_data=(X_val_fold, y_val_fold),
                epochs=100,
                batch_size=64,
                callbacks=[early_stopping, reduce_lr],
                verbose=0
            )

            # Evaluate fold
            y_pred_fold = model.predict(X_val_fold, verbose=0)
            fold_score = r2_score(y_val_fold, y_pred_fold)
            cv_scores.append(fold_score)
            fold_models.append(model)

            print(f"Fold {fold + 1} R² Score: {fold_score:.4f}")

        self.ensemble_models = fold_models
        print(f"Average CV R² Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores)*2:.4f})")

        return cv_scores

    def train_final_model(self, X, y):
        """
        Train final model on all data
        """
        print("Training final model on complete dataset...")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Create final model
        self.model = self.create_advanced_neural_network(X.shape[1])

        # Callbacks
        early_stopping = keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=20, restore_best_weights=True
        )

        reduce_lr = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6
        )

        # Train
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=150,
            batch_size=64,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        # Final evaluation
        y_pred = self.model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f"\nFinal Model Performance:")
        print(f"R² Score: {r2:.4f}")
        print(f"MAE: {mae:.2f}")
        print(f"RMSE: {rmse:.2f}")

        return history, (r2, mae, rmse)

    def ensemble_predict(self, X):
        """
        Make predictions using ensemble of cross-validation models
        """
        predictions = []

        for model in self.ensemble_models:
            pred = model.predict(X, verbose=0)
            predictions.append(pred.flatten())

        # Average predictions
        ensemble_pred = np.mean(predictions, axis=0)
        prediction_std = np.std(predictions, axis=0)

        return ensemble_pred, prediction_std

    def apply_reinforcement_learning_optimization(self, X, y):
        """
        Apply RL-based hyperparameter optimization using simple Q-learning approach
        """
        print("Applying RL-based optimization...")

        # Define action space (hyperparameter combinations)
        actions = [
            {'learning_rate': 0.001, 'batch_size': 32, 'dropout': 0.2},
            {'learning_rate': 0.001, 'batch_size': 64, 'dropout': 0.3},
            {'learning_rate': 0.0005, 'batch_size': 32, 'dropout': 0.2},
            {'learning_rate': 0.0005, 'batch_size': 64, 'dropout': 0.3},
            {'learning_rate': 0.002, 'batch_size': 32, 'dropout': 0.1},
            {'learning_rate': 0.002, 'batch_size': 64, 'dropout': 0.4}
        ]

        # Q-table for action values
        q_table = np.zeros(len(actions))
        action_counts = np.zeros(len(actions))

        best_score = -np.inf
        best_params = None

        # RL optimization loop
        for episode in range(len(actions)):
            # Select action (explore all actions once, then exploit)
            action_idx = episode if episode < len(actions) else np.argmax(q_table)
            params = actions[action_idx]

            # Create model with selected hyperparameters
            model = self.create_rl_optimized_model(X.shape[1], params)

            # Train and evaluate
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=42 + episode
            )

            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=50,
                batch_size=params['batch_size'],
                verbose=0
            )

            # Calculate reward (negative validation loss)
            val_loss = min(history.history['val_loss'])
            reward = -val_loss

            # Update Q-table
            action_counts[action_idx] += 1
            q_table[action_idx] += (reward - q_table[action_idx]) / action_counts[action_idx]

            if reward > best_score:
                best_score = reward
                best_params = params

            print(f"Episode {episode + 1}: Action {action_idx}, Reward: {reward:.4f}")

        print(f"Best RL-optimized parameters: {best_params}")
        print(f"Best score: {best_score:.4f}")

        return best_params

    def create_rl_optimized_model(self, input_dim, params):
        """
        Create model with RL-optimized hyperparameters
        """
        inputs = layers.Input(shape=(input_dim,))

        x = layers.Dense(256, activation='relu')(inputs)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(params['dropout'])(x)

        x = layers.Dense(128, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(params['dropout'])(x)

        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dropout(params['dropout'] / 2)(x)

        outputs = layers.Dense(1, activation='linear')(x)

        model = Model(inputs=inputs, outputs=outputs)

        optimizer = keras.optimizers.Adam(learning_rate=params['learning_rate'])
        model.compile(optimizer=optimizer, loss='huber', metrics=['mae'])

        return model

    def feature_importance_analysis(self, X, y):
        """
        Analyze feature importance using permutation importance
        """
        print("Analyzing feature importance...")

        if self.model is None:
            print("Model not trained yet. Training model first...")
            self.train_final_model(X, y)

        # Baseline performance
        baseline_pred = self.model.predict(X, verbose=0)
        baseline_score = r2_score(y, baseline_pred)

        feature_importance = {}

        for i, feature in enumerate(self.feature_columns):
            # Permute feature
            X_permuted = X.copy()
            np.random.shuffle(X_permuted[:, i])

            # Calculate performance drop
            permuted_pred = self.model.predict(X_permuted, verbose=0)
            permuted_score = r2_score(y, permuted_pred)

            importance = baseline_score - permuted_score
            feature_importance[feature] = importance

        # Sort by importance
        sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

        print("\nFeature Importance (R² drop when permuted):")
        for feature, importance in sorted_importance[:10]:
            print(f"{feature}: {importance:.4f}")

        return feature_importance

    def run_complete_pipeline(self):
        """
        Run the complete advanced modeling pipeline
        """
        print("Starting Advanced Claim Prediction Pipeline...")
        print("=" * 50)

        # 1. Advanced preprocessing
        self.advanced_preprocessing()

        # 2. Prepare features
        X, y = self.prepare_features()

        # 3. Cross-validation training
        cv_scores = self.train_with_cross_validation(X, y)

        # 4. RL-based optimization
        best_params = self.apply_reinforcement_learning_optimization(X, y)

        # 5. Train final optimized model
        history, performance = self.train_final_model(X, y)

        # 6. Feature importance analysis
        feature_importance = self.feature_importance_analysis(X, y)

        print("\n" + "=" * 50)
        print("Pipeline Complete!")
        print(f"Final Model R² Score: {performance[0]:.4f}")
        print(f"Cross-validation Mean R² Score: {np.mean(cv_scores):.4f}")

        return {
            'cv_scores': cv_scores,
            'final_performance': performance,
            'best_rl_params': best_params,
            'feature_importance': feature_importance,
            'model': self.model,
            'ensemble_models': self.ensemble_models
        }

In [7]:
# Initialize with your DataFrame
predictor = AdvancedClaimPredictor(df=df)

# Run complete pipeline
results = predictor.run_complete_pipeline()

# Get ensemble predictions with uncertainty
X_new, _ = predictor.prepare_features()
predictions, uncertainty = predictor.ensemble_predict(X_new)

Starting Advanced Claim Prediction Pipeline...
Starting advanced preprocessing...
Preprocessing complete. Dataset shape: (15000, 27)
Training fold 1/5
Fold 1 R² Score: 0.9093
Training fold 2/5
Fold 2 R² Score: 0.9018
Training fold 3/5
Fold 3 R² Score: 0.8549
Training fold 4/5
Fold 4 R² Score: 0.8885
Training fold 5/5
Fold 5 R² Score: 0.8733
Average CV R² Score: 0.8856 (+/- 0.0392)
Applying RL-based optimization...
Episode 1: Action 0, Reward: -1733.0101
Episode 2: Action 1, Reward: -1841.3820
Episode 3: Action 2, Reward: -1945.3612
Episode 4: Action 3, Reward: -2215.0752
Episode 5: Action 4, Reward: -1669.8580
Episode 6: Action 5, Reward: -1856.2141
Best RL-optimized parameters: {'learning_rate': 0.002, 'batch_size': 32, 'dropout': 0.1}
Best score: -1669.8580
Training final model on complete dataset...
Epoch 1/150
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 30ms/step - loss: 13452.6729 - mae: 13453.1729 - mse: 326244256.0000 - val_loss: 13224.8398 - val_mae: 1322

In [8]:
import joblib
import os

def save_model(self, model_path='final_model.h5', encoder_path='encoders/', scaler_path='scaler.pkl'):
    """
    Save model, scalers, and encoders
    """
    os.makedirs(encoder_path, exist_ok=True)

    # Save model
    self.model.save(model_path)

    # Save scaler
    joblib.dump(self.scaler, scaler_path)

    # Save label encoders
    for col, le in self.label_encoders.items():
        joblib.dump(le, os.path.join(encoder_path, f"{col}_encoder.pkl"))

    print(f"Model and preprocessing artifacts saved successfully.")

def load_model(self, model_path='final_model.h5', encoder_path='encoders/', scaler_path='scaler.pkl'):
    """
    Load model, scalers, and encoders
    """
    self.model = keras.models.load_model(model_path)
    self.scaler = joblib.load(scaler_path)

    self.label_encoders = {}
    for file in os.listdir(encoder_path):
        if file.endswith('_encoder.pkl'):
            col = file.replace('_encoder.pkl', '')
            self.label_encoders[col] = joblib.load(os.path.join(encoder_path, file))

    print("Model and preprocessing artifacts loaded successfully.")


In [10]:
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from tensorflow import keras

# Load artifacts
model = keras.models.load_model('final_model.h5')
scaler = joblib.load('scaler.pkl')

encoders = {}
encoder_cols = ['sex', 'hereditary_diseases', 'city', 'job_title', 'bmi_category', 'age_group']
for col in encoder_cols:
    encoders[col] = joblib.load(f'encoders/{col}_encoder.pkl')

# Feature columns (must match training)
feature_cols = [
    'age', 'weight', 'bmi', 'no_of_dependents', 'smoker',
    'bloodpressure', 'diabetes', 'regular_ex', 'age_bmi_interaction',
    'risk_score', 'health_index', 'weight_height_ratio',
    'sex_encoded', 'hereditary_diseases_encoded', 'city_encoded',
    'job_title_encoded', 'bmi_category_encoded', 'age_group_encoded'
]

# Streamlit UI
st.title("🧠 Health Insurance Claim Predictor")

# Input fields
age = st.number_input("Age", 18, 100, 35)
weight = st.number_input("Weight (kg)", 30, 150, 70)
bmi = st.number_input("BMI", 15.0, 50.0, 25.0)
no_dep = st.slider("Number of Dependents", 0, 10, 1)
smoker = st.selectbox("Smoker", [0, 1])
bp = st.number_input("Blood Pressure", 80, 200, 120)
diabetes = st.selectbox("Diabetes", [0, 1])
exercise = st.selectbox("Regular Exercise", [0, 1])

sex = st.selectbox("Sex", encoders['sex'].classes_)
hered = st.selectbox("Hereditary Diseases", encoders['hereditary_diseases'].classes_)
city = st.selectbox("City", encoders['city'].classes_)
job = st.selectbox("Job Title", encoders['job_title'].classes_)

# Feature engineering
age_bmi = age * bmi
risk = smoker * 2 + diabetes * 3 + bp / 100 + (bmi > 30) * 2
health_idx = exercise * 2 - smoker * 3 - diabetes * 2
whr = weight / (bmi / 25)

bmi_cat = pd.cut([bmi], bins=[0, 18.5, 25, 30, 40, 100], labels=['Underweight', 'Normal', 'Overweight', 'Obese', 'Extremely_Obese'])[0]
age_group = pd.cut([age], bins=[0, 25, 35, 45, 55, 100], labels=['Young', 'Adult', 'Middle', 'Senior', 'Elder'])[0]

# Encode categorical features
features = {
    'age': age,
    'weight': weight,
    'bmi': bmi,
    'no_of_dependents': no_dep,
    'smoker': smoker,
    'bloodpressure': bp,
    'diabetes': diabetes,
    'regular_ex': exercise,
    'age_bmi_interaction': age_bmi,
    'risk_score': risk,
    'health_index': health_idx,
    'weight_height_ratio': whr,
    'sex_encoded': encoders['sex'].transform([sex])[0],
    'hereditary_diseases_encoded': encoders['hereditary_diseases'].transform([hered])[0],
    'city_encoded': encoders['city'].transform([city])[0],
    'job_title_encoded': encoders['job_title'].transform([job])[0],
    'bmi_category_encoded': encoders['bmi_category'].transform([str(bmi_cat)])[0],
    'age_group_encoded': encoders['age_group'].transform([str(age_group)])[0]
}

input_df = pd.DataFrame([features])
input_scaled = scaler.transform(input_df)

# Predict
if st.button("Predict Claim Amount"):
    pred = model.predict(input_scaled)[0][0]
    st.success(f"💰 Predicted Claim Amount: ₹{pred:,.2f}")


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'final_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)