# Medical Insurance Cost Prediction Analysis
**Author:** TNT  
**Version:** 3.0  
**Description:** Interactive analysis and model exploration for medical insurance cost prediction

This notebook provides an interactive environment for exploring the insurance dataset and understanding the trained model's behavior.

## 1. Setup and Imports

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import sys
import os

# Put the ../scripts directory on the module search path
sys.path.append('../scripts')

# Plot appearance: stock matplotlib style, seaborn "husl" palette, 12x8 default figure
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Silence all warnings for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

print("Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
# Read the insurance records into `data`, the frame the later cells work from
data = pd.read_csv('../data/insurance.csv')

# Quick orientation: dimensions and column labels, then a preview as the cell output
print("Dataset shape: {}".format(data.shape))
print("\nColumn names: {}".format(data.columns.tolist()))
print("\nFirst few rows:")
data.head()

In [None]:
# Basic statistics: structural overview first, then the numeric summary as the cell output
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\n" + "="*50)
print("Statistical Summary:")
data.describe()

In [None]:
# Null count per column, then the distinct labels and their frequencies for each categorical column
print("Missing Values:")
print(data.isna().sum())

print("\nUnique Values in Categorical Columns:")
categorical_cols = ['sex', 'smoker', 'region']
for col in categorical_cols:
    column_values = data[col]
    print(f"{col}: {column_values.unique()}")
    label_counts = column_values.value_counts().to_dict()
    print(f"  Value counts: {label_counts}")

## 3. Data Visualization

In [None]:
# 2x3 overview grid: charge distribution, two numeric scatters, three categorical boxplots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Medical Insurance Cost Analysis', fontsize=16, fontweight='bold')

# Top-left panel: histogram of the charges column
hist_ax = axes[0, 0]
hist_ax.hist(data['charges'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribution of Insurance Charges')
hist_ax.set_xlabel('Charges ($)')
hist_ax.set_ylabel('Frequency')

# Rest of the top row: one numeric feature against charges per panel
scatter_panels = [
    (axes[0, 1], 'age', 'coral', 'Age vs Insurance Charges', 'Age'),
    (axes[0, 2], 'bmi', 'lightgreen', 'BMI vs Insurance Charges', 'BMI'),
]
for panel, feature, colour, title, x_label in scatter_panels:
    panel.scatter(data[feature], data['charges'], alpha=0.6, color=colour)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Charges ($)')

# Bottom row: charges split by each categorical feature
box_panels = [
    (axes[1, 0], 'smoker', 'Smoker Status vs Insurance Charges'),
    (axes[1, 1], 'sex', 'Gender vs Insurance Charges'),
    (axes[1, 2], 'region', 'Region vs Insurance Charges'),
]
for panel, feature, title in box_panels:
    sns.boxplot(data=data, x=feature, y='charges', ax=panel)
    panel.set_title(title)

# Tilt the tick labels on the region panel
axes[1, 2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the encoded helper columns are not added to `data`
data_corr = data.copy()

# One encoder per categorical column, each fitted and applied in a single step.
# NOTE(review): the integer codes given to 'region' carry no natural order, so its
# correlation coefficient should be read with care.
le_sex, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()
for source_col, encoder in (('sex', le_sex), ('smoker', le_smoker), ('region', le_region)):
    data_corr[f'{source_col}_encoded'] = encoder.fit_transform(data_corr[source_col])

# Columns that enter the correlation matrix
numeric_cols = ['age', 'bmi', 'children', 'charges', 'sex_encoded', 'smoker_encoded', 'region_encoded']
correlation_matrix = data_corr.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Matrix of Insurance Features')
plt.tight_layout()
plt.show()

# Rank every feature by its correlation with the charges column
print("Correlation with charges:")
charges_corr = correlation_matrix['charges']
print(charges_corr.sort_values(ascending=False))

## 4. Advanced Analysis

In [None]:
# Analyze the impact of smoking on charges by age groups.
# pd.cut builds right-closed intervals and by default leaves the first interval
# open on the left, so without include_lowest=True an age of exactly 18 would
# fall outside '18-30' and become NaN (dropping those rows from the plot).
data['age_group'] = pd.cut(
    data['age'],
    bins=[18, 30, 40, 50, 65],
    labels=['18-30', '31-40', '41-50', '51-65'],
    include_lowest=True,
)

# Charges per age bucket, split by smoking status
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='age_group', y='charges', hue='smoker')
plt.title('Insurance Charges by Age Group and Smoking Status')
plt.ylabel('Charges ($)')
plt.xlabel('Age Group')
plt.show()

# Statistical summary by smoking status
print("Charges by Smoking Status:")
smoking_stats = data.groupby('smoker')['charges'].agg(['mean', 'median', 'std', 'min', 'max'])
print(smoking_stats)

In [None]:
# BMI categories analysis
def categorize_bmi(bmi):
    """Map a BMI value to its weight-class label.

    Thresholds: below 18.5 is 'Underweight', below 25 is 'Normal',
    below 30 is 'Overweight', anything else is 'Obese'.

    Args:
        bmi: Numeric BMI value; may be missing (None / NaN).

    Returns:
        The label string, or NaN when ``bmi`` is missing.
    """
    # A missing value fails every `<` comparison below and would otherwise
    # fall through to the last branch and be mislabelled 'Obese'.
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Label every row with its BMI class
data['bmi_category'] = data['bmi'].apply(categorize_bmi)

# Explicit low-to-high order for the categories. Without it the boxes are drawn
# in order of first appearance in the data and the table rows come out
# alphabetically, which hides the underweight-to-obese progression.
bmi_order = ['Underweight', 'Normal', 'Overweight', 'Obese']

plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='bmi_category', y='charges', hue='smoker', order=bmi_order)
plt.title('Insurance Charges by BMI Category and Smoking Status')
plt.ylabel('Charges ($)')
plt.xlabel('BMI Category')
plt.show()

print("Average charges by BMI category:")
bmi_stats = data.groupby(['bmi_category', 'smoker'])['charges'].mean().unstack()
# Reorder only the categories that actually occur, so no all-NaN rows are introduced
bmi_stats = bmi_stats.reindex([label for label in bmi_order if label in bmi_stats.index])
print(bmi_stats)

## 5. Load Trained Model and Make Predictions

In [None]:
# Load the persisted model together with its encoders and scalers.
# NOTE(review): joblib.load unpickles the files, which can execute arbitrary code;
# only point these paths at artifacts you produced yourself.
model_loaded = False  # set to True only after every artifact loaded and was reported
try:
    model = joblib.load('../output/best_insurance_model.pkl')
    encoders = joblib.load('../output/encoders.pkl')
    scalers = joblib.load('../output/scalers.pkl')

    print("Model loaded successfully!")
    print(f"Model type: {type(model).__name__}")
    for artifact_label, artifact in (("encoders", encoders), ("scalers", scalers)):
        print(f"Available {artifact_label}: {list(artifact.keys())}")

    model_loaded = True
except FileNotFoundError:
    # Artifacts have not been produced yet: point the reader at the training script
    print("Model files not found. Please run the training script first:")
    print("python scripts/1.0-tnt-insurance-prediction.py")
except Exception as e:
    print(f"Error loading model: {e}")

In [None]:
# Interactive prediction function
if model_loaded:
    def predict_insurance_cost(age, sex, bmi, children, smoker, region):
        """Predict the insurance charge for a single profile.

        Args:
            age, bmi, children: numeric values for one person.
            sex, smoker, region: category labels; each is passed to the
                matching loaded encoder's ``transform``, so it must be a label
                that encoder accepts.

        Returns:
            The model's prediction for the profile, floored at 0.
        """
        # Create a one-row input dataframe
        input_data = pd.DataFrame({
            'age': [age],
            'sex': [sex],
            'bmi': [bmi],
            'children': [children],
            'smoker': [smoker],
            'region': [region]
        })

        # Encode categorical variables with the encoders loaded alongside the model
        input_data['sex_encoded'] = encoders['sex'].transform(input_data['sex'])
        input_data['smoker_encoded'] = encoders['smoker'].transform(input_data['smoker'])
        input_data['region_encoded'] = encoders['region'].transform(input_data['region'])

        # Select features
        feature_names = ['age', 'bmi', 'children', 'sex_encoded', 'smoker_encoded', 'region_encoded']
        X_new = input_data[feature_names]

        # Make prediction
        # NOTE(review): scaling is applied whenever a 'standard' scaler exists;
        # whether the loaded model was actually trained on scaled features
        # cannot be told from this notebook - confirm against the training script.
        if 'standard' in scalers:
            try:
                X_new_scaled = scalers['standard'].transform(X_new)
                prediction = model.predict(X_new_scaled)[0]
            except Exception as exc:
                # Keep the best-effort fallback, but report it instead of
                # swallowing it: a bare `except:` also traps KeyboardInterrupt
                # and hides the fact that the features were not scaled.
                print(f"Warning: scaled prediction failed ({exc}); falling back to unscaled features.")
                prediction = model.predict(X_new)[0]
        else:
            prediction = model.predict(X_new)[0]

        # Clamp negative predictions to zero
        return max(0, prediction)

    # Example predictions
    print("Example Predictions:")
    print("="*50)

    examples = [
        (25, 'male', 22.0, 0, 'no', 'northwest'),
        (45, 'female', 28.5, 2, 'yes', 'southeast'),
        (35, 'male', 30.0, 1, 'no', 'southwest'),
        (55, 'female', 35.0, 3, 'yes', 'northeast')
    ]

    for age, sex, bmi, children, smoker, region in examples:
        prediction = predict_insurance_cost(age, sex, bmi, children, smoker, region)
        print(f"Age: {age}, Sex: {sex}, BMI: {bmi}, Children: {children}, Smoker: {smoker}, Region: {region}")
        print(f"Predicted Cost: ${prediction:.2f}")
        print("-" * 30)
else:
    print("Model not loaded. Cannot make predictions.")

## 6. Model Performance Analysis

In [None]:
# Load the saved model-comparison table and chart its two metrics side by side
try:
    results_df = pd.read_csv('../results/model_comparison_results.csv', index_col=0)
    print("Model Comparison Results:")
    print(results_df)

    # One bar chart per metric: (column, bar colour, panel title, y-axis label)
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    metric_panels = [
        ('r2_score', 'skyblue', 'Model R² Scores', 'R² Score'),
        ('rmse', 'lightcoral', 'Model RMSE Scores', 'RMSE ($)'),
    ]
    for ax, (metric, colour, title, y_label) in zip(axes, metric_panels):
        results_df[metric].plot(kind='bar', ax=ax, color=colour)
        ax.set_title(title)
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

except FileNotFoundError:
    print("Model results not found. Please run the training script first.")
except Exception as e:
    # Anything else (for example a missing metric column) is reported, not raised
    print(f"Error loading results: {e}")

## 7. Interactive Prediction (Edit the Inputs and Re-run)

In [None]:
# Edit-and-run prediction cell: change the user_* values, then execute the cell
if not model_loaded:
    print("Model not loaded. Please run the training script first.")
else:
    print("Interactive Prediction Interface")
    print("="*40)
    print("Modify the values below and run the cell to get predictions:")

    # User inputs (modify these values)
    user_age = 35
    user_sex = 'male'  # 'male' or 'female'
    user_bmi = 28.5
    user_children = 2
    user_smoker = 'no'  # 'yes' or 'no'
    user_region = 'northwest'  # 'northeast', 'northwest', 'southeast', 'southwest'

    # Make prediction
    prediction = predict_insurance_cost(user_age, user_sex, user_bmi, user_children, user_smoker, user_region)

    # Echo the inputs back, followed by the estimate
    print("\nInput Parameters:")
    echoed_inputs = (
        ("Age", user_age),
        ("Sex", user_sex),
        ("BMI", user_bmi),
        ("Children", user_children),
        ("Smoker", user_smoker),
        ("Region", user_region),
    )
    for input_label, input_value in echoed_inputs:
        print(f"{input_label}: {input_value}")
    print(f"\nPredicted Insurance Cost: ${prediction:.2f}")

    # Dataset rows within five years of the age, with the same sex and smoking status
    age_window = data['age'].between(user_age - 5, user_age + 5)
    same_sex = data['sex'] == user_sex
    same_smoker = data['smoker'] == user_smoker
    similar_profiles = data[age_window & same_sex & same_smoker]

    if len(similar_profiles) == 0:
        print("\nNo similar profiles found in the dataset.")
    else:
        avg_actual = similar_profiles['charges'].mean()
        print(f"\nAverage cost for similar profiles in dataset: ${avg_actual:.2f}")
        print(f"Difference: ${prediction - avg_actual:.2f}")

## 8. Feature Importance Analysis

In [None]:
# Rank and plot feature importances when the loaded model exposes them
if not model_loaded:
    print("Model not loaded.")
elif not hasattr(model, 'feature_importances_'):
    print("Feature importance not available for this model type.")
else:
    # NOTE(review): assumes the model was trained on exactly these six columns
    # in this order - confirm against the training script.
    feature_names = ['age', 'bmi', 'children', 'sex_encoded', 'smoker_encoded', 'region_encoded']
    importances = model.feature_importances_

    unsorted_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    })
    feature_importance_df = unsorted_importance.sort_values('importance', ascending=False)

    print("Feature Importance Rankings:")
    print(feature_importance_df)

    # Horizontal bars, most important feature first
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance_df, x='importance', y='feature', palette='viridis')
    plt.title(f'Feature Importance - {type(model).__name__}')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

## 9. Summary and Insights

In [None]:
print("KEY INSIGHTS FROM THE ANALYSIS:")
print("="*50)

# Headline numbers: mean charge per smoking status and two simple correlations
smoker_mean = data.loc[data['smoker'] == 'yes', 'charges'].mean()
non_smoker_mean = data.loc[data['smoker'] == 'no', 'charges'].mean()
smoker_multiplier = smoker_mean / non_smoker_mean
age_correlation = data['age'].corr(data['charges'])
bmi_correlation = data['bmi'].corr(data['charges'])

print("1. SMOKING IMPACT:")
print(f"   - Smokers pay {smoker_multiplier:.1f}x more than non-smokers on average")
print(f"   - Average cost for smokers: ${smoker_mean:.2f}")
print(f"   - Average cost for non-smokers: ${non_smoker_mean:.2f}")

print("\n2. AGE FACTOR:")
print(f"   - Age correlation with charges: {age_correlation:.3f}")
print("   - Insurance costs generally increase with age")

print("\n3. BMI IMPACT:")
print(f"   - BMI correlation with charges: {bmi_correlation:.3f}")
print("   - Higher BMI tends to correlate with higher insurance costs")

# Mean charge per region, most expensive first
print("\n4. REGIONAL DIFFERENCES:")
regional_avg = data.groupby('region')['charges'].mean().sort_values(ascending=False)
for region, avg_cost in regional_avg.items():
    print(f"   - {region.title()}: ${avg_cost:.2f}")

# Mean charge per number of children
print("\n5. FAMILY SIZE:")
children_avg = data.groupby('children')['charges'].mean()
print("   - Average cost by number of children:")
for children, avg_cost in children_avg.items():
    print(f"     {children} children: ${avg_cost:.2f}")

# The model section only appears when the model artifacts were loaded
if model_loaded:
    print("\n6. MODEL PERFORMANCE:")
    print(f"   - Best model: {type(model).__name__}")
    print("   - Model can be used for cost estimation and risk assessment")
    print("   - Key predictors: Smoking status, Age, BMI")

print("\n" + "="*50)
print("RECOMMENDATIONS:")
recommendation_lines = (
    "- Smoking cessation programs could significantly reduce costs",
    "- Health and wellness programs focusing on BMI management",
    "- Age-based premium structures are justified by the data",
    "- Regional cost differences should be considered in pricing",
)
for recommendation in recommendation_lines:
    print(recommendation)