# Insurance Cost Prediction Project
This notebook includes data cleaning, EDA, model training, and feature importance for predicting insurance cost using health and lifestyle data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

In [None]:
# Read the raw dataset into the working DataFrame; the relative path is
# resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="Data.csv")
# Preview the first rows.
df.head()

In [None]:
# --- Basic cleaning ---------------------------------------------------------
# The applicant identifier column is removed before any modelling step.
df.drop("applicant_id", axis=1, inplace=True)

# Impute missing BMI values with the column median. The result is assigned
# back to the column instead of calling `fillna(..., inplace=True)` on
# `df["bmi"]`: that chained in-place call operates on a temporary Series,
# emits a FutureWarning on pandas 2.x, and leaves `df` unchanged once
# Copy-on-Write is active.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Convert "year last admitted" into "years since last admission", measured
# from REFERENCE_YEAR. Rows with no recorded year are encoded as 0.
# NOTE(review): 0 is also the value produced for an admission in
# REFERENCE_YEAR itself, so the two cases are indistinguishable -- confirm
# this is intended.
REFERENCE_YEAR = 2025
df["Year_last_admitted"] = df["Year_last_admitted"].fillna("Never")
df["Year_last_admitted"] = df["Year_last_admitted"].apply(
    lambda year: 0 if year == "Never" else REFERENCE_YEAR - int(year)
)

def parse_cholesterol(value):
    """Return the integer midpoint of a cholesterol range string.

    Args:
        value: A string of the form ``"<low> to <high>"`` where both bounds
            are integer literals, e.g. ``"125 to 150"``.

    Returns:
        ``(low + high) // 2`` as an ``int`` when ``value`` parses, otherwise
        ``np.nan`` (non-string input such as ``None``/NaN, a string without
        exactly one ``" to "`` separator, or non-integer bounds).
    """
    try:
        low, high = value.split(" to ")
        return (int(low) + int(high)) // 2
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, float NaN, ...).
        # TypeError: .split exists but rejects a str separator (e.g. bytes).
        # ValueError: wrong number of parts, or a bound that is not an int.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return np.nan

# Replace each cholesterol range string with its integer midpoint; values
# that cannot be parsed become NaN and are then imputed with the median.
df["cholesterol_level"] = df["cholesterol_level"].apply(parse_cholesterol)
# Assign the filled column back rather than using a chained
# `fillna(..., inplace=True)`: the chained form works on a temporary Series,
# warns on pandas 2.x, and does not modify `df` under Copy-on-Write.
df["cholesterol_level"] = df["cholesterol_level"].fillna(
    df["cholesterol_level"].median()
)

# Explicit category -> integer mappings. Despite the name, "Alcohol" and
# "exercise" have three ordered levels rather than two.
# NOTE(review): `Series.map` yields NaN for any category missing from its
# mapping -- verify the dataset contains only the labels listed here.
binary_cols = {
    "Gender": {"Male": 1, "Female": 0},
    "covered_by_any_other_company": {"Y": 1, "N": 0},
    "Alcohol": {"No": 0, "Rare": 1, "Daily": 2},
    "exercise": {"No": 0, "Moderate": 1, "Extreme": 2},
}
for col, mapping in binary_cols.items():
    df[col] = df[col].map(mapping)

# Integer-encode smoking status.
# NOTE(review): LabelEncoder assigns codes by sorted label order, which may
# not match any meaningful ordering of the categories -- confirm this is
# acceptable or switch to an explicit mapping like the ones above.
le = LabelEncoder()
df["smoking_status"] = le.fit_transform(df["smoking_status"])

# One-hot encode the nominal columns; drop_first=True omits the first level
# of each so the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=["Occupation", "Location"], drop_first=True)
df.head()

In [None]:
# Heatmap of pairwise correlations among the ten columns whose absolute
# correlation with "insurance_cost" is largest. The target column is part of
# the ranking too, so it can appear in the plot.
corr_matrix = df.corr()
target_strength = corr_matrix["insurance_cost"].abs()
ranked_strength = target_strength.sort_values(ascending=False)
top_corr_features = ranked_strength.head(10).index

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df[top_corr_features].corr(),
    annot=True,
    cmap="coolwarm",
    ax=ax,
)
ax.set_title("Top 10 Feature Correlations with Insurance Cost")
fig.tight_layout()
plt.show()

In [None]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation (fixed seed so the split is reproducible).
y = df["insurance_cost"]
X = df.drop(columns=["insurance_cost"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the label printed in the report below.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(random_state=42),
}

# Fit each model on the training split and report MAE, RMSE and R2 on the
# held-out split.
for name in models:
    estimator = models[name]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"{name}\nMAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.4f}\n")

In [None]:
# Horizontal bar chart of the 20 largest feature importances reported by the
# fitted random forest, with the largest bar drawn at the top.
rf_model = models["Random Forest"]
importances = rf_model.feature_importances_

ranked_importances = pd.Series(importances, index=X.columns).sort_values(
    ascending=False
)
feat_imp = ranked_importances.iloc[:20]

fig, ax = plt.subplots(figsize=(10, 8))
feat_imp.plot(kind="barh", ax=ax)
ax.set_title("Top 20 Feature Importances - Random Forest")
# barh draws the first row at the bottom; flip the axis so it ends up on top.
ax.invert_yaxis()
plt.show()

### SHAP Explanations (Optional - Run Locally)
To run SHAP locally, install it first (`pip install shap`) and then run the following code:
```python
import shap

# Draw ONE sample and reuse it for the explainer background, the explained
# rows and the plot. Calling X_train.sample(500) separately for each step
# returns three different random subsets, so the plotted feature values
# would not belong to the rows the SHAP values were computed for.
# The size is capped at len(X_train) because sampling more rows than exist
# (without replacement) raises a ValueError.
sample_size = min(500, len(X_train))
X_sample = X_train.sample(sample_size, random_state=42)

explainer = shap.Explainer(rf_model, X_sample)
shap_values = explainer(X_sample)
shap.summary_plot(shap_values, X_sample)
```