In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("insurance.csv")

# Verify dataset
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Data Preprocessing

In [2]:
import numpy as np

# -----------------------------
# 2.1 Missing Value Check
# -----------------------------
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [3]:
# -----------------------------
# 2.2 Separate Features & Target
# -----------------------------
X = df.drop(columns=["charges"])
y = df["charges"]

In [4]:
# -----------------------------
# 2.3 Identify Column Types
# -----------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

categorical_cols, numerical_cols

(['sex', 'smoker', 'region'], ['age', 'bmi', 'children'])

In [5]:
# -----------------------------
# 2.4 Outlier Handling (IQR Capping on target)
# -----------------------------
Q1 = y.quantile(0.25)
Q3 = y.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

y_capped = y.clip(lower=lower_bound, upper=upper_bound)

In [6]:
# -----------------------------
# 2.5 Feature Engineering
# Smoker as binary numeric feature
# -----------------------------
X["smoker_binary"] = X["smoker"].map({"yes": 1, "no": 0})

# Pipeline Creation

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [9]:
# -----------------------------
# Define transformers
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [10]:
# -----------------------------
# Column Transformer
# -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [11]:
# -----------------------------
# Full Pipeline (Preprocessing + Model)
# -----------------------------
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

## Model Selection Justification

For this project, Linear Regression is selected as the primary model. The target variable, medical insurance charges, is continuous, making this a regression problem. Linear Regression is well-suited for estimating continuous outcomes and provides a strong baseline for cost prediction tasks.

The dataset contains a mix of numerical and categorical features, and after appropriate preprocessing (encoding and scaling), Linear Regression can effectively model linear relationships between input variables (such as age, BMI, and smoking status) and insurance costs. Additionally, the model is interpretable, allowing clear understanding of how different factors influence medical charges, which is important in healthcare-related domains.

# Model Training

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# -----------------------------
# Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_capped, test_size=0.2, random_state=42
)

In [14]:
# -----------------------------
# Model Training
# -----------------------------
model_pipeline.fit(X_train, y_train)

# Cross-Validation

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
# -----------------------------
# Cross-Validation
# -----------------------------
cv_scores = cross_val_score(
    model_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring="r2"
)

print("Cross-validation R2 scores:", cv_scores)
print("Average R2:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())

Cross-validation R2 scores: [0.68596931 0.79957719 0.71951942 0.67128215 0.77490615]
Average R2: 0.7302508429226073
Standard Deviation: 0.04970626479509676


# Hyperparameter Tuning

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [18]:
# -----------------------------
# Update Pipeline with Random Forest
# -----------------------------
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

In [19]:
# -----------------------------
# Hyperparameter Grid
# -----------------------------
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

In [20]:
# -----------------------------
# Grid Search
# -----------------------------
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

In [21]:
# -----------------------------
# Results
# -----------------------------
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated R2:", grid_search.best_score_)

Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best Cross-Validated R2: 0.7877559884500259


# Best Model Selection

In [22]:
# -----------------------------
# Select Best Model
# -----------------------------
best_model = grid_search.best_estimator_

# Model Performance Evaluation

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [24]:
# -----------------------------
# Predictions on Test Set
# -----------------------------
y_pred = best_model.predict(X_test)

In [25]:
# -----------------------------
# Evaluation Metrics
# -----------------------------
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

Mean Absolute Error (MAE): 2190.96360646883
Root Mean Squared Error (RMSE): 4148.413989589697
R² Score: 0.8414711586692531


# Web Interface with Gradio

In [26]:
import gradio as gr

In [27]:
# -----------------------------
# Prediction Function
# -----------------------------
def predict_insurance_cost(age, sex, bmi, children, smoker, region):
    input_data = pd.DataFrame([{
        "age": age,
        "sex": sex,
        "bmi": bmi,
        "children": children,
        "smoker": smoker,
        "region": region
    }])

    prediction = best_model.predict(input_data)[0]
    return round(prediction, 2)

In [28]:
# -----------------------------
# Gradio Interface
# -----------------------------
interface = gr.Interface(
    fn=predict_insurance_cost,
    inputs=[
        gr.Number(label="Age"),
        gr.Dropdown(["male", "female"], label="Sex"),
        gr.Number(label="BMI"),
        gr.Number(label="Number of Children"),
        gr.Dropdown(["yes", "no"], label="Smoker"),
        gr.Dropdown(["southeast", "southwest", "northwest", "northeast"], label="Region")
    ],
    outputs=gr.Number(label="Predicted Insurance Cost"),
    title="Medical Insurance Cost Prediction",
    description="Predict annual medical insurance charges based on personal information."
)

In [29]:
# -----------------------------
# Launch App
# -----------------------------
interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3c76057d3517a20b96.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


