In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("insurance.csv")

# Verify dataset
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Data Preprocessing

In [2]:
import numpy as np

# -----------------------------
# 2.1 Missing Value Check
# -----------------------------
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
# -----------------------------
# 2.2 Separate Features & Target
# -----------------------------
X = df.drop(columns=["charges"])
y = df["charges"]

In [4]:
# -----------------------------
# 2.3 Identify Column Types
# -----------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

categorical_cols, numerical_cols

(['sex', 'smoker', 'region'], ['age', 'bmi', 'children'])

In [5]:
# -----------------------------
# 2.4 Outlier Handling (IQR Capping on target)
# -----------------------------
Q1 = y.quantile(0.25)
Q3 = y.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

y_capped = y.clip(lower=lower_bound, upper=upper_bound)

In [6]:
# -----------------------------
# 2.5 Feature Engineering
# Smoker as binary numeric feature
# -----------------------------
X["smoker_binary"] = X["smoker"].map({"yes": 1, "no": 0})

# Pipeline Creation

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [8]:
# -----------------------------
# Define transformers
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [9]:
# -----------------------------
# Column Transformer
# -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [10]:
# -----------------------------
# Full Pipeline (Preprocessing + Model)
# -----------------------------
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

## Model Selection Justification

For this project, Linear Regression is selected as the primary model. The target variable, medical insurance charges, is continuous, making this a regression problem. Linear Regression is well-suited for estimating continuous outcomes and provides a strong baseline for cost prediction tasks.

The dataset contains a mix of numerical and categorical features, and after appropriate preprocessing (encoding and scaling), Linear Regression can effectively model linear relationships between input variables (such as age, BMI, and smoking status) and insurance costs. Additionally, the model is interpretable, allowing clear understanding of how different factors influence medical charges, which is important in healthcare-related domains.

# Model Training

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# -----------------------------
# Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_capped, test_size=0.2, random_state=42
)

In [13]:
# -----------------------------
# Model Training
# -----------------------------
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


# Cross-Validation

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
# -----------------------------
# Cross-Validation
# -----------------------------
cv_scores = cross_val_score(
    model_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring="r2"
)

print("Cross-validation R2 scores:", cv_scores)
print("Average R2:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())

Cross-validation R2 scores: [0.68596931 0.79957719 0.71951942 0.67128215 0.77490615]
Average R2: 0.7302508429226074
Standard Deviation: 0.04970626479509677


# Hyperparameter Tuning

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [17]:
# -----------------------------
# Update Pipeline with Random Forest
# -----------------------------
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

In [18]:
# -----------------------------
# Hyperparameter Grid
# -----------------------------
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

In [19]:
# -----------------------------
# Grid Search
# -----------------------------
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_split': [2, 5], 'model__n_estimators': [100, 200]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# -----------------------------
# Results
# -----------------------------
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated R2:", grid_search.best_score_)

Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best Cross-Validated R2: 0.7877559884500259


# Best Model Selection

In [21]:
# -----------------------------
# Select Best Model
# -----------------------------
best_model = grid_search.best_estimator_

In [31]:
import pickle

# -----------------------------
# Save the Best Model
# -----------------------------
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Model Performance Evaluation

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [23]:
# -----------------------------
# Predictions on Test Set
# -----------------------------
y_pred = best_model.predict(X_test)

In [24]:
# -----------------------------
# Evaluation Metrics
# -----------------------------
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

Mean Absolute Error (MAE): 2190.96360646883
Root Mean Squared Error (RMSE): 4148.413989589697
R² Score: 0.8414711586692531


# Web Interface with Gradio

In [27]:
!pip install --upgrade gradio
import gradio as gr

Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-6.3.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading brotli-1.2.0-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-1.0.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==2.0.3 (from gradio)
  Downloading gradio_client-2.0.3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting httpx<1.0,>=0.24.1 (from gradio)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub<2.0,>=0.33.5 (from gradio)
  Downloading huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.11.5-cp311-cp311-win_amd64



In [28]:
# -----------------------------
# Prediction Function
# -----------------------------
def predict_insurance_cost(age, sex, bmi, children, smoker, region):
    input_data = pd.DataFrame([{
        "age": age,
        "sex": sex,
        "bmi": bmi,
        "children": children,
        "smoker": smoker,
        "region": region
    }])

    prediction = best_model.predict(input_data)[0]
    return round(prediction, 2)

In [29]:
# -----------------------------
# Gradio Interface
# -----------------------------
interface = gr.Interface(
    fn=predict_insurance_cost,
    inputs=[
        gr.Number(label="Age"),
        gr.Dropdown(["male", "female"], label="Sex"),
        gr.Number(label="BMI"),
        gr.Number(label="Number of Children"),
        gr.Dropdown(["yes", "no"], label="Smoker"),
        gr.Dropdown(["southeast", "southwest", "northwest", "northeast"], label="Region")
    ],
    outputs=gr.Number(label="Predicted Insurance Cost"),
    title="Medical Insurance Cost Prediction",
    description="Predict annual medical insurance charges based on personal information."
)

In [30]:
# -----------------------------
# Launch App
# -----------------------------
interface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


