In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Configuration ---------------------------------------------------------
# NOTE(review): absolute, machine-specific path; repoint DATA_PATH when the
# notebook is run anywhere other than the original author's machine.
DATA_PATH = 'C:\\Users\\user\\Downloads\\archive\\insurance.csv'
TARGET_COLUMN = 'charges'
TEST_SIZE = 0.2
# A single seed drives both the train/test split and the forest, so a
# Restart & Run All reproduces the same metrics and predictions.
RANDOM_STATE = 42

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Features and target
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Define categorical and numerical features
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']

# Create a ColumnTransformer for preprocessing:
#   - numerical columns are standardised,
#   - categorical columns are one-hot encoded as a dense array, dropping the
#     first category of each feature.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Create a pipeline with preprocessing and RandomForestRegressor.
# The forest is seeded: without random_state every fit draws different
# bootstrap samples / feature subsets, so the reported scores would drift
# from run to run even though the split itself is fixed.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=RANDOM_STATE))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Train the model (the preprocessor is fitted on the training rows only,
# because it lives inside the pipeline)
pipeline.fit(X_train, y_train)

# Make predictions on the held-out rows and evaluate the model
y_pred = pipeline.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R² Score: {r2_score(y_test, y_pred)}")


MAE: 2531.712544775
MSE: 21204696.67185502
R² Score: 0.8634146864318911


In [5]:
import pandas as pd

# Describe one policyholder as a plain record; wrapping it in a list gives a
# one-row DataFrame whose column names are what the pipeline's preprocessor
# selects on.
single_record = {
    'age': 35,
    'sex': 'female',
    'bmi': 28.5,
    'children': 2,
    'smoker': 'yes',
    'region': 'northeast',
}
input_data = pd.DataFrame([single_record])

# Score the record; predict returns one value per row, so report element 0
predicted_cost = pipeline.predict(input_data)
print(f"Predicted Medical Cost: ${predicted_cost[0]:.2f}")


Predicted Medical Cost: $23620.25


In [7]:
# Several policyholders, one record per row (row-oriented instead of
# column-oriented; the resulting frame has the same columns and values).
records = [
    {'age': 40, 'sex': 'male', 'bmi': 22.0, 'children': 1,
     'smoker': 'yes', 'region': 'southeast'},
    {'age': 25, 'sex': 'female', 'bmi': 30.1, 'children': 0,
     'smoker': 'no', 'region': 'northwest'},
    {'age': 50, 'sex': 'male', 'bmi': 26.8, 'children': 3,
     'smoker': 'no', 'region': 'southwest'},
]
input_data = pd.DataFrame(records)

# Score every row in one call, then report them with 1-based numbering
predicted_costs = pipeline.predict(input_data)
for position, cost in enumerate(predicted_costs, start=1):
    print(f"Predicted Medical Cost for input {position}: ${cost:.2f}")


Predicted Medical Cost for input 1: $18267.22
Predicted Medical Cost for input 2: $3928.98
Predicted Medical Cost for input 3: $12527.42
