In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("simranjain17/insurance")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/simranjain17/insurance/versions/1


T**ask 1: Data Exploration and Preprocessing**
1. Load the dataset and display the first few rows.
2. Perform basic statistical analysis to understand the distribution of the
     features.
3. Check for missing values and handle them appropriately.
4. Check for categorical features and convert them to numerical features.
5. Perform feature engineering, including the creation of new features and scaling of numerical
features.
6. Split the data into training and testing sets.


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import math

In [7]:
#task 1
df = pd.read_csv ("insurance.csv")
#few rows
df.head()
# prompt: Perform basic statistical analysis to understand the distribution of the features.

# Check the data types of each column
print(df.dtypes)

# Generate descriptive statistics for numerical features
print(df.describe())

# Explore the distribution of categorical features
for col in ['sex', 'smoker', 'region']:
  print(df[col].value_counts())

# Convert categorical features to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Display the updated DataFrame with numerical features
print(df.head())
from sklearn.preprocessing import MinMaxScaler

# Create interaction features
df['bmi_age'] = df['bmi'] * df['age']
df['bmi_children'] = df['bmi'] * df['children']
# Identify numerical features for scaling
numerical_features = ['age', 'bmi', 'children', 'bmi_age', 'bmi_children']

# Scale numerical features using MinMaxScaler
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Display the updated DataFrame with scaled numerical features
print(df.head())

# Define features (X) and target (y)
X = df.drop('charges', axis=1)
y = df['charges']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010
sex
male      676
female    662
Name: count, dtype: int64
smoker
no     1064
yes     274
Name: count, dtype: int64
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64
   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900    

**Task 2: Implement Regression Models **
1. Train the following regression models:
o Linear Regression
o Decision Tree Regression
o Random Forest Regression
o Gradient Boosting Regression
o Support Vector Regression (SVR)
2. For each model, train it using the training set and predict on the testing set.



In [8]:
#task 2
# Initialize and train the regression models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor(),
    "Gradient Boosting Regression": GradientBoostingRegressor(),
    "Support Vector Regression": SVR()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)


**Task 3: Model Evaluation **
1. Evaluate each model using the following metrics:
o  Mean Absolute Error (MAE)
o Mean Squared Error (MSE)
o Root Mean Squared Error (RMSE)
o Mean Absolute Percentage Error (MAPE)
o R-squared (R2)
2. Compare the performance of the models based on these metrics and find out which model performs the best.

In [10]:
#task 3
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}

    print(f"{name}:")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  Root Mean Squared Error (RMSE): {rmse}")
    print(f"  Mean Absolute Percentage Error (MAPE): {mape}")
    print(f"  R-squared (R2): {r2}")
    print("-"*30)

best_model = max(results, key=lambda k: results[k]['R2'])
print(f"The best performing model is: {best_model} with an R-squared score of {results[best_model]['R2']}")
print("-M" * 30)

Linear Regression:
  Mean Absolute Error (MAE): 4190.528604001747
  Mean Squared Error (MSE): 34158899.357042
  Root Mean Squared Error (RMSE): 5844.561519655858
  Mean Absolute Percentage Error (MAPE): 0.4704370573082475
  R-squared (R2): 0.7799730855845862
------------------------------
Decision Tree Regression:
  Mean Absolute Error (MAE): 2774.7215781492537
  Mean Squared Error (MSE): 35996180.000064306
  Root Mean Squared Error (RMSE): 5999.681658226902
  Mean Absolute Percentage Error (MAPE): 0.35001263464595317
  R-squared (R2): 0.7681386530235725
------------------------------
Random Forest Regression:
  Mean Absolute Error (MAE): 2445.9837132044786
  Mean Squared Error (MSE): 20574316.34074517
  Root Mean Squared Error (RMSE): 4535.892011583297
  Mean Absolute Percentage Error (MAPE): 0.3122081039490585
  R-squared (R2): 0.8674751404211287
------------------------------
Gradient Boosting Regression:
  Mean Absolute Error (MAE): 2417.1346923122715
  Mean Squared Error (MSE): 18