In [2]:
import pandas as pd

In [3]:
df=pd.read_csv(r'D:\MLproject\Portfolio_Projects\Insurance_premium\notebooks\data\insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
## Independent & Dependent features

X = df.drop(labels=['expenses'],axis=1)
Y = df['expenses']

In [5]:
Y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

In [6]:

from sklearn.impute import SimpleImputer ## Handling Missing value
from sklearn.preprocessing import StandardScaler ## Handling feature scaling
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [8]:
# Segregating numerical and categorical variables
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [9]:
print(categorical_cols)
print(numerical_cols)

Index(['sex', 'smoker', 'region'], dtype='object')
Index(['age', 'bmi', 'children'], dtype='object')


In [10]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [12]:
one_hot_tf=ColumnTransformer([
    ('ohe_sex',OneHotEncoder(sparse=False,handle_unknown='ignore',drop='first'),[1,4,5])
],remainder='passthrough')

In [13]:
scale_tf=ColumnTransformer([
    ('scale',StandardScaler(),slice(0,7)) # sex =1,region =3, smo =1 =4  and other =3 total =7 
])

In [14]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
1077,21,male,26.0,0,no,northeast
61,25,male,33.7,4,no,southeast
796,30,male,44.2,2,no,southeast
1061,57,male,27.9,1,no,southeast
69,28,male,24.0,3,yes,southeast


In [15]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,ElasticNet,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
## Train multiple models
## Model Ecaluation
models = [
    ('lr',LinearRegression()),
    ('rgd',Ridge()),
    ('lasso',Lasso()),
    ('en',ElasticNet()),
    ('dt',DecisionTreeRegressor()),
    ('RandomForest', RandomForestRegressor()),
    ('GradientBoosting', GradientBoostingRegressor())
    # Add more models if desired
]

results = {}  # A dictionary to store the evaluation results

for model_name, model in models:
    # create pipeline
    pipe = Pipeline([
        ('ohe', one_hot_tf),
        ('scale_tf', scale_tf),
        ('model_tf', model)
    ])
    
    # Train the model using the pipeline
    pipe.fit(X_train, y_train)
    
    # Evaluate the model on the validation set
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Store the evaluation results in the dictionary
    results[model_name] = {'R2': r2, 'MSE': mse, 'MAE': mae}

# Now, display the evaluation results for all the models
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"R2 Score: {metrics['R2']}")
    print(f"Mean Squared Error: {metrics['MSE']}")
    print(f"Mean Absolute Error: {metrics['MAE']}")
    print("----------------------")



Model: lr
R2 Score: 0.760162039840738
Mean Squared Error: 34759324.36609049
Mean Absolute Error: 4097.659024795042
----------------------
Model: rgd
R2 Score: 0.7601450753623513
Mean Squared Error: 34761783.00027231
Mean Absolute Error: 4098.796623527131
----------------------
Model: lasso
R2 Score: 0.760146127190908
Mean Squared Error: 34761630.56047523
Mean Absolute Error: 4097.688069178779
----------------------
Model: en
R2 Score: 0.6716796010846309
Mean Squared Error: 47582939.89127238
Mean Absolute Error: 4963.78434148366
----------------------
Model: dt
R2 Score: 0.6753062367431999
Mean Squared Error: 47057337.50068284
Mean Absolute Error: 3466.1028109452736
----------------------
Model: RandomForest
R2 Score: 0.8061928352657387
Mean Squared Error: 28088156.265993714
Mean Absolute Error: 3106.6072176924904
----------------------
Model: GradientBoosting
R2 Score: 0.8336208045898725
Mean Squared Error: 24113065.409618568
Mean Absolute Error: 2771.4755139219787
--------------------

