# Importing The Module

In [176]:
import pandas as pd    
import numpy as np   
import joblib       
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error,mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import pickle




# Import The File 

In [177]:
# Read the insurance dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='insurance.csv')

# View The DataSet

In [178]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


# PreProcessing 

In [179]:
def preprocess_inputs(df, test_size=0.2, random_state=100):
    """Split the insurance frame and build model-ready train/test features.

    The frame is split into train and test parts first; the one-hot encoder
    and the standard scaler are then fitted on the training part only and
    applied to both parts, so no test-set statistics reach the fitted
    transformers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Must contain the target column ``'expenses'``, the
        categorical columns ``'sex'``, ``'smoker'``, ``'region'`` and the
        numerical columns ``'age'``, ``'bmi'``, ``'children'``. Any other
        column is ignored by the feature pipeline.
    test_size : float, default 0.2
        Fraction of rows held out for the test split.
    random_state : int, default 100
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(x_train_processed, x_test_processed, y_train, y_test, encoder,
        scaler)`` where the two feature frames hold the scaled numerical
        columns followed by the one-hot encoded categorical columns, and
        ``encoder`` / ``scaler`` are the fitted transformers.
    """
    # Copy the *argument*. The previous version copied the notebook-level
    # global `data`, which silently ignored whatever frame was passed in.
    df = df.copy()

    # Separate features from the regression target.
    X = df.drop('expenses', axis=1)
    y = df['expenses']

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    cat_cols = ['sex', 'smoker', 'region']
    num_cols = ['age', 'bmi', 'children']

    # One-hot encode the categorical columns; drop='first' removes one level
    # per feature to avoid perfectly collinear dummy columns.
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    x_train_encode = encoder.fit_transform(x_train[cat_cols])
    x_test_encode = encoder.transform(x_test[cat_cols])

    encoded_names = encoder.get_feature_names_out(cat_cols)
    x_train_encoded = pd.DataFrame(x_train_encode, index=x_train.index, columns=encoded_names)
    x_test_encoded = pd.DataFrame(x_test_encode, index=x_test.index, columns=encoded_names)

    # Standardise the numerical columns using training-split statistics.
    scaler = StandardScaler()
    x_train_scale = scaler.fit_transform(x_train[num_cols])
    x_test_scale = scaler.transform(x_test[num_cols])

    x_train_scaled = pd.DataFrame(x_train_scale, index=x_train.index, columns=num_cols)
    x_test_scaled = pd.DataFrame(x_test_scale, index=x_test.index, columns=num_cols)

    # Re-assemble: the original row index is kept on every piece so the
    # column-wise concat aligns rows correctly.
    x_train_processed = pd.concat([x_train_scaled, x_train_encoded], axis=1)
    x_test_processed = pd.concat([x_test_scaled, x_test_encoded], axis=1)

    return x_train_processed, x_test_processed, y_train, y_test, encoder, scaler

In [180]:
# Run the preprocessing once; the fitted encoder and scaler are returned
# alongside the processed train/test splits.
(
    x_train_processed,
    x_test_processed,
    y_train,
    y_test,
    encoder,
    scaler,
) = preprocess_inputs(data)

# ALGORITHMS

# 1.LinearRegression

In [181]:
# Baseline: ordinary least-squares regression on the processed training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model_lr = LinearRegression().fit(x_train_processed, y_train)

# Score the held-out test split.
y_pred = model_lr.predict(x_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report every metric as "label value", one per line.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse),
    ("r2_score:", r2),
    ("Root Mean Square Error:", rmse),
    ("Mean Absolute Error:", mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE): 32193435.27377557
r2_score: 0.7946953084832675
Root Mean Square Error: 5673.925913666442
Mean Absolute Error: 3916.3077180168257


* The model demonstrates a strong fit with an R² score of 0.79, though the high MSE and RMSE indicate significant variance in predictions 

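* The R² score is lower than those of the ensemble regressors below (Random Forest and Gradient Boosting), although it is higher than those of the Decision Tree and KNN regressors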

* The MSE, RMSE, and MAE are noticeably higher than those of the Random Forest and Gradient Boosting models below

# 2.DecisionTreeRegressor

In [182]:
# Single decision tree on the processed training split.
# random_state is fixed (same seed as the train/test split) because
# scikit-learn permutes the candidate features at every split; without a seed
# the fitted tree, and therefore the metrics below, can change between runs.
tree_model = DecisionTreeRegressor(random_state=100)
tree_model.fit(x_train_processed, y_train)

# Score the held-out test split.
y_pred = tree_model.predict(x_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 47455939.46420634
r2_score: 0.6973629272713218
Root Mean Square Error: 6888.827147214999
Mean Absolute Error: 3503.0233208955224


* The model shows moderate predictive power with an R² score of 0.70, but the high MSE and RMSE suggest considerable prediction errors

* The R² score is considerably lower than that of Linear Regression

* The MSE of the DecisionTreeRegressor is higher than that of LinearRegression

# 3.KNeighborsRegressor

In [183]:
# k-nearest-neighbours regressor with the library's default settings.
# fit() returns the estimator itself, so construction and fitting are chained.
knn_model = KNeighborsRegressor().fit(x_train_processed, y_train)

# Score the held-out test split.
y_pred = knn_model.predict(x_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Report every metric as "label value", one per line.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse),
    ("r2_score:", r2),
    ("Root Mean Square Error:", rmse),
    ("Mean Absolute Error:", mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE): 44047140.7407018
r2_score: 0.7191015943138481
Root Mean Square Error: 6636.801996496642
Mean Absolute Error: 4122.186604477612


* The model achieves a fair R² score of 0.72, but the substantial MSE and RMSE indicate notable prediction inaccuracies

* The R² score is close to that of the DecisionTreeRegressor



# 4.RandomForestRegressor

In [184]:
# Random forest on the processed training split.
# random_state is fixed (same seed as the train/test split) because the forest
# draws bootstrap samples and feature subsets at random; without a seed the
# metrics below differ on every run and cannot be compared across models.
rf_model = RandomForestRegressor(random_state=100)
rf_model.fit(x_train_processed, y_train)

# Score the held-out test split.
y_pred = rf_model.predict(x_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 20225073.465646066
r2_score: 0.8710202116221459
Root Mean Square Error: 4497.229532239384
Mean Absolute Error: 2814.2032483395524


* The model performs well with a high R² score of 0.87 and relatively low RMSE and MAE, indicating good accuracy with minimal prediction errors 

* The R² score is much higher than those of the regressors above

* The MSE, MAE, and RMSE are lower than those of the regressors above

# 5.GradientBoostingRegressor

In [185]:
# Gradient-boosted trees on the processed training split.
# random_state is fixed (same seed as the train/test split) so the fitted
# ensemble, its metrics, and the model artifact pickled from `gbr_model`
# are the same on every run.
gbr_model = GradientBoostingRegressor(random_state=100)
gbr_model.fit(x_train_processed, y_train)

# Score the held-out test split.
y_pred = gbr_model.predict(x_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 15790940.803764801
r2_score: 0.8992976610633169
Root Mean Square Error: 3973.781675402513
Mean Absolute Error: 2373.928705982682


* The model demonstrates excellent performance with a high R² score of 0.90 and low error metrics, indicating highly accurate predictions with minimal errors

* The R² score is higher than those of all the other regressors

* The MSE, RMSE, and MAE values are lower than those of all the other regressors

* This GradientBoostingRegressor Is The Best Model


# Save Using Pickle

In [186]:
# Persist the fitted one-hot encoder to disk.
with open("encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [187]:
# Persist the fitted standard scaler to disk.
with open("scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [188]:
# Persist the fitted gradient-boosting model to disk.
with open("model.pkl", mode='wb') as model_file:
    pickle.dump(gbr_model, model_file)