# MODELING

### IMPORT LIBRARIES

In [24]:
import numpy as np
import pandas as pd
# feature scaling and encoding
from sklearn.preprocessing import StandardScaler,OneHotEncoder
#train and testing data
from sklearn.model_selection import train_test_split
# regression evaluation metrics
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error,mean_absolute_error
#models 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import pickle



In [25]:
# Load the insurance dataset from a local CSV file into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific (a Windows user's
# Downloads folder), so this cell only runs where that exact file exists.
data = pd.read_csv('C:/Users/surface/Downloads/bank insurence/insurance.csv')  # replace with the path to your copy of insurance.csv

### Data preprocessing

In [26]:
# Preview the first rows. The output includes an 'Unnamed: 0' column alongside the
# real features -- presumably a row index that was saved into the CSV (TODO confirm).
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [27]:
def preprocess_inputs(df):
    """Return an independent copy of ``df`` for preprocessing.

    The original body copied the frame but never returned it, so every call
    evaluated to ``None`` and the copy was discarded. Returning the copy makes
    the helper usable: callers can transform the result without mutating the
    frame they passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw input frame.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (``DataFrame.copy()`` with its default arguments).
    """
    return df.copy()

#### SPLITTING

In [28]:
 # splitting the dataset into X and y
# Target first, then every remaining column as a candidate feature.
# Later cells select only the columns listed in cat_cols / num_cols from X.
y = data['expenses']
X = data.drop(columns='expenses')

In [29]:
    # train test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [30]:

    # categorical columns
# Features that are one-hot encoded before modelling.
cat_cols = [
    'sex',
    'smoker',
    'region',
]

In [31]:

    # numerical columns
# Features that are standardised before modelling.
num_cols = [
    'age',
    'bmi',
    'children',
]

### Using onehotencoder

In [32]:

    # one hot encode categorical columns
encoder = OneHotEncoder(
    sparse_output=False,  # return a dense array so it can be wrapped in a DataFrame
    drop='first',         # drop one level per feature to avoid redundant dummy columns
)

### Fit & transform

In [33]:

# Learn the category levels from the training split only, then apply the same
# mapping to the test split so no test information leaks into the encoder.
x_train_encode = encoder.fit_transform(x_train[cat_cols])
x_test_encode = encoder.transform(x_test[cat_cols])

# Re-attach the original row index and readable dummy-column names.
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
x_train_encoded = pd.DataFrame(
    x_train_encode,
    columns=encoded_feature_names,
    index=x_train.index,
)
x_test_encoded = pd.DataFrame(
    x_test_encode,
    columns=encoded_feature_names,
    index=x_test.index,
)


### Using standardscaler

In [34]:
    # scale numerical columns
# Create an unfitted StandardScaler; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [35]:
# Fit the scaler on the training split only, then reuse its statistics on the test split.
train_numeric = x_train[num_cols]
test_numeric = x_test[num_cols]
x_train_scale = scaler.fit_transform(train_numeric)
x_test_scale = scaler.transform(test_numeric)

In [36]:

# Wrap the scaled arrays back into DataFrames that keep each split's row index
# and the numeric column names, so they can be aligned with the encoded columns.
x_train_scaled, x_test_scaled = (
    pd.DataFrame(values, index=frame.index, columns=num_cols)
    for values, frame in ((x_train_scale, x_train), (x_test_scale, x_test))
)

### Concatenating

In [37]:
  # concatenate numerical and categorical features
# Place the scaled numeric columns and the one-hot columns side by side (aligned on row index).
x_train_processed = pd.concat((x_train_scaled, x_train_encoded), axis="columns")
x_test_processed = pd.concat((x_test_scaled, x_test_encoded), axis="columns")

In [38]:
# Display the processed feature frames, the targets and the fitted transformers
# together as one tuple for a quick visual check before modelling.
x_train_processed, x_test_processed, y_train, y_test, encoder, scaler

(           age       bmi  children  sex_male  smoker_yes  region_northwest  \
 1306 -0.718505 -1.429287 -0.915145       0.0         1.0               0.0   
 124   0.558926  0.518872  1.606414       0.0         0.0               1.0   
 588   1.552484  0.843565 -0.915145       0.0         0.0               0.0   
 1127 -0.292695  0.843565  0.765894       0.0         0.0               0.0   
 201   0.629894  0.242883 -0.074626       0.0         0.0               0.0   
 ...        ...       ...       ...       ...         ...               ...   
 802  -1.286253 -1.364349 -0.074626       1.0         0.0               0.0   
 53   -0.221727  0.600045 -0.915145       1.0         1.0               0.0   
 350   1.268610 -1.218237 -0.915145       0.0         0.0               1.0   
 79    0.133116  0.372760 -0.915145       0.0         0.0               1.0   
 792  -1.215284 -1.218237 -0.915145       0.0         0.0               0.0   
 
       region_southeast  region_southwest  
 1306 

# LINEAR REGRESSION

In [39]:
# Baseline: ordinary least-squares regression on the processed features.
model_lr = LinearRegression().fit(x_train_processed, y_train)
y_pred = model_lr.predict(x_test_processed)

# Hold-out metrics; RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("r2_score:", r2),
    ("Root Mean Square Error:", rmse),
    ("Mean Absolute Error:", mae),
):
    print(label, value)


Mean Squared Error (MSE): 32193435.273775563
r2_score: 0.7946953084832675
Root Mean Square Error: 5673.925913666441
Mean Absolute Error: 3916.307718016824


#####  >>This model explains about 79% of the variance in the data, but the RMSE (~5674) and MAE (~3916) suggest moderate prediction errors.

# DECISIONTREE REGRESSION

In [40]:
# Single regression tree on the processed features.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed the metrics below change from
# run to run and drift away from the written commentary.
tree_model = DecisionTreeRegressor(random_state=100)
tree_model.fit(x_train_processed, y_train)

y_pred = tree_model.predict(x_test_processed)

# Hold-out metrics; RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 42677396.6877806
r2_score: 0.7278367565557895
Root Mean Square Error: 6532.793941934844
Mean Absolute Error: 3295.156940298508


##### >>This model has a lower \(R^2\) of 0.73 than linear regression, indicating less accuracy, and a higher RMSE (~6533), suggesting it struggles with large prediction errors compared to the other models, even though its MAE (~3295) is lower.

# RANDOMFOREST REGRESSION

In [41]:
# Random forest on the processed features.
# random_state is fixed (same seed as the train/test split) because bootstrap
# sampling and per-split feature selection are random; without a seed the metrics
# below change from run to run and drift away from the written commentary.
rf_model = RandomForestRegressor(random_state=100)
rf_model.fit(x_train_processed, y_train)

y_pred = rf_model.predict(x_test_processed)

# Hold-out metrics; RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 20746370.145219225
r2_score: 0.8676957868418043
Root Mean Square Error: 4554.818343822202
Mean Absolute Error: 2805.4399025000007


##### >>It performs well, with an \(R^2\) of 0.87, indicating strong predictive power and relatively low errors (RMSE: 4555, MAE: 2805), making it a reliable model.

# GRADIENTBOOSTING REGRESSION

In [42]:
# Gradient-boosted trees on the processed features. This is the model that is
# pickled at the end of the notebook, so its training must be repeatable.
# random_state is fixed (same seed as the train/test split); without it the
# fitted model and the metrics below can differ between runs.
gbr_model = GradientBoostingRegressor(random_state=100)
gbr_model.fit(x_train_processed, y_train)

y_pred = gbr_model.predict(x_test_processed)

# Hold-out metrics; RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 15712318.6771384
r2_score: 0.8997990518380558
Root Mean Square Error: 3963.8767232519226
Mean Absolute Error: 2356.9135959613945


##### >>The GradientBoostingRegressor gives the strongest results, with an \(R^2\) of 0.90 and the lowest errors (RMSE: 3964, MAE: 2357), indicating the most accurate predictions of the models compared in this notebook.

# KNEIGHBORS REGRESSION

In [43]:
# k-nearest-neighbours regressor with library-default settings.
knn_model = KNeighborsRegressor()
knn_model.fit(x_train_processed, y_train)
y_pred = knn_model.predict(x_test_processed)

# Hold-out metrics; RMSE is derived from MSE so the two always agree.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("r2_score:", r2),
    ("Root Mean Square Error:", rmse),
    ("Mean Absolute Error:", mae),
):
    print(label, value)

Mean Squared Error (MSE): 43979681.832229085
r2_score: 0.7195317947654702
Root Mean Square Error: 6631.717864341718
Mean Absolute Error: 4107.901992537313


##### >>It shows weaker performance with a relatively low \(R^2\) of 0.72 and higher errors (RMSE: 6632, MAE: 4108), indicating less accurate predictions compared to other models.

In [44]:
# Persist the fitted one-hot encoder so the same encoding can be reapplied later.
with open("encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [45]:
# Persist the fitted scaler so the same standardisation can be reapplied later.
with open("scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


In [46]:
# Persist the trained gradient-boosting model alongside the encoder and scaler.
with open("gbr_model.pkl", mode="wb") as model_file:
    pickle.dump(gbr_model, model_file)


# Conclusion

##### The GradientBoostingRegressor outperforms the other models with the lowest MSE, the highest \(R^2\) (0.90), and the lowest errors (RMSE: 3964, MAE: 2357), indicating it provides the most accurate predictions among the models tested.