In [5]:
import pandas as pd

In [6]:
df=pd.read_csv('data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [7]:
X=df.drop(labels=['expenses'],axis=1)
Y=df[['expenses']]

In [8]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [9]:
sex_categories = ['female', 'male']
smoker_categories = ['yes', 'no']
region_categories = ['southwest', 'southeast', 'northwest', 'northeast']

In [10]:
from sklearn.impute  import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)


# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[sex_categories,smoker_categories,region_categories])),
    ('scaler',StandardScaler())
    ]

)



preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [12]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [13]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
X_train


Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex,cat_pipeline__smoker,cat_pipeline__region
0,-1.261206,-0.760093,-0.925046,1.006431,0.511327,1.411177
1,-0.978336,0.495641,2.437452,1.006431,0.511327,-0.417005
2,-0.624748,2.208006,0.756203,1.006431,0.511327,-0.417005
3,1.284628,-0.450236,-0.084422,1.006431,0.511327,-0.417005
4,-0.766183,-1.086257,1.596828,1.006431,-1.955695,-0.417005
...,...,...,...,...,...,...
931,-0.271160,0.577183,-0.084422,-0.993610,0.511327,-0.417005
932,0.153146,-0.385003,-0.084422,-0.993610,0.511327,0.497086
933,-0.695465,0.609799,-0.925046,1.006431,-1.955695,-1.331096
934,0.789604,2.517863,-0.084422,-0.993610,0.511327,-0.417005


In [15]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [17]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Support Vector Regressor': SVR(),
    'KNN Regressor': KNeighborsRegressor()


}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 5858.705669057534
MAE: 4028.5289830492984
R2 score 76.31627791214297


Lasso
Model Training Performance
RMSE: 5858.791513025547
MAE: 4028.422188888716
R2 score 76.3155838613778


Ridge
Model Training Performance
RMSE: 5858.910820505744
MAE: 4029.636600871353
R2 score 76.31461924033796


Elasticnet
Model Training Performance
RMSE: 6871.765171439433
MAE: 4965.526451058497
R2 score 67.4175891649489




  model.fit(X_train,y_train)


Random Forest Regressor
Model Training Performance
RMSE: 5105.615595512512
MAE: 2848.570178482587
R2 score 82.01365901854743


Gradient Boosting Regressor
Model Training Performance
RMSE: 4810.040118225672
MAE: 2667.3804021830583
R2 score 84.03591642229313


Decision Tree Regressor
Model Training Performance
RMSE: 7113.220451638621
MAE: 3398.3756716417915
R2 score 65.08764600210198




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Support Vector Regressor
Model Training Performance
RMSE: 12719.616122884145
MAE: 8200.124827540882
R2 score -11.633516296321345


KNN Regressor
Model Training Performance
RMSE: 5500.592411369369
MAE: 3244.4561393034833
R2 score 79.12312330170951




In [19]:
model_list

['LinearRegression',
 'Lasso',
 'Ridge',
 'Elasticnet',
 'Random Forest Regressor',
 'Gradient Boosting Regressor',
 'Decision Tree Regressor',
 'Support Vector Regressor',
 'KNN Regressor']

In [20]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
