In [57]:
import pandas as pd

In [58]:
df=pd.read_csv('../notebooks/data/insurance.csv')

In [59]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [61]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [62]:
df.duplicated().sum()

1

In [63]:
df.drop_duplicates()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [64]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [65]:
X=df.drop(labels=['expenses'] , axis=1)

In [66]:
Y=df['expenses']

In [67]:
numerical_columns = df.columns[df.dtypes != 'object']
categorical_columns = df.columns[df.dtypes == 'object']


In [68]:
categorical_columns


Index(['sex', 'smoker', 'region'], dtype='object')

In [69]:
numerical_columns


Index(['age', 'bmi', 'children', 'expenses'], dtype='object')

In [70]:
sex_categories=['male' , 'female']
smoker_categories=['yes' , 'no']
region_categories=['southwest', 'southeast', 'northwest', 'northeast']

In [71]:

from sklearn.preprocessing import StandardScaler  # HAndling Feature Scaling
from sklearn.preprocessing import OneHotEncoder  # OneHot Encoding
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [72]:
# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('OneHotencoder', OneHotEncoder(categories=[
         sex_categories, smoker_categories, region_categories], sparse_output=False)),
        ('scaler', StandardScaler())
    ]

)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])


In [73]:
preprocessor

In [74]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=42)

In [75]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

ValueError: A given column is not a column of the dataframe

In [76]:
X_train.head()


Unnamed: 0,age,sex,bmi,children,smoker,region
332,61,female,31.2,0,no,northwest
355,46,male,27.6,0,no,southwest
138,54,female,31.9,3,no,southeast
381,55,male,30.7,0,yes,northeast
292,25,male,45.5,2,yes,southeast


In [None]:
## Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [None]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [None]:
regression.coef_

In [None]:
regression=DecisionTreeRegressor()
regression.fit(X_train , y_train)
y_pred=regression.predict(X_test)
r2_score(y_test , y_pred)

0.7423333992067627

In [None]:
regression = SVR()
regression.fit(X_train, y_train)
y_pred = regression.predict(X_test)
r2_score(y_test, y_pred)


-0.11664493190633762

In [None]:
regression = AdaBoostRegressor()
regression.fit(X_train, y_train)
y_pred = regression.predict(X_test)
r2_score(y_test, y_pred)


0.823021249024772

In [None]:
regression = SGDRegressor()
regression.fit(X_train, y_train)
y_pred = regression.predict(X_test)
r2_score(y_test, y_pred)


0.7622013731522692

In [None]:
import numpy as np


def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square


In [None]:
# Train multiple models

models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(),
    'AdaboostRegressor': AdaBoostRegressor(),
    'SGDRegressor': SGDRegressor(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet(),
    'SVR':SVR(),
    'Ridge': Ridge(),
    
}
trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 5810.044645020165
MAE: 4136.906728882809
R2 score 76.97747460503646


DecisionTree
Model Training Performance
RMSE: 6175.753716836726
MAE: 2814.74131840796
R2 score 73.98798723866679


AdaboostRegressor
Model Training Performance
RMSE: 5143.11088716571
MAE: 4139.590738051237
R2 score 81.95961517145709


SGDRegressor
Model Training Performance
RMSE: 5823.620805986646
MAE: 4172.703565994358
R2 score 76.86975678126022


Lasso
Model Training Performance
RMSE: 5811.793154556339
MAE: 4144.843095704586
R2 score 76.96361544733092


ElasticNet
Model Training Performance
RMSE: 6324.2618862557765
MAE: 4697.815991752613
R2 score 72.72192539158783


SVR
Model Training Performance
RMSE: 12576.411938844765
MAE: 8243.43466000953
R2 score -7.8717017136718415


Ridge
Model Training Performance
RMSE: 5811.854764893556
MAE: 4145.26956591786
R2 score 76.96312703113797


