In [1]:
import pandas as pd
df=pd.read_csv("data/cleaned.csv")
(df.head(2))

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,southwest,16884.92
1,18,1,33.8,1,0,southeast,1725.55


In [2]:
#Seperate independent and dependent features
X=df.drop(labels='expenses',axis=1)
y=df['expenses']

In [3]:
categorical_feature = X.select_dtypes(include='object').columns
numerical_feature = X.select_dtypes(exclude='object').columns
print(categorical_feature)
print(numerical_feature)

Index(['region'], dtype='object')
Index(['age', 'sex', 'bmi', 'children', 'smoker'], dtype='object')


In [4]:
from sklearn.impute import SimpleImputer #for all the missing values
from sklearn.preprocessing import StandardScaler #Feature scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder #Oridnal Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [5]:
numerical_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencode', OrdinalEncoder()),
    ('onehotencode', OneHotEncoder(sparse=False,handle_unknown="ignore")),
    ('scaler',StandardScaler())

])

preprocessor = ColumnTransformer([  
    ('numerical_pipeline',numerical_pipeline,numerical_feature),
    ('categorical_pipeline',categorical_pipeline,categorical_feature)
])

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=42)

In [7]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1069, 6)
(268, 6)
(1069,)
(268,)


In [8]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train))
X_test=pd.DataFrame(preprocessor.transform(X_test))



In [9]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.157680,0.971409,-0.999052,-0.907908,-0.500292,1.795911,-0.572669,-0.605812,-0.574110
1,-1.300619,0.971409,-0.800646,0.766904,-0.500292,1.795911,-0.572669,-0.605812,-0.574110
2,0.914926,-1.029432,1.150347,0.766904,-0.500292,-0.556820,1.746208,-0.605812,-0.574110
3,1.701087,0.971409,1.811701,-0.907908,-0.500292,-0.556820,-0.572669,1.650678,-0.574110
4,0.557580,-1.029432,-0.651842,0.766904,-0.500292,1.795911,-0.572669,-0.605812,-0.574110
...,...,...,...,...,...,...,...,...,...
1064,0.843457,-1.029432,0.737001,0.766904,1.998831,1.795911,-0.572669,-0.605812,-0.574110
1065,-0.871804,0.971409,2.539190,0.766904,-0.500292,-0.556820,-0.572669,-0.605812,1.741827
1066,-1.372088,0.971409,-1.412398,-0.070502,-0.500292,-0.556820,-0.572669,-0.605812,1.741827
1067,-0.085643,-1.029432,-0.420368,1.604310,-0.500292,-0.556820,-0.572669,-0.605812,1.741827


In [10]:
#Model training initiated
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [11]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [12]:
regression.coef_

array([ 3.48399237e+03, -5.02190031e+00,  1.91858116e+03,  5.79952564e+02,
        9.21528682e+03,  1.47124808e+17,  1.49272730e+17,  1.53399836e+17,
        1.49462256e+17])

In [13]:
regression.intercept_


12983.587478109033

In [16]:
#training model
import numpy as np

def model_evalution(test,predict):
    mse = mean_squared_error(test,predict)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test,predict)
    r2score = r2_score(test,predict)
    return mse,rmse,mae,r2score

In [17]:
models =  {
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet(),
    "RandomForestRegressor":RandomForestRegressor(random_state=3)
}

traning_score = []
model_list = []
r2_list = []
y_pred_list = []

for i in range(len(list(models))):
    model  = list(models.values())[i]
#     train Model
    model.fit(X_train,y_train)
    score = model.score(X_train,y_train)
    
    # Make Prediction
    y_pradict = model.predict(X_test)
    
    mse,mae,rmse,r2score = model_evalution(y_test,y_pradict)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    # matrix performance
    print("Model Traning Performance")
    print("MSE",mse*100)
    print("MAE",mae*100)
    print("RMSE",rmse*100)
    print("R2",r2score*100)
    print("="*40)
    
    r2_list.append(r2score*100)
    y_pred_list.append(y_pradict)
    traning_score.append(score)

LinearRegression
Model Traning Performance
MSE 3573582188.4521565
MAE 597794.4620396008
RMSE 416969.85242030805
R2 80.5525759360819
Ridge
Model Traning Performance
MSE 3551549409.3700843
MAE 595948.773752416
RMSE 417983.0484840471
R2 80.67247825692408
Lasso
Model Traning Performance
MSE 3548853293.5687304
MAE 595722.5271524261
RMSE 417747.2693600179
R2 80.68715051141533
ElasticNet
Model Traning Performance
MSE 6057074372.240994
MAE 778272.0843150546
RMSE 552117.3909754846
R2 67.03741856440078
RandomForestRegressor
Model Traning Performance
MSE 2131000741.8157299
MAE 461627.63585120527
RMSE 251311.61779850742
R2 88.40310004887148
