In [1]:
import pandas as pd
import numpy as np
import pymongo

In [13]:
client=pymongo.MongoClient('mongodb://localhost:27017/')
db=client.intern

In [14]:
collection=db.table
cursor=collection.find()

In [15]:
list=[]
for doc in cursor:
    list.append(doc)

In [16]:
data=pd.DataFrame(list)

In [17]:
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [18]:
data.head()

Unnamed: 0,_id,age,sex,bmi,children,smoker,region,expenses
0,673edd74d5f504857f4734a8,19,female,27.9,0,yes,southwest,16884.92
1,673edd74d5f504857f4734a9,18,male,33.8,1,no,southeast,1725.55
2,673edd74d5f504857f4734aa,28,male,33.0,3,no,southeast,4449.46
3,673edd74d5f504857f4734ab,33,male,22.7,0,no,northwest,21984.47
4,673edd74d5f504857f4734ac,32,male,28.9,0,no,northwest,3866.86


In [19]:
data=data.drop('_id',axis=1)

In [21]:
x=data.drop(labels=['expenses'],axis=1)
y=data[['expenses']]

In [22]:
y

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86
...,...
1333,10600.55
1334,2205.98
1335,1629.83
1336,2007.95


In [24]:
categorical_columns=x.select_dtypes(include='object').columns
numerical_columns=x.select_dtypes(exclude='object').columns

In [25]:
categorical_columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [26]:
numerical_columns

Index(['age', 'bmi', 'children'], dtype='object')

In [27]:
sex_map=['female','male']
smoker_map=['yes','no']
direction_map=['northeast','southeast','northwest','southwest']

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [29]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scalar',StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[sex_map,smoker_map,direction_map])),
        ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
])

In [30]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=21)

In [31]:
x_train.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [32]:
x_train=pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())
x_test=pd.DataFrame(preprocessor.transform(x_test),columns=preprocessor.get_feature_names_out())

In [33]:
x_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex,cat_pipeline__smoker,cat_pipeline__region
0,1.639742,1.362064,-0.89838,0.966377,0.496325,-0.466247
1,-0.124394,0.04036,1.644615,0.966377,0.496325,0.439162
2,0.510695,1.261682,0.79695,0.966377,0.496325,-0.466247
3,0.369564,2.215316,0.79695,-1.034793,-2.01481,-0.466247
4,-0.124394,-1.214422,0.79695,-1.034793,0.496325,0.439162


In [34]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [35]:
import numpy as np
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)

    return mae,mse,rmse,r2_square

In [36]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import BayesianRidge

In [37]:
models={
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    #'Polynomial_regression':PolynomialFeatures(),
    'Support vector Machine':SVR(),
    'DTR':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor(),
    'Neighbors':KNeighborsRegressor(),
    'Gaussian':GaussianProcessRegressor(),
    'Neural_network':MLPRegressor(),
    'BayesianRidge':BayesianRidge()
    
}

In [38]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
for model_name,model in models.items():
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print("%"*50)
    print(f'{model_name}:')
    print("Model Training Performance")
    print('MAE',(mean_absolute_error(y_test,y_pred)))
    print("R2 score",(r2_score(y_test,y_pred)*100))

    print('='*35)
    print('\n')

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
LinearRegression:
Model Training Performance
MAE 4211.619101716418
R2 score 73.26549357293953


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Ridge:
Model Training Performance
MAE 4212.368200357647
R2 score 73.27291835472985


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Lasso:
Model Training Performance
MAE 4211.2178336422885
R2 score 73.26790775990446


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Support vector Machine:
Model Training Performance
MAE 8244.857868602438
R2 score -17.924458074660365


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
DTR:
Model Training Performance
MAE 3469.3679850746266
R2 score 63.06632136846763




  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
RandomForest:
Model Training Performance
MAE 2734.5026652155875
R2 score 81.69305288486805


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Neighbors:
Model Training Performance
MAE 3188.8850646766173
R2 score 79.27803283607705


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Gaussian:
Model Training Performance
MAE 126713.28613155472
R2 score -85636.289766481




  y = column_or_1d(y, warn=True)


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Neural_network:
Model Training Performance
MAE 13575.84439354289
R2 score -131.71552955148294


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
BayesianRidge:
Model Training Performance
MAE 4213.128939750888
R2 score 73.28027979209548




  y = column_or_1d(y, warn=True)
