In [1]:
# Data Collection
import pandas as pd
dataset=pd.read_csv("insurance_pre.csv")
dataset


Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [2]:
# Data preprocessing - nominal to numerical data - one hot encoding method
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
indep=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dep=dataset[['charges']]
dep

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [5]:
# split train and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(indep,dep,test_size=0.3, random_state=0)
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,0,0
196,39,32.800,0,0,0
438,52,46.750,5,0,0
183,44,26.410,0,0,0
1298,33,27.455,2,1,0
...,...,...,...,...,...
763,27,26.030,0,1,0
835,42,35.970,2,1,0
1216,40,25.080,0,1,0
559,19,35.530,0,1,0


In [6]:
# Standardisation-Pre processing method - rescales data to a consistent format-easier to understand and use
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)
X_test

array([[ 0.89459283, -0.08863026, -0.06746417,  1.02378711, -0.50466988],
       [ 0.53757957, -0.22180837, -0.06746417, -0.97676557, -0.50466988],
       [ 0.60898222,  1.57449152,  0.76341038,  1.02378711,  1.98149332],
       ...,
       [ 1.10880078,  1.20785059, -0.89833872,  1.02378711, -0.50466988],
       [ 1.75142463,  1.34905148, -0.06746417,  1.02378711, -0.50466988],
       [ 1.60861933, -0.92299913, -0.89833872, -0.97676557, -0.50466988]])

In [8]:
# Model creation
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid={
    'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'n_estimators':[50,100],
    'max_features':['sqrt','log2',None]
}
grid=GridSearchCV(RandomForestRegressor(),param_grid, refit=True, verbose=5, n_jobs=-1)
grid.fit(X_train,Y_train) # training the model

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  return fit_method(estimator, *args, **kwargs)


In [9]:
grid.best_params_

{'criterion': 'poisson', 'max_features': 'log2', 'n_estimators': 100}

In [10]:
re=grid.cv_results_
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.25285,0.002569,0.011765,0.000964,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.863684,0.783053,0.803534,0.83129,0.763932,0.809099,0.035274,14
1,0.492648,0.003851,0.019,0.000913,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.85755,0.792066,0.810916,0.823263,0.772315,0.811222,0.028886,7
2,0.245641,0.002442,0.011542,0.001135,squared_error,log2,50,"{'criterion': 'squared_error', 'max_features':...",0.861535,0.789981,0.798931,0.819534,0.765976,0.807191,0.032152,16
3,0.500555,0.001754,0.01968,0.00074,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.861316,0.787376,0.805212,0.828147,0.76796,0.810002,0.032449,13
4,0.308485,0.002857,0.011004,0.000953,squared_error,,50,"{'criterion': 'squared_error', 'max_features':...",0.860946,0.769852,0.816181,0.803486,0.769003,0.803894,0.034007,17
5,0.60753,0.002647,0.018129,0.00094,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.859967,0.765734,0.815563,0.800363,0.76584,0.801493,0.035124,19
6,0.611385,0.005773,0.009543,0.000274,absolute_error,sqrt,50,"{'criterion': 'absolute_error', 'max_features'...",0.867085,0.793725,0.81581,0.825694,0.763842,0.813231,0.034299,2
7,1.19422,0.010691,0.018837,0.008507,absolute_error,sqrt,100,"{'criterion': 'absolute_error', 'max_features'...",0.864068,0.793162,0.807323,0.822211,0.766029,0.810559,0.032542,9
8,0.594888,0.013232,0.010272,0.005726,absolute_error,log2,50,"{'criterion': 'absolute_error', 'max_features'...",0.864815,0.79348,0.812344,0.817359,0.774239,0.812448,0.030286,3
9,1.191642,0.014974,0.013905,0.004598,absolute_error,log2,100,"{'criterion': 'absolute_error', 'max_features'...",0.860556,0.787404,0.807859,0.824486,0.771794,0.81042,0.030789,11


In [16]:
age=int(input("Age:"))
bmi=float(input("BMI:"))
children=int(input("Children:"))
sex_male=int(input("Sex_Male: 0/1"))
smoker_yes=int(input("Smoker_yes 0 / 1: "))

Future_predictions=grid.predict([[age,bmi,children,sex_male,smoker_yes]])
print("Future predictions={}".format(Future_predictions))

SyntaxError: invalid decimal literal (2790903242.py, line 8)