In [1]:
import pandas as pd

## 1. Data collection

In [2]:
Data=pd.read_csv("insurance_pre.csv")
Data

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
Data_set=pd.get_dummies(Data, drop_first=True)
Data_set

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
Data_set.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

## 2. Input and Output split

In [5]:
Independent=Data_set[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
Independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
Dependent=Data_set[['charges']]
Dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## 3.Split train and test data

In [7]:
#Here we are using Gird Search CV (cross validation) so no need to use train and test split, Gird will do that all the method.

## 4. Model creation

In [8]:
!pip install lightgbm



In [10]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

p_grid={'n_estimators':[10,50,100], 'objective':['regression','poisson', 'mape'], 'boosting_type':['gbdt','dart', 'rf']}

#In default CV take Kfold method to do cross validate we can mention the cv in code like below line
KFold=KFold(n_splits=5,shuffle=True, random_state=42)

grid=GridSearchCV(LGBMRegressor(random_state=42), p_grid, cv=KFold, refit=True, verbose=3, n_jobs=-1)
grid.fit(Independent, Dependent)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1338, number of used features: 5
[LightGBM] [Info] Start training from score 9.493293


45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\smani\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\smani\anaconda3\lib\site-packages\lightgbm\sklearn.py", line 1398, in fit
    super().fit(
  File "C:\Users\smani\anaconda3\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
  File "C:\Users\smani\anaconda3\lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\smani\anaconda3\lib\sit

## 5. Evalution metric

In [11]:
#since we are using GridSearchCV so it will do Evalution the metric and give the best model

In [12]:
# Store the result in re variable
re=grid.cv_results_
print("The Score value for the best model {}".format(grid.best_params_),"Score :",grid.best_score_)

The Score value for the best model {'boosting_type': 'gbdt', 'n_estimators': 50, 'objective': 'poisson'} Score : 0.8448487692452158


In [13]:
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_n_estimators,param_objective,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.059215,0.010782,0.005621,0.000802,gbdt,10,regression,"{'boosting_type': 'gbdt', 'n_estimators': 10, ...",0.7708449,0.7699729,0.7776558,0.7000346,0.7250019,0.748702,0.030696,11
1,0.059893,0.008556,0.005624,0.000803,gbdt,10,poisson,"{'boosting_type': 'gbdt', 'n_estimators': 10, ...",0.5676759,0.5760122,0.573063,0.5102824,0.5281825,0.5510432,0.026717,14
2,0.061626,0.005907,0.004366,0.000701,gbdt,10,mape,"{'boosting_type': 'gbdt', 'n_estimators': 10, ...",0.6473927,0.6041396,0.5903206,0.5294576,0.60995,0.5962521,0.038372,13
3,0.231295,0.030106,0.005469,0.000733,gbdt,50,regression,"{'boosting_type': 'gbdt', 'n_estimators': 50, ...",0.8725555,0.8428396,0.8727317,0.7972607,0.8341627,0.84391,0.028013,3
4,0.213132,0.024753,0.005068,0.000971,gbdt,50,poisson,"{'boosting_type': 'gbdt', 'n_estimators': 50, ...",0.8712959,0.8492411,0.8794349,0.7929742,0.8312977,0.8448488,0.03095,1
5,0.20716,0.015206,0.005466,0.000731,gbdt,50,mape,"{'boosting_type': 'gbdt', 'n_estimators': 50, ...",0.8427146,0.7829405,0.8069973,0.753806,0.8110491,0.7995015,0.02973,9
6,0.4293,0.009751,0.005921,0.000126,gbdt,100,regression,"{'boosting_type': 'gbdt', 'n_estimators': 100,...",0.8676204,0.8338896,0.8593166,0.7857975,0.8229539,0.8339156,0.02904,4
7,0.427269,0.009108,0.005973,0.001274,gbdt,100,poisson,"{'boosting_type': 'gbdt', 'n_estimators': 100,...",0.8740265,0.8419867,0.8768896,0.7909996,0.8358223,0.8439449,0.031187,2
8,0.439639,0.003721,0.005239,0.000962,gbdt,100,mape,"{'boosting_type': 'gbdt', 'n_estimators': 100,...",0.8538373,0.7940176,0.8284338,0.7827853,0.8221317,0.8162411,0.025338,6
9,0.042543,0.002429,0.004014,2e-06,dart,10,regression,"{'boosting_type': 'dart', 'n_estimators': 10, ...",0.6943515,0.6981256,0.6996148,0.6280041,0.649614,0.673942,0.029539,12


In [22]:
age=int(input("Enter your age :"))
bmi=float(input("Enter your BMI : "))
children=int(input("Enter Children count:"))
Gen=int(input("Enter the gander if male enter 1, if female enter 0 :"))
smok=int(input("Enter if somker enter 1, if no somker enter 0 :"))

Enter your age :22
Enter your BMI : 28.900
Enter Children count:2
Enter the gander if male enter 1, if female enter 0 :0
Enter if somker enter 1, if no somker enter 0 :1


In [23]:
result=grid.predict([[age,bmi,children,Gen,smok]])

In [24]:
result

array([19160.17536507])