In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn import metrics

In [2]:
# Load the insurance data from a CSV file located relative to the notebook's working directory.
dataset=pd.read_csv("insurance_pre.csv")
dataset  # bare last expression so the notebook renders the loaded frame

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
#Convert the categorical columns (sex and smoker) to numeric indicator columns using one-hot encoding.
#drop_first is left at its default (False), so both indicator columns of each category are kept
#(sex_female/sex_male and smoker_no/smoker_yes); the feature selection in the next step uses all four.
#dtype=int keeps the indicators as 0/1 integers on every pandas version (pandas 2.x defaults to True/False).
dataset=pd.get_dummies(dataset,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes
0,19,27.900,0,16884.92400,1,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0
2,28,33.000,3,4449.46200,0,1,1,0
3,33,22.705,0,21984.47061,0,1,1,0
4,32,28.880,0,3866.85520,0,1,1,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0
1334,18,31.920,0,2205.98080,1,0,1,0
1335,18,36.850,0,1629.83350,1,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0


In [5]:
#Input/output split
#The data is already encoded, so the remaining step is to separate the inputs from the output.
#"independent" holds the seven input (feature) columns; the output column is handled separately.
independent=dataset.loc[
    :,
    ['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no', 'smoker_yes'],
]
independent

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes
0,19,27.900,0,1,0,0,1
1,18,33.770,1,0,1,1,0
2,28,33.000,3,0,1,1,0
3,33,22.705,0,0,1,1,0
4,32,28.880,0,0,1,1,0
...,...,...,...,...,...,...,...
1333,50,30.970,3,0,1,1,0
1334,18,31.920,0,1,0,1,0
1335,18,36.850,0,1,0,1,0
1336,21,25.800,0,1,0,1,0


In [6]:
#"dependent" is the output (target) variable: the 'charges' column.
#Selecting with a one-element list keeps it as a one-column DataFrame rather than a Series.
dependent=dataset.loc[:, ["charges"]]
dependent
#Inputs (independent) and output (dependent) are now held in separate variables.

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
#Split the data into a training set and a test set: 30% of the rows are held out for testing,
#and random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [8]:
X_train #display the training inputs returned by train_test_split

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes
1163,18,28.215,0,1,0,1,0
196,39,32.800,0,1,0,1,0
438,52,46.750,5,1,0,1,0
183,44,26.410,0,1,0,1,0
1298,33,27.455,2,0,1,1,0
...,...,...,...,...,...,...,...
763,27,26.030,0,0,1,1,0
835,42,35.970,2,0,1,1,0
1216,40,25.080,0,0,1,1,0
559,19,35.530,0,0,1,1,0


In [9]:
#Model creation
#LGBMRegressor comes from the LightGBM library (not scikit-learn), but it follows the scikit-learn
#estimator API, so it is trained with fit() and used with predict().
from lightgbm import LGBMRegressor
#NOTE(review): num_leaves=10000 is far above LightGBM's default of 31. It is kept unchanged so the
#recorded score and prediction stay comparable; consider tuning it to limit overfitting.
regressor=LGBMRegressor(num_leaves=10000,n_estimators=100,learning_rate=0.1,random_state=0)
#fit() expects a 1-D target. y_train is a one-column DataFrame, so it is flattened here to avoid
#scikit-learn's "column-vector y was passed" DataConversionWarning; the target values are unchanged.
regressor.fit(X_train,y_train.values.ravel()) #last expression, so the fitted estimator is shown as the cell output

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=10000, objective=None,
              random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [10]:
#Evaluation (using the R2 metric): first generate predictions for the held-out test inputs.
y_pred=regressor.predict(X_test) #y_pred holds the fitted model's predictions for the rows of X_test.
#Evaluation is done on the test set because the model was fitted on X_train/y_train only, so the test rows act as unseen data.
#y_test is the actual output and y_pred is the predicted output for X_test; comparing the two measures model quality.

In [11]:
from sklearn.metrics import r2_score #r2_score computes the R2 (coefficient of determination) metric.
r_score=r2_score(y_test,y_pred) #Arguments: the actual output (y_test) first, then the predicted output (y_pred).
#An R2 close to 1 indicates a good model; an R2 close to 0 indicates a poor one.
#The score is computed on the test set, so it checks how well the model performs on rows it was not trained on.

In [12]:
r_score #display the R2 score computed on the test set

0.8651583120628885

In [13]:
#The trained model is saved with the pickle module.
import pickle
filename="finalized_model_XGBoosting_assignment.sav" #File name for the saved model. This line only stores the name; nothing is written to disk yet.
#pickle does not require the .sav extension; it is only a naming choice.
#NOTE(review): the name says "XGBoosting" but the model trained in this notebook is a LightGBM LGBMRegressor - if renamed, update the hard-coded name in the load cell as well.

In [14]:
#Serialize the fitted model to disk. The with-block guarantees the file handle is flushed and
#closed, even if pickling raises an exception.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [15]:
#The model is saved; reload it and confirm it still predicts before moving on to deployment.
#NOTE: pickle.load can execute arbitrary code, so only load model files you created or trust.
with open("finalized_model_XGBoosting_assignment.sav",'rb') as model_file: #'rb' = read in binary mode; the with-block closes the file afterwards
    loaded_model=pickle.load(model_file)
#Predict the insurance charges for a single person. predict() needs 2-D input even for one sample,
#hence the nested list (one row). Naming the columns makes the feature order explicit; it must match
#the columns the model was trained on.
#These values are the same as row 1333 of the dataset, whose actual charges are 10600.54830.
sample=pd.DataFrame(
    [[50,30.970,3,0,1,1,0]],
    columns=['age','bmi','children','sex_female','sex_male','smoker_no','smoker_yes'],
)
result=loaded_model.predict(sample)

In [16]:
result #display the prediction returned by the reloaded model

array([11696.98739623])