In [None]:
# Data collection: read the insurance records from the CSV file into a DataFrame.
import pandas as pd

dataset = pd.read_csv("insurance_pre.csv")
dataset

In [None]:
# Pre-processing: one-hot encode the nominal columns as 0/1 integer columns.
# drop_first=True drops the first level of each category, so a k-level
# column becomes k-1 indicator columns.
dataset = pd.get_dummies(
    dataset,
    drop_first=True,
    dtype=int,
)
dataset

In [None]:
# Show the column labels after encoding; the feature/target selection in the
# next cell refers to these names.
dataset.columns

In [None]:
# Independent variables (model inputs) and the dependent variable (target).
# Both selections use a list of labels, so both stay two-dimensional DataFrames.
indep = dataset[
    [
        'age',
        'bmi',
        'children',
        'sex_male',
        'smoker_yes',
    ]
]
dep = dataset[['charges']]
dep

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    indep,
    dep,
    test_size=0.3,
    random_state=0,
)
X_train

In [None]:
# Standardisation of the features. The scaler is fitted on the training split
# and that same fitted scaler is then applied to both splits, so the test
# rows do not influence the scaling parameters.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_test

In [None]:
# Standardise the target with its own scaler, fitted on the training split.
# From here on Y_train / Y_test (and therefore the model's predictions) are
# in standardised units; scy.inverse_transform maps them back.
scy = StandardScaler().fit(Y_train)
Y_train = scy.transform(Y_train)
Y_test = scy.transform(Y_test)
Y_test

In [None]:
# Model creation: cross-validated grid search over random-forest settings.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# 'poisson' is deliberately not searched: the target was standardised above,
# so it contains negative values, and RandomForestRegressor refuses to fit
# with criterion='poisson' on negative targets. Every such candidate would
# fail and be scored NaN, wasting a quarter of the search.
param_grid = {
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2', None],
}

grid = GridSearchCV(
    # Fixed seed so the search (and the selected best_params_) is reproducible.
    RandomForestRegressor(random_state=0),
    param_grid,
    refit=True,   # refit the best configuration on the whole training split
    verbose=5,
    n_jobs=-1,    # use all available cores
)

# The scaled target is an (n, 1) column; flatten it to the 1-D shape the
# forest expects, which avoids a DataConversionWarning on every fit.
grid.fit(X_train, np.ravel(Y_train))  # training the model

In [None]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Predict the held-out rows with the refitted best estimator. The model was
# trained on the scaled target, so these values are in the same scaled units.
y_pred = grid.predict(X_test)
y_pred

In [None]:
# Evaluate on the held-out split with the coefficient of determination.
from sklearn.metrics import r2_score

# r2_score's signature is (y_true, y_pred) and the metric is NOT symmetric:
# the denominator is the variance of the first argument. The ground truth
# must therefore come first.
r2 = r2_score(Y_test, y_pred)
print(f"R2 score = {r2:.2f}")

In [None]:
# Tabulate the cross-validation results: one row per parameter combination.
# NOTE(review): the name `re` shadows the standard-library module of the same
# name; kept as-is in case later cells refer to it.
re = grid.cv_results_
table = pd.DataFrame(re)
table

In [None]:
# Save the fitted grid-search object (it carries the refitted best model).
# NOTE(review): only `grid` is written here. The model expects inputs scaled
# by `sc` and returns predictions in the units produced by `scy`, so anything
# loading this file also needs those two scalers - confirm they are persisted
# elsewhere.
import pickle

filename = "final_model.sav"
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() previously left the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)
