In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline, directly beneath the cell that creates them.
# Without this magic, some front ends open each figure in a separate window.
%matplotlib inline

In [None]:
# load the dataset 

df = pd.read_csv("insurance.csv")

In [None]:
df.head()

In [None]:
# missing value 
df.isnull().sum()

In [None]:
# check duplicate and remove them 
df = df.drop_duplicates()

In [None]:
df.shape

In [None]:
# Numeric columns to inspect; each one gets its own histogram / box-plot pair.
x_axis = ['age', 'bmi', 'children', 'expenses']

# The loop variable is deliberately not called `x`: a later cell stores the
# feature matrix under that name, and re-running this cell afterwards would
# silently overwrite it with the string 'expenses'.
for col in x_axis:
    fig, axes = plt.subplots(1, 2, figsize=(18, 4))

    # Left panel: distribution of the column as a 20-bin histogram
    # (histplot replaces the deprecated distplot).
    sns.histplot(df[col], ax=axes[0], bins=20)
    axes[0].set_title(f'Distribution of {col}')

    # Right panel: horizontal box plot with the mean marked. The values are
    # passed by keyword because the meaning of the first positional argument
    # of sns.boxplot has changed between seaborn releases.
    sns.boxplot(x=df[col], ax=axes[1], orient='h', showmeans=True, color='pink')
    axes[1].set_title(f'Box plot of {col}')

### handle categorical columns 

In [None]:
df.head()

In [None]:
df.sex.unique()

In [None]:
df['sex'] = df.sex.map({'female':0, 'male':1})

In [None]:
df.head()

In [None]:
df.smoker.unique()

In [None]:
df['smoker'] = df.smoker.map({'yes':1, 'no':0})

In [None]:
df.head()

In [None]:
df.region.unique()

In [None]:
df['region'] = df.region.map({'southwest':1,'southeast':2,'northwest':3,'northeast':4})

In [None]:
df.head()

In [None]:
df.info()

### Split the dataset into features (x) and target (y)

In [None]:
df.columns

In [None]:
x = df.drop(['expenses'], axis = 1 )

In [None]:
x.head()

In [None]:
y = df[['expenses']]

In [None]:
y.head()

### train test split 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [None]:
x_train.shape

In [None]:
x_test.shape

#### Model training

In [None]:
# linear regression 
from sklearn.linear_model import LinearRegression

In [None]:
lr=LinearRegression()

In [None]:
lr.fit(x_train, y_train)

In [None]:
y_pred = lr.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
score1 = r2_score(y_test, y_pred)

In [None]:
score1

### support vector regression 

In [None]:
from sklearn.svm import SVR

In [None]:
svm = SVR()

In [None]:
svm.fit(x_train, y_train)

In [None]:
y_pred = svm.predict(x_test)
from sklearn.metrics import r2_score
score1 = r2_score(y_test, y_pred)

In [None]:
score1

### random forest regression 

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [None]:
rf.fit(x_train, y_train)

In [None]:
y_pred3 = rf.predict(x_test)
score3 = r2_score(y_test, y_pred3)

In [None]:
score3

### final model is random forest regressor

In [None]:
# Fit the chosen model on the training split (the same call as in the
# random-forest section; a forest created without a fixed random_state may
# end up with different trees than the ones scored there).
rf.fit(X=x_train, y=y_train)

# Predict charges for a new customer

In [None]:
# Feature values for one hypothetical customer, using the encodings applied to
# the training data (sex 1 = male, smoker 1 = yes, region 2 = southeast).
# The keys are listed in the same order as the training feature columns.
data = dict(
    age=40,
    sex=1,
    bmi=40,
    children=3,
    smoker=1,
    region=2,
)

# One-row frame (index 0) in the shape the model's predict() expects.
new_df = pd.DataFrame(data=data, index=[0])
new_df

In [None]:
# Predict expenses for the one-row frame built in the previous cell;
# predict() returns an array, so show its single element.
pred = rf.predict(new_df)
pred[0]

In [None]:
# A second hypothetical customer, using the same encodings
# (sex 0 = female, smoker 1 = yes, region 1 = southwest).
data2 = dict(
    age=19,
    sex=0,
    bmi=27.9,
    children=0,
    smoker=1,
    region=1,
)

# Rebinds new_df to a one-row frame (index 0) for this customer.
new_df = pd.DataFrame(data=data2, index=[0])
new_df

In [None]:
# Predict expenses for the second customer (new_df was rebound in the previous
# cell); predict() returns an array, so show its single element.
pred = rf.predict(new_df)
pred[0]

### save model 

In [None]:
import joblib

# Persist the fitted random forest to disk so it can be reloaded later
# with joblib.load().
MODEL_PATH = 'random_forest_regressor.pkl'
joblib.dump(rf, MODEL_PATH)