In [8]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

# ## Regression Project for Medical Insurance Forecast
 The aim here will be to predict the medical costs billed by health insurance on an individual given some or all of the independent variables of the dataset. Since the cost to be predicted is a continuous variable, it is pretty natural that regression is to be applied in its truest form ** (i.e., without the decision boundary as in regression-based classification). Therefore, you could choose to implement polynomial, multiple linear regression, or even Elastic Net Regression. **Exploratory data analysis can be an essential step (even in this case despite the limited features). You will observe patterns, like the decreased tendency to smoke among those having children, helping you achieve reasonable feature selection and simpler models.

# Take data and do some formanlity

In [9]:
import os
import kagglehub
import shutil

try:
    # Define the desired path using a raw string
    desired_path = r"D:\Machine Learning projects\medical-charges-mlops\research"
    
    # Ensure the directory exists
    os.makedirs(desired_path, exist_ok=True)

    # Download the dataset
    path = kagglehub.dataset_download("teertha/ushealthinsurancedataset")
    
    # Print the original path to dataset files
    print("Path to dataset files:", path)

    # Move files to the desired location with overwrite handling
    for filename in os.listdir(path):
        full_file_name = os.path.join(path, filename)
        if os.path.isfile(full_file_name):
            dest_file = os.path.join(desired_path, filename)
            if os.path.exists(dest_file):
                os.remove(dest_file)  # Overwrite the existing file
            shutil.move(full_file_name, desired_path)

    print("Files moved to:", desired_path)

except OSError as e:
    print(f"OS error: {e}")
except shutil.Error as e:
    print(f"Error moving file: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")


Path to dataset files: C:\Users\DELL\.cache\kagglehub\datasets\teertha\ushealthinsurancedataset\versions\1
Files moved to: D:\Machine Learning projects\medical-charges-mlops\research


In [10]:
medical_df=pd.read_csv("insurance.csv")

In [11]:
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [12]:
n_rows=len(medical_df.axes[0])

In [13]:
n_cols=len(medical_df.axes[1])

In [14]:
print(n_rows,n_cols,medical_df.shape)

1338 7 (1338, 7)


In [15]:
medical_df.head(8)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056


# Visualisation of Data

In [16]:
fig=px.histogram(medical_df,
                 x='age',
                 y='charges',
                 marginal='box',
                 color='smoker',
                 title='AGE vs CHARGES ')
fig.update_layout(bargap=0.1)
fig.show()

In [17]:
fig=px.histogram(medical_df,
                 x='region',
                 y='charges',
                 marginal='box',
                 color='smoker',
                 title='AGE vs CHARGES ')
fig.update_layout(bargap=0.1)
fig.show()

In [18]:
fig=px.histogram(medical_df,
                 x='region',
                 y='charges',
                 marginal='box',
                 color='sex',
                 title='AGE vs CHARGES ')
fig.update_layout(bargap=0.1)
fig.show()

In [19]:
medical_df.columns.tolist()

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [20]:
fig = px.scatter_matrix(medical_df,
                        dimensions=['age', 'bmi', 'smoker', 'charges'],
                        color='sex',
                        title='Relationship between age sex  bmi  children smoker region, charges')
fig.show()

In [21]:
print("  It means Charges Is directly proportional to age , increasing bmi ,smoker... ")

  It means Charges Is directly proportional to age , increasing bmi ,smoker... 


# Preprocessing the Data
  **
1.    Identify numeric and categorical input columns.
2.    Impute (fill) missing values in numeric columns
3.    Scale values in numeric columns to a (0,1)(0,1) range.
4.    Encode categorical data into one-hot vectors.
5.    Split the dataset into training and validation sets
**

# Identify numeric and categorical input columns.

In [22]:
# now we should divide data for preprocessing by delete key data from input  data

In [23]:
columns=medical_df.columns.tolist()
columns

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [24]:
input_cols=['age', 'sex', 'bmi', 'children', 'smoker', 'region']

In [25]:
input_df=medical_df[input_cols]
input_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [26]:
numeric_cols=input_df.select_dtypes(include=np.number).columns.tolist()
numeric_cols

['age', 'bmi', 'children']

In [27]:
categorical_cols=input_df.select_dtypes('object').columns.tolist()

In [28]:
target_col=['charges']
target_df=medical_df[target_col]

In [29]:
target_df.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


# Imputing  Data

In [30]:
input_df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [31]:
print("so That's it 😊😊😊")

so That's it 😊😊😊


# Scaling Numerical Data

In [32]:
input_df[numeric_cols].describe()

Unnamed: 0,age,bmi,children
count,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918
std,14.04996,6.098187,1.205493
min,18.0,15.96,0.0
25%,27.0,26.29625,0.0
50%,39.0,30.4,1.0
75%,51.0,34.69375,2.0
max,64.0,53.13,5.0


In [33]:
input_df[numeric_cols].describe().loc[['min' ,'max']]

Unnamed: 0,age,bmi,children
min,18.0,15.96,0.0
max,64.0,53.13,5.0


In [34]:
from sklearn.preprocessing import MinMaxScaler

In [35]:
scaler=MinMaxScaler()

In [36]:
scaler.fit(input_df[numeric_cols])

In [37]:
input_df[numeric_cols]=scaler.transform(input_df[numeric_cols])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
input_df[numeric_cols].describe().loc[['min', 'max']]

Unnamed: 0,age,bmi,children
min,0.0,0.0,0.0
max,1.0,1.0,1.0


# Encoding the categorical Data

In [39]:
input_df[categorical_cols].nunique()

sex       2
smoker    2
region    4
dtype: int64

In [47]:
from sklearn.preprocessing import OneHotEncoder

In [48]:

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


In [49]:
encoder.fit(input_df[categorical_cols])

In [50]:
encoded_cols=list(encoder.get_feature_names_out(categorical_cols))

In [51]:
encoded_cols

['sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest']

In [52]:
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [53]:
input_df[encoded_cols]=encoder.transform(input_df[categorical_cols])

In [54]:
input_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,female,0.321227,0.0,yes,southwest,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,male,0.47915,0.2,no,southeast,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.217391,male,0.458434,0.6,no,southeast,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.326087,male,0.181464,0.0,no,northwest,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,male,0.347592,0.0,no,northwest,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [55]:
print("Now for training data we have to be removed cols which would be encodeed")

Now for training data we have to be removed cols which would be encodeed


In [56]:
input_df=input_df.drop(['sex','smoker','region'],axis=1)

In [57]:
input_df.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.321227,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.47915,0.2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.217391,0.458434,0.6,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.326087,0.181464,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,0.347592,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


#  Training and Validation Set

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
X_train,X_valid,y_train,y_valid=train_test_split(input_df,target_df,test_size=0.2,random_state=42)

In [60]:
X_train.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
560,0.608696,0.107345,0.4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1285,0.630435,0.224913,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1142,0.73913,0.23944,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
969,0.456522,0.493947,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
486,0.782609,0.148238,0.6,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [61]:
X_valid.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
764,0.586957,0.247915,0.4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
887,0.391304,0.378262,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
890,1.0,0.29392,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1293,0.608696,0.26325,0.6,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
259,0.021739,0.429379,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [62]:
y_train.head()

Unnamed: 0,charges
560,9193.8385
1285,8534.6718
1142,27117.99378
969,8596.8278
486,12475.3513


In [63]:
y_valid.head()

Unnamed: 0,charges
764,9095.06825
887,5272.1758
890,29330.98315
1293,9301.89355
259,33750.2918


# Model Training by polynomial Regression, multiple linear regression,Elastic Net Regression


In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

In [65]:
lr_model=LinearRegression()
en_model=ElasticNet(alpha=1.0, l1_ratio=0.5)

In [66]:
lr_model.fit(X_train,y_train)
en_model.fit(X_train,y_train)

In [67]:
print(lr_model.coef_,"  :- ", lr_model.intercept_)

[[ 1.18565984e+04  1.19927409e+04  1.57456099e+03  4.50474613e+16
   4.50474613e+16 -9.72166218e+16 -9.72166218e+16 -1.01162627e+18
  -1.01162627e+18 -1.01162627e+18 -1.01162627e+18]]   :-  [1.06379543e+18]


In [68]:
print(en_model.coef_,"  :- ", en_model.intercept_)

[ 1836.25078098   699.21943983   338.48043274  -210.98771879
   210.93497274 -4581.48217616  4581.46476101    -9.19449248
  -166.13800492   350.83015537  -173.49503209]   :-  [14828.39624444]


In [69]:
answer_lr=lr_model.predict(X_valid)
answer_en=en_model.predict(X_valid)

In [70]:
answer_lr.shape

(268, 1)

In [71]:
answer_en.shape

(268,)

In [72]:
answer_en=answer_en.reshape(268,1)

In [73]:
def rmse(ya,yp):
  return np.sqrt(np.mean(np.square(ya-yp)))

In [74]:
rmse(y_valid,answer_lr)

5804.187829167601

In [75]:
rmse(y_valid,answer_en)

9412.870421729178

In [76]:
print("RMSE error of LinearRegression :- ",rmse(y_valid,answer_lr),"\n RMSE error of Elastic Net Regression :- ",rmse(y_valid,answer_en))

RMSE error of LinearRegression :-  5804.187829167601 
 RMSE error of Elastic Net Regression :-  9412.870421729178


In [77]:
print("SO IT MEANS LINEAR REGRESSION IS BETTER THEN Elastic Net Regression ")

SO IT MEANS LINEAR REGRESSION IS BETTER THEN Elastic Net Regression 
