In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# read the insurance records from the CSV file into a pandas DataFrame
insurance_dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

In [3]:
# peek at the first five records to check the columns were parsed as expected
insurance_dataset.head(n=5)

Unnamed: 0,age,sex,bmi,smoker,region,children,charges
0,21.0,male,25.745,no,northeast,2,3279.86855
1,36.976978,female,25.744165,yes,southeast,3,21454.49424
2,18.0,male,30.03,no,southeast,1,1720.3537
3,37.0,male,30.676891,no,northeast,3,6801.437542
4,58.0,male,32.01,no,southeast,1,11946.6259


In [4]:
# (number of rows, number of columns) of the loaded DataFrame
insurance_dataset.shape

(3630, 7)

In [5]:
# summary of the dataset: column names, non-null counts, dtypes and memory usage
insurance_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3630 entries, 0 to 3629
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       3630 non-null   float64
 1   sex       3630 non-null   object 
 2   bmi       3630 non-null   float64
 3   smoker    3630 non-null   object 
 4   region    3630 non-null   object 
 5   children  3630 non-null   int64  
 6   charges   3630 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 198.6+ KB


In [10]:
# count the missing (NaN) entries in every column
insurance_dataset.isna().sum()

age         0
sex         0
bmi         0
smoker      0
region      0
children    0
charges     0
dtype: int64

In [9]:
# statistical measures of the dataset (count, mean, std, min, quartiles, max)
# NOTE(review): the recorded output lists sex/smoker/region, which are only
# numeric once the encoding cell has run — so this cell was evidently executed
# after that cell. Re-run the notebook top to bottom to see what a fresh
# kernel shows here.
insurance_dataset.describe()

Unnamed: 0,age,sex,bmi,smoker,region,children,charges
count,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0
mean,38.887036,0.441047,30.629652,0.84573,1.45427,2.503581,12784.808644
std,12.151029,0.496581,5.441307,0.361257,1.145746,1.712568,10746.166743
min,18.0,0.0,15.96,0.0,0.0,0.0,1121.8739
25%,29.0,0.0,26.694526,1.0,0.0,1.0,5654.818262
50%,39.170922,0.0,30.2,1.0,1.0,3.0,9443.807221
75%,48.343281,1.0,34.1,1.0,3.0,4.0,14680.407505
max,64.0,1.0,53.13,1.0,3.0,5.0,63770.42801


In [8]:
# Encode the categorical columns as integers so the regression model can use
# them. One nested mapping ({column: {old_value: new_value}}) covers all three
# columns in a single replace() call. The name is rebound instead of using
# inplace=True, and the cell stays safe to re-run: already-encoded values match
# no key and are left unchanged.
# NOTE: smoker is encoded 'yes' -> 0 and 'no' -> 1, the reverse of the common
# "1 = smoker" convention — keep that in mind when building model inputs.
# NOTE(review): region codes are arbitrary integers that a linear model will
# treat as ordered; one-hot encoding may be more appropriate — confirm intent.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}
insurance_dataset = insurance_dataset.replace(category_codes)

In [11]:
# target vector: the insurance charges the model should learn to predict
Y = insurance_dataset['charges']
# feature matrix: every remaining column
X = insurance_dataset.drop(columns=['charges'])

In [12]:
# preview the feature matrix (every column except 'charges')
print(X)

            age  sex        bmi  smoker  region  children
0     21.000000    0  25.745000       1       2         2
1     36.976978    1  25.744165       0       0         3
2     18.000000    0  30.030000       1       0         1
3     37.000000    0  30.676891       1       2         3
4     58.000000    0  32.010000       1       0         1
...         ...  ...        ...     ...     ...       ...
3625  48.820767    1  41.426984       1       3         4
3626  38.661977    1  26.202557       1       0         2
3627  56.000000    0  40.300000       1       1         0
3628  48.061207    1  34.930624       1       0         1
3629  37.598865    1  25.219233       1       2         3

[3630 rows x 6 columns]


In [13]:
# preview the target vector (the 'charges' column)
print(Y)

0        3279.868550
1       21454.494240
2        1720.353700
3        6801.437542
4       11946.625900
            ...     
3625    10987.324960
3626    11735.844350
3627    10602.385000
3628     8976.140452
3629     7027.698968
Name: charges, Length: 3630, dtype: float64


In [14]:
# hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [15]:
# shapes of the full feature matrix and of its train / test partitions
print(*(features.shape for features in (X, X_train, X_test)))

(3630, 6) (2904, 6) (726, 6)


In [16]:
# create a Linear Regression model with its default settings (not yet fitted)
regressor = LinearRegression()

In [17]:
# fit the model on the training split only; the test split stays unseen
regressor.fit(X=X_train, y=Y_train)

LinearRegression()

In [18]:
# in-sample predictions: what the fitted model outputs for the training rows
training_data_prediction = regressor.predict(X=X_train)

In [29]:
# R squared (coefficient of determination) of the predictions on the training split
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R squared value : ', r2_train)

R squared vale :  0.6969959791150353


In [20]:
# out-of-sample predictions: what the fitted model outputs for the held-out rows
test_data_prediction = regressor.predict(X=X_test)

In [27]:
# R squared (coefficient of determination) of the predictions on the held-out test split
r2_test = metrics.r2_score(Y_test, test_data_prediction)
print('R squared value : ', r2_test)

R squared vale :  0.7548725700903745


In [30]:
# Predict the charge for one hypothetical applicant.
# The values are positional and must follow the training column order
# (age, sex, bmi, smoker, region, children). With this notebook's encoding,
# sex=1 is female, smoker=0 is 'yes' and region=1 is southwest.
# NOTE(review): smoker=0 makes this applicant a smoker — confirm that is the
# intended example and that the tuple was not written for another column order.
input_data = (31,1,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# One row, labelled with the training columns. The model was fitted on a
# DataFrame, so passing a bare ndarray triggers scikit-learn's
# "X does not have valid feature names" warning and hides any column-order
# mistake; a labelled frame avoids both.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1), columns=X.columns
)

prediction = regressor.predict(input_data_reshaped)

print('The insurance cost is USD ', prediction[0])

The insurance cost is USD  26179.117836221296
