In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the insurance data; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('insurance.csv')

In [3]:
# Preview the first rows (head() defaults to five).
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# Pairwise correlations between the numeric columns only. The string columns
# (sex, smoker, region) are excluded explicitly because pandas >= 2.0 no
# longer drops non-numeric columns silently and raises on them instead.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [6]:
# Per-column missing-value check: True marks a column with at least one NaN.
dataset.isna().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
def one_hot_encoder_one(data,feature,keep_first=True):
    """Replace one categorical column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the column to encode.
    keep_first : bool, default True
        When false, the indicator of the first category (sorted order for
        plain string columns) is left out, so the remaining indicators are
        not perfectly collinear.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``feature``, followed by float 0/1 columns named
        ``<feature>_<category>``, on the same index as ``data``.

    Notes
    -----
    Missing values in ``feature`` get all-zero indicators (the
    ``pandas.get_dummies`` default).
    """
    # get_dummies builds the indicators on data's own index, so the concat
    # below cannot misalign rows, and it keeps the full category label even
    # when the label itself contains underscores.
    indicator_columns = pd.get_dummies(
        data[feature],
        prefix=feature,
        drop_first=not keep_first,
        dtype=float,
    )

    return pd.concat([data.drop(columns=[feature]), indicator_columns], axis=1)

In [9]:
# Keep only the non-numeric columns; the int/float columns are scaled separately.
numeric_columns = dataset.select_dtypes(include=['int', 'float']).columns
encoded_set = dataset.drop(columns=numeric_columns)

# Swap every remaining string column for its one-hot indicator columns.
for categorical_column in encoded_set.select_dtypes(include='object').columns:
    encoded_set = one_hot_encoder_one(encoded_set, categorical_column)



In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
def scale_feature(data,feature):
    """Standardize one column and replace it with ``scaled_<feature>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the numeric column to standardize.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``feature``, followed by a ``scaled_<feature>``
        column holding the ``StandardScaler`` output, on the same index as
        ``data``.

    Notes
    -----
    The fitted scaler is local to this call and is not returned, so the
    transformation cannot be inverted afterwards.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(data[[feature]])

    # Reuse the caller's index: a fresh RangeIndex would make the concat below
    # align on labels and introduce NaN rows for any non-default index.
    scaled_df = pd.DataFrame(
        scaled_values,
        columns=[f'scaled_{feature}'],
        index=data.index,
    )

    return pd.concat([data.drop(columns=[feature]), scaled_df], axis=1)
    

In [12]:
# Keep only the numeric columns; the string columns are one-hot encoded separately.
scaled_set = dataset.drop(columns=dataset.select_dtypes(include=['object']).columns)

# Standardize each numeric column in turn (this includes the `charges` target).
# NOTE(review): the scaling statistics come from the full dataset, before the
# train/test split further down, so test rows influence the scaling.
for numeric_column in scaled_set.select_dtypes(include=['int', 'float']).columns:
    scaled_set = scale_feature(scaled_set, numeric_column)

In [13]:
# Put the one-hot encoded and the scaled columns side by side; rows are
# matched on the index labels.
new_dataset = pd.concat([encoded_set, scaled_set], axis=1)

In [14]:
# The standardized charges are the regression target; everything else is a predictor.
target = new_dataset['scaled_charges']
features = new_dataset.drop(columns=['scaled_charges'])

In [15]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split; the fixed random_state keeps the partition reproducible.
(feature_trainset, feature_testset,
 target_trainset, target_testset) = train_test_split(
    features,
    target,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Inspect the held-out target values (standardized charges).
target_testset

764    -0.344914
887    -0.660713
890     1.326718
1293   -0.327829
259     1.691784
          ...   
701    -0.308020
672    -0.732783
1163   -0.914428
1103   -0.157543
1295   -0.933927
Name: scaled_charges, Length: 402, dtype: float64

In [17]:
from sklearn.linear_model import LinearRegression
# Linear regression model with default settings.
lr = LinearRegression()

In [18]:
# Fit the linear model on the training split only.
lr.fit(feature_trainset, target_trainset)

LinearRegression()

In [19]:
# Predict the standardized charges for the held-out test rows.
predictions = lr.predict(feature_testset)

In [20]:
# One row per test sample: column 0 is the prediction, column 1 the true value.
values = np.column_stack((predictions, target_testset.values))

In [21]:
# Side-by-side view of the first 20 predictions and their expected values.
pd.DataFrame(values, columns=['Predicted', 'Expected']).head(20)

Unnamed: 0,Predicted,Expected
0,-0.351415,-0.344914
1,-0.516332,-0.660713
2,1.949564,1.326718
3,-0.309933,-0.327829
4,1.132016,1.691784
5,-0.180875,-0.721505
6,-1.099588,-0.921325
7,0.32397,0.07766
8,-1.015694,-0.787891
9,-0.160296,-0.248316


In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def calculate_metrics(expected_values, predicted_values):
    """Summarize regression quality as a one-row DataFrame.

    Parameters
    ----------
    expected_values : array-like
        Ground-truth target values.
    predicted_values : array-like
        Model predictions, in the same order as ``expected_values``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``r2``, ``MAE`` and ``RMSE``.
    """
    r2 = r2_score(expected_values, predicted_values)
    mae = mean_absolute_error(expected_values, predicted_values)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(expected_values, predicted_values))

    scores = {'r2': r2, 'MAE': mae, 'RMSE': rmse}
    return pd.DataFrame([scores])

In [23]:
# Score the current `predictions` (the linear model's, when cells run in order)
# on the test split. The target is the standardized charges, so MAE and RMSE
# are in standardized units rather than in the original charge units.
evaluation = calculate_metrics(expected_values=target_testset, predicted_values=predictions)
evaluation

Unnamed: 0,r2,MAE,RMSE
0,0.769612,0.342444,0.480121


In [24]:
from sklearn.svm import SVR
# Support vector regressor with default hyperparameters.
svr = SVR()

In [25]:
# Fit the SVR on the same training split used for the linear model.
svr.fit(feature_trainset, target_trainset)

SVR()

In [26]:
# Predict with the SVR. This rebinds `predictions`, so the linear model's
# predictions are no longer available under that name after this cell.
predictions = svr.predict(feature_testset)

In [27]:
# Score the current `predictions` (the SVR's, when cells run in order) with the
# same metrics; this rebinds `evaluation`, replacing the earlier result.
evaluation = calculate_metrics(expected_values=target_testset, predicted_values=predictions)
evaluation

Unnamed: 0,r2,MAE,RMSE
0,0.857353,0.195358,0.377791
