In [1]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import r2_score

In [2]:
#Import dataset
df = pd.read_csv("insurance.csv")

In [3]:
#Dataframe head
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [5]:
#Checking for NA values
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
df.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [8]:
y = df['charges']
del df['charges']

In [9]:
cat_var = ['sex','smoker','region']
cat_df = df[cat_var]

In [10]:
cat_df.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [11]:
#One hot encoding categorical values
cat_df = pd.get_dummies(cat_df)

In [12]:
cat_df.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0


In [13]:
conti_var = list(set(df.columns) - set(cat_var))
conti_var

['bmi', 'age', 'children']

In [14]:
conti_df = df[conti_var]

In [15]:
conti_df.head()

Unnamed: 0,bmi,age,children
0,27.9,19,0
1,33.77,18,1
2,33.0,28,3
3,22.705,33,0
4,28.88,32,0


In [16]:
#Normalizing values 
x = conti_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
conti_df = pd.DataFrame(x_scaled)


In [17]:
conti_df.head()

Unnamed: 0,0,1,2
0,0.321227,0.021739,0.0
1,0.47915,0.0,0.2
2,0.458434,0.217391,0.6
3,0.181464,0.326087,0.0
4,0.347592,0.304348,0.0


In [18]:
X = pd.concat([cat_df,conti_df],axis=1)
print(cat_df.size,conti_df.size,X.size)


10704 4014 14718


In [19]:
y = X.iloc[:,-1:]
y

Unnamed: 0,2
0,0.0
1,0.2
2,0.6
3,0.0
4,0.0
...,...
1333,0.6
1334,0.0
1335,0.0
1336,0.0


In [20]:
print(X.shape, y.shape)

(1338, 11) (1338, 1)


In [21]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Converting to tensors for ML
x_train = torch.from_numpy(X_train.to_numpy()).float()
x_test = torch.from_numpy(X_test.to_numpy()).float()
y_train = torch.from_numpy(y_train.to_numpy()).float()
y_test = torch.from_numpy(y_test.to_numpy()).float()


In [22]:
x_train

tensor([[1.0000, 0.0000, 1.0000,  ..., 0.1073, 0.6087, 0.4000],
        [1.0000, 0.0000, 1.0000,  ..., 0.2249, 0.6304, 0.0000],
        [1.0000, 0.0000, 1.0000,  ..., 0.2394, 0.7391, 0.0000],
        ...,
        [0.0000, 1.0000, 1.0000,  ..., 0.2479, 0.8696, 0.0000],
        [1.0000, 0.0000, 0.0000,  ..., 0.8512, 0.4130, 0.4000],
        [0.0000, 1.0000, 1.0000,  ..., 0.3750, 0.8043, 0.0000]])

In [23]:
#Neural Network
class NET(nn.Module):
    def __init__(self):
        super(NET,self).__init__()
        self.fc1 = nn.Linear(11,5)
        self.fc2 = nn.Linear(5,1)
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

net = NET()
print(net)

NET(
  (fc1): Linear(in_features=11, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=1, bias=True)
)


In [24]:
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(),lr=0.001)

In [25]:
y_test = torch.squeeze(y_test)
y_train = torch.squeeze(y_train)

In [26]:
y_test.shape

torch.Size([268])

In [27]:
y_train.shape

torch.Size([1070])

In [28]:
x_train.shape

torch.Size([1070, 11])

In [29]:
for i in range(1000):
    y_pred = net(x_train)
    y_pred = torch.squeeze(y_pred)
    train_loss = criterion(y_train,y_pred)
    
    optimizer.zero_grad()
    
    train_loss.backward()
    
    optimizer.step()

In [30]:
#Saving the Model
MODEL_PATH = 'model.pth'
torch.save(net, MODEL_PATH)



In [31]:
#Reloading the Model
net = torch.load(MODEL_PATH)

In [32]:
#Results
net.train()
y_pred = net(x_test)


In [33]:
y_pred = y_pred.detach().numpy()

In [34]:
r2_score(y_test,y_pred)

0.9999297850929048