In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [46]:
df = pd.read_csv("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv")

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [48]:
# df = data.drop("Outcome", axis=1)
# df

In [50]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [52]:
df["bmi"].value_counts()

bmi
32.300    13
28.310     9
30.495     8
30.875     8
31.350     8
          ..
46.200     1
23.800     1
44.770     1
32.120     1
30.970     1
Name: count, Length: 548, dtype: int64

In [54]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for e in ['sex', 'smoker', 'region'] :
    df[e] = label_encoder.fit_transform(df[e])
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [56]:
# 입력과 타겟 분리
X = df.drop('bmi', axis=1).values
y = df['bmi'].values

X, y

(array([[1.90000000e+01, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         3.00000000e+00, 1.68849240e+04],
        [1.80000000e+01, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
         2.00000000e+00, 1.72555230e+03],
        [2.80000000e+01, 1.00000000e+00, 3.00000000e+00, 0.00000000e+00,
         2.00000000e+00, 4.44946200e+03],
        ...,
        [1.80000000e+01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         2.00000000e+00, 1.62983350e+03],
        [2.10000000e+01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         3.00000000e+00, 2.00794500e+03],
        [6.10000000e+01, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 2.91413603e+04]]),
 array([27.9 , 33.77, 33.  , ..., 36.85, 25.8 , 29.07]))

In [58]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 6), (268, 6), (1070,), (268,))

In [68]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [70]:
class RegressionModel(nn.Module):
    def __init__(self):
        super(RegressionModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(6, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RegressionModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [72]:
model.train()
for epoch in range(50):
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

print("Training complete.")

Epoch 1, Loss: 940.0628
Epoch 2, Loss: 282.0767
Epoch 3, Loss: 63.0705
Epoch 4, Loss: 48.0511
Epoch 5, Loss: 42.2951
Epoch 6, Loss: 39.0459
Epoch 7, Loss: 36.2800
Epoch 8, Loss: 35.3388
Epoch 9, Loss: 32.9290
Epoch 10, Loss: 32.7654
Epoch 11, Loss: 31.3291
Epoch 12, Loss: 31.5455
Epoch 13, Loss: 30.6834
Epoch 14, Loss: 30.7796
Epoch 15, Loss: 30.8916
Epoch 16, Loss: 30.7609
Epoch 17, Loss: 30.5756
Epoch 18, Loss: 30.9490
Epoch 19, Loss: 29.1091
Epoch 20, Loss: 29.8931
Epoch 21, Loss: 29.9805
Epoch 22, Loss: 29.8000
Epoch 23, Loss: 31.2722
Epoch 24, Loss: 28.6493
Epoch 25, Loss: 29.0566
Epoch 26, Loss: 29.0167
Epoch 27, Loss: 28.5177
Epoch 28, Loss: 28.8556
Epoch 29, Loss: 29.6333
Epoch 30, Loss: 28.6399
Epoch 31, Loss: 28.0595
Epoch 32, Loss: 28.7300
Epoch 33, Loss: 29.0659
Epoch 34, Loss: 28.3649
Epoch 35, Loss: 28.4871
Epoch 36, Loss: 27.6759
Epoch 37, Loss: 27.6926
Epoch 38, Loss: 28.1120
Epoch 39, Loss: 28.9025
Epoch 40, Loss: 27.9690
Epoch 41, Loss: 28.1744
Epoch 42, Loss: 28.2999

In [74]:
model.eval()
preds, actuals = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # print(X_batch)
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        preds.extend(outputs)
        actuals.extend(y_batch.numpy())

mse = mean_squared_error(actuals, preds)
print(f"Test MSE: {mse:.4f}")


Test MSE: 33.1561
