In [118]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier

In [119]:
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
# train_data.head()
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
# test_data.head()


In [120]:
y = train_data["SalePrice"]
features = ["LotArea", "YearBuilt", "2ndFlrSF"]

selectDf = train_data[features]
selectDf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   LotArea    1460 non-null   int64
 1   YearBuilt  1460 non-null   int64
 2   2ndFlrSF   1460 non-null   int64
dtypes: int64(3)
memory usage: 34.3 KB


### 데이터 전처리

In [121]:
from sklearn.preprocessing import StandardScaler

In [122]:
standardscaler = StandardScaler()
standardscaler.fit(selectDf)

In [123]:
scailing = standardscaler.transform(selectDf)

In [124]:
scailing.shape, type(scailing) # Scailing = 합치기..?

((1460, 3), numpy.ndarray)

In [125]:
scailing

array([[-0.20714171,  1.05099379,  1.16185159],
       [-0.09188637,  0.15673371, -0.79516323],
       [ 0.07347998,  0.9847523 ,  1.18935062],
       ...,
       [-0.14781027, -1.00249232,  1.84474434],
       [-0.08016039, -0.70440562, -0.79516323],
       [-0.05811155, -0.20759447, -0.79516323]])

### 데이터 분할

In [126]:
from sklearn.model_selection import train_test_split
import torch

In [127]:
train_features, test_features, train_label, test_label =\
      train_test_split(scailing[:,:2], scailing[:,2], test_size=0.2, random_state=20)
train_features.shape, test_features.shape, train_label.shape, test_label.shape

((1168, 2), (292, 2), (1168,), (292,))

In [128]:
train_features_tensor = torch.tensor(train_features, dtype=torch.float32)
train_label_tensor = torch.tensor(train_label, dtype=torch.float32).view(-1,1) # 행만 있는 경우 행열로 변환 해줌
# test_features_tensor = torch.tensor(test_features)
# test_label_tensor = torch.tensor(test_label)
type(train_features_tensor), type(train_label_tensor)

(torch.Tensor, torch.Tensor)

In [129]:
train_features_tensor.shape, train_label_tensor.shape

(torch.Size([1168, 2]), torch.Size([1168, 1]))

### 모델 학습

In [130]:
# # simple model linear regression
# # model, loss function, optimizer function
# class LinearRegression(torch.nn.Module) :
#     def __init__(self, input_dim, output_dim): # input : feature 의 열 갯수, output : label의 카테고리 갯수 (연속형은 값 하나)
#         super(LinearRegression, self).__init__()
#         self.linear = torch.nn.Linear(input_dim,output_dim)

#     def forward(self, x):
#         out = self.linear(x)
#         return out


# Neural Network model linear regression
# model, loss function, optimizer function
class LinearRegressionNNM(torch.nn.Module) :
    def __init__(self, input_dim): # input : feature 의 열 갯수
        super(LinearRegressionNNM, self).__init__()
        # super(self).__init__()
        self.hidden_1 = torch.nn.Linear(input_dim, 64) # input_dim : feature 수 10개, output_dim : 출력 수 64 개
        self.hidden_2 = torch.nn.Linear(64, 32) # input_dim : 이전 layer output_dim 수 64개, output_dim : 출력 수 32 개
        self.hidden_3 = torch.nn.Linear(32, 16) # input_dim : 이전 layer output_dim 수 32, output_dim : 출력 수 16 개
        self.output = torch.nn.Linear(16, 1) # input_dim : 이전 layer output_dim 수 16, output_dim : 출력 수 1 개 Linear 니까
        self.relu = torch.nn.ReLU() # activation func ReLU 쓸거임. 

    def forward(self, x):
        x = self.relu(self.hidden_1(x)) # hidden layer 1 10,64
        x = self.relu(self.hidden_2(x)) # hidden layer 2 64,32
        x = self.relu(self.hidden_3(x)) # hidden layer 3 32,16
        out = self.output(x) # hidden layer 4 16,1 result 
        
        return out

In [131]:
train_features_tensor.shape[1], train_label_tensor.shape[1] # [1] 이라는건 열이지 인덱싱 하는것이 아님

(2, 1)

In [132]:
model = LinearRegression(train_features_tensor.shape[1], train_label_tensor.shape[1])

In [133]:
model

LinearRegression(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)

In [134]:
criterion = torch.nn.MSELoss() # Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Optimizer fuction, lr = learnning rate : gradient decsent 빠르게 찾기 최근에 ADAMW 가 제일 좋다. 찾아가는 과정 어렵다. 그래서 optimizer 성능 중요하다.

In [135]:
## 반복 학습
# for epoch in range(10):
for epoch in range(1000):
    pred_y = model.forward(train_features_tensor)
    loss = criterion(pred_y, train_label_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"epoch : {epoch}, loss : {loss.item()}")

epoch : 0, loss : 2.0535976886749268
epoch : 100, loss : 0.9931437969207764
epoch : 200, loss : 0.9925981163978577
epoch : 300, loss : 0.9925979971885681
epoch : 400, loss : 0.9925979971885681
epoch : 500, loss : 0.9925979971885681
epoch : 600, loss : 0.9925979971885681
epoch : 700, loss : 0.9925979971885681
epoch : 800, loss : 0.9925979971885681
epoch : 900, loss : 0.9925979971885681


### 모델 평가

In [136]:
model.eval()


LinearRegression(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)

In [137]:
with torch.no_grad(): # 학습 목정이 아닌 평가 목적 위해 고정
    pred_y = model(train_features_tensor)
    loss = criterion(pred_y, train_label_tensor) # 예측도 간은 loss function
    print(f"loss : {loss.item()}")

loss : 0.9925979971885681


In [138]:
# 결과 값(예측값, 원래값)
pred_y[0], train_label[0]

(tensor([-0.0343]), -0.245182483072008)

### 모델 배포

In [139]:
select_test_df = test_data[features]
select_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   LotArea    1459 non-null   int64
 1   YearBuilt  1459 non-null   int64
 2   2ndFlrSF   1459 non-null   int64
dtypes: int64(3)
memory usage: 34.3 KB


### Needs Improvement

In [140]:
scailing_array_test = standardscaler.transform(select_test_df)
test_features_tensor = torch.tensor(scailing_array_test, dtype=torch.float32)
with torch.no_grad(): # 학습 목정이 아닌 평가 목적 위해 고정
    pred_y = model(test_features_tensor)

import numpy as np
# inverse transform 및 반올림
pred_y_fin = np.round(standardscaler_label.inverse_transform(pred_y.reshape(-1, 1))).astype(int)

# 1차원 배열로 변환 (필요한 경우)
pred_y_fin = pred_y_fin.ravel()  # or pred_y_fin.flatten()

output = pd.DataFrame({'Id': test_df.Id, 'SalePrice': pred_y_fin})
output.head()


# RuntimeError: mat1 and mat2 shapes cannot be multiplied (1459x3 and 2x1)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1459x3 and 2x1)