In [142]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split

house_df = pd.read_csv('kc_house_data.csv', dtype = {'id':str})
house_df.drop(['id', 'date', 'zipcode'], axis = 1, inplace = True)
house_df = house_df[["price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot"]]
house_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot
0,221900.0,3,1.0,1180,5650
1,538000.0,3,2.25,2570,7242
2,180000.0,2,1.0,770,10000
3,604000.0,4,3.0,1960,5000
4,510000.0,3,2.0,1680,8080


In [143]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
house_df.price = scaler.fit_transform(house_df[["price"]])
house_df.bedrooms = scaler.fit_transform(house_df[["bedrooms"]])
house_df.bathrooms = scaler.fit_transform(house_df[["bathrooms"]])
house_df.sqft_living = scaler.fit_transform(house_df[["sqft_living"]])
house_df.sqft_lot = scaler.fit_transform(house_df[["sqft_lot"]])

In [144]:
# 데이터 만들기
Y = house_df[['price']]
X = house_df.drop('price', axis = 1)

X_copy = X.copy()
Y_copy = Y.copy()
# X.insert(0, 'one', 1) # design matrix

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, Y, test_size = 0.2, random_state = 13)
X_train, X_test, y_train, y_test = np.array(X_train_1), np.array(X_test_1), np.array(y_train_1), np.array(y_test_1)

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_copy, Y_copy, test_size = 0.2, random_state = 13)


In [145]:
# 직접 행렬식으로 계산
def beta(X, y):
    return np.linalg.inv((X.T@X))@X.T@y
    
check_time = time.time()
beta_train = beta(X_train, y_train)
end_time = time.time()
print('걸린 시간 :', end_time - check_time)

y_hat_test = X_test@beta_train
mse = ((y_test - y_hat_test)**2).sum() / len(X_test) 
print('rmse :', np.sqrt(mse))

걸린 시간 : 0.0009028911590576172
rmse : 0.6820506091980052


In [146]:
# scikit learn으로 계산
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

check_trime = time.time()
lr = LinearRegression()
lr.fit(X_train_2, y_train_2)
end_time = time.time()
print("걸린 시간 ; ", end_time - check_time)

y_pred = lr.predict(X_test_2) # 검증 데이터를 사용해 종속변수를 예측
y_train_pred = lr.predict(X_train_2) # 학습데이터에 대한 종속변수를 예측
print('MSE train data: ', np.sqrt(mean_squared_error(y_train_2, y_train_pred))) # 학습 데이터를 사용했을 때의 평균 제곱 오차를 출력
print('MSE test data: ', np.sqrt(mean_squared_error(y_test_2, y_pred)))         # 검증 데이터를 사용했을 때의 평균 제곱 오차를 출력

걸린 시간 ;  0.1453418731689453
MSE train data:  0.7056162853904003
MSE test data:  0.6820525136127931


In [152]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader


In [153]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [154]:
inputs = torch.tensor(np.array(X_train), dtype = torch.float32)
targets = torch.tensor(np.array(y_train), dtype = torch.float32)
dataset = TensorDataset(inputs, targets)
train_loader = DataLoader(dataset, shuffle = True)

w = torch.randn(1, 4, requires_grad = True)
b = torch.randn(1, requires_grad = True)
print(w, b)

def model(X):
    return X @ w.t() + b

def mse_loss(predictions, targets):
    loss_func = torch.nn.MSELoss()
    _loss = loss_func(predictions, targets)
    return _loss
    # difference = predictions - targets
    # return torch.sum(difference * difference) / difference.numel()

# for x,y in train_loader:
#     preds = model(x)
#     print("Prediction is :n",preds)
#     print("nActual targets is :n",y)
#     print("Loss is: ",mse_loss(preds, y))
#     break

tensor([[-0.1363, -0.9527, -1.1333,  0.5418]], requires_grad=True) tensor([-0.8757], requires_grad=True)


In [155]:
epochs = 15
for i in range(epochs):
    
    for x, y in zip(inputs, targets):
        
        preds = x@w.t() + b
        loss = mse_loss(preds, y)
        loss.backward()
        
        with torch.no_grad():
            
            w -= w.grad * 1e-6
            b -= b.grad * 1e-6
            
            w.grad = None
            b.grad = None

    print(f"Epoch {i + 1} / {epochs} : Loss : {loss**0.5}")

Epoch 1 / 15 : Loss : 1.7178213596343994
Epoch 2 / 15 : Loss : 1.5772264003753662
Epoch 3 / 15 : Loss : 1.4458153247833252
Epoch 4 / 15 : Loss : 1.3229541778564453
Epoch 5 / 15 : Loss : 1.2080438137054443
Epoch 6 / 15 : Loss : 1.1005408763885498
Epoch 7 / 15 : Loss : 0.9999319911003113
Epoch 8 / 15 : Loss : 0.9057292342185974
Epoch 9 / 15 : Loss : 0.8175055384635925
Epoch 10 / 15 : Loss : 0.7348468899726868
Epoch 11 / 15 : Loss : 0.6573808789253235
Epoch 12 / 15 : Loss : 0.5847436785697937
Epoch 13 / 15 : Loss : 0.5166121125221252
Epoch 14 / 15 : Loss : 0.45268166065216064
Epoch 15 / 15 : Loss : 0.39267635345458984


In [156]:
eval_inputs = torch.tensor(np.array(X_train), dtype = torch.float32)
eval_targets = torch.tensor(np.array(y_train), dtype = torch.float32)

preds = eval_inputs @ w.t() + b
loss = mse_loss(preds, eval_targets)
print(loss ** 0.5)

tensor(1.3977, grad_fn=<PowBackward0>)
