In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')
df_sub = pd.read_csv('house-prices-advanced-regression-techniques/sample_submission.csv')

In [3]:
df_sub.columns

Index(['Id', 'SalePrice'], dtype='object')

최종 출력은 Id, SalePrice

In [4]:
train_ID = df_train['Id']
test_ID = df_test['Id']

y_train = pd.DataFrame(np.log(df_train['SalePrice']))

df_train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
df_test.drop(['Id'], axis=1, inplace=True)

In [5]:
print(df_train.shape)
print(df_test.shape)

(1460, 79)
(1459, 79)


In [6]:
num_train_cols = df_train.select_dtypes(include=['int', 'float'])
num_test_cols = df_test.select_dtypes(include=['int', 'float'])

str_train_cols = df_train.select_dtypes('object')
str_test_cols = df_test.select_dtypes('object')

In [7]:
print('num_train_cols = ',num_train_cols.shape)
print('num_test_cols = ', num_test_cols.shape)
print('str_train_cols = ', str_train_cols.shape)
print('str_test_cols = ', str_test_cols.shape)

num_train_cols =  (1460, 36)
num_test_cols =  (1459, 36)
str_train_cols =  (1460, 43)
str_test_cols =  (1459, 43)


In [8]:
num_train_cols = num_train_cols.fillna(0)
num_test_cols = num_test_cols.fillna(0)
str_train_cols = str_train_cols.fillna('None')
str_test_cols = str_test_cols.fillna('None')

In [9]:
print(len(str_train_cols.columns))
print(len(str_test_cols.columns))

43
43


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in str_train_cols:
    str_train_cols[i] = le.fit_transform(str_train_cols[i])
    
for i in str_test_cols:
    str_test_cols[i] = le.fit_transform(str_test_cols[i])

In [11]:
x_train = pd.concat([num_train_cols, str_train_cols], axis=1)
x_test = pd.concat([num_test_cols, str_test_cols], axis=1)

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.preprocessing import MinMaxScaler

s1 = MinMaxScaler()
s2 = MinMaxScaler()

x_train = torch.from_numpy(s1.fit_transform(x_train.values)).float()
x_test = torch.from_numpy(s2.fit_transform(x_test.values)).float()

x_train = torch.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_test = torch.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))

y_train = torch.tensor(y_train.values).float()
y_test = torch.zeros(df_test.shape[0], 1).float()

In [13]:
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')
device

device(type='mps', index=0)

In [14]:
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)
    torch.use_deterministic_algorithms(True)

set_seed(123)

In [15]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 32)
        self.fc3 = nn.Linear(32, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = out.view(-1, self.hidden_size)
        out = self.relu(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)

        return out

model = LSTM(x_train.size(2), 64, y_train.size(1), 1).to(device)

In [16]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

In [17]:
epochs = 5000
x_train = x_train.to(device)
y_train = y_train.to(device)
L = []
model.train()
for epoch in range(epochs):
    output = model(x_train)
    optimizer.zero_grad()
    loss = criterion(output, y_train)
    L.append(loss.item())
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f'{epoch} : {loss.item()}')

0 : 143.68882751464844
1000 : 0.08476003259420395
2000 : 0.020841462537646294
3000 : 0.013803889974951744
4000 : 0.012601975351572037


In [18]:
x_test = x_test.to(device)
model.eval()
with torch.no_grad():
    result = model(x_test)

df = pd.DataFrame()
df['Id'] = test_ID
result = np.exp((result).data.detach().cpu().numpy())
df['SalePrice'] = result

In [19]:
df.to_csv('submission.csv', index=False)