In [27]:
import holidays
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from xgboost import XGBRegressor

class TabularDataset(Dataset):
    """Map-style dataset pairing a feature array with a target array.

    Both inputs are NumPy arrays; they are wrapped with ``torch.from_numpy``
    once at construction time and indexed row-wise afterwards.
    """

    def __init__(self, x, y) -> None:
        # Wrap each array as a tensor up front so indexing yields tensors.
        features, targets = x, y
        self.x = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)

    def __len__(self):
        # The number of samples is the length of the feature tensor.
        return len(self.x)

    def __getitem__(self, idx):
        # Return the (features, target) pair stored at position ``idx``.
        return self.x[idx], self.y[idx]


class TabularModel(nn.Module):
    """Gated residual network over per-column embeddings of tabular input.

    Every input column except the last is looked up in its own ``nn.Embedding``;
    the last column is cast to float and projected with ``nn.Linear(1, start_neurons)``.
    The per-column vectors are concatenated and refined by ``num_blocks``
    dropout -> sigmoid-gate -> linear residual blocks. The prediction is the
    mean of ``num_outputs`` independent linear heads.

    Args:
        input_dims: One entry per input column. For all but the last column the
            entry is the embedding table size (indices must be < entry). The
            last entry only contributes to the hidden width.
        start_neurons: Width of each per-column embedding.
        dropout: Dropout probability used in every block.
        num_blocks: Number of gated residual blocks.
        num_outputs: Number of linear heads averaged into the final output.
    """

    def __init__(self, input_dims, start_neurons, dropout=0.2, num_blocks=5, num_outputs=10):
        super().__init__()

        # Width of the concatenated representation: one slot per input column.
        hidden = start_neurons * len(input_dims)

        # Embedding layers
        self.embeddings = nn.ModuleList([nn.Embedding(dim, start_neurons) for dim in input_dims[:-1]])
        self.linear_embedding = nn.Linear(1, start_neurons)

        # Main layers
        self.dropouts = nn.ModuleList([nn.Dropout(dropout) for _ in range(num_blocks)])
        self.gates = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_blocks)])
        self.linear_layers = nn.ModuleList([nn.Linear(hidden * 2, hidden) for _ in range(num_blocks)])

        # Output layers
        self.output_layers = nn.ModuleList([nn.Linear(hidden, 1) for _ in range(num_outputs)])

    def forward(self, x):
        """Compute one prediction per row.

        Args:
            x: 2-D integer tensor with one column per entry of ``input_dims``.

        Returns:
            1-D tensor with one value per row of ``x``.
        """
        # Iterate over the model's own embedding tables instead of a
        # module-level ``input_dims`` global, so the model works wherever it
        # is instantiated.
        embeddings = [embedding(x[:, column]) for column, embedding in enumerate(self.embeddings)]
        embeddings.append(self.linear_embedding(x[:, -1].float().unsqueeze(-1)))

        concatenated_inputs = torch.cat(embeddings, dim=1)

        for dropout, gate, dense in zip(self.dropouts, self.gates, self.linear_layers):
            dropped_input = dropout(concatenated_inputs)
            gate_output = torch.sigmoid(gate(dropped_input))
            # The gate is computed from the dropped-out input but applied to the clean one.
            gated_input = concatenated_inputs * gate_output
            concat_input = torch.cat([concatenated_inputs, gated_input], dim=1)
            # Residual update keeps the representation width constant across blocks.
            concatenated_inputs = concatenated_inputs + dense(concat_input)

        outputs = [layer(concatenated_inputs) for layer in self.output_layers]
        output = torch.mean(torch.cat(outputs, dim=1), dim=1)
        return output


class RMSELoss(nn.Module):
    """Root-mean-squared-error loss with a small stabilising constant.

    ``eps`` is added to the mean squared error before the square root is taken.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, y_pred, y):
        # sqrt(MSE + eps)
        squared_error = self.mse(y_pred, y)
        return (squared_error + self.eps).sqrt()


class Trainer:
    """Minimal training loop around a model, a loss criterion and an optimizer.

    The per-epoch figures returned by ``train_step`` and ``validation_step``
    are the square root of the mean per-batch loss. With an MSE criterion this
    is an RMSE-style number; with another criterion it is just the square root
    of that criterion's batch average.
    """

    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def train_step(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            Square root of the mean per-batch training loss.
        """
        self.model.train()
        train_loss = 0
        for data, label in tqdm(train_loader, leave=False):
            self.optimizer.zero_grad()

            output = self.model(data)
            # Cast both sides to float32 so integer targets work with the criterion.
            loss = self.criterion(output.to(torch.float32), label.to(torch.float32))

            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()

        avg_train_loss = (train_loss / len(train_loader)) ** 0.5

        return avg_train_loss

    def validation_step(self, validation_loader):
        """Evaluate the model on ``validation_loader`` without gradient tracking.

        The model is switched to eval mode first so layers such as Dropout are
        disabled; otherwise the validation loss is noisy and pessimistic.
        ``train_step`` switches back to train mode at its start.

        Returns:
            Square root of the mean per-batch validation loss.
        """
        self.model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, label in tqdm(validation_loader, leave=False):
                prediction = self.model(data)
                loss = self.criterion(prediction.to(torch.float32), label.to(torch.float32))
                val_loss += loss.item()
            avg_validation_loss = (val_loss / len(validation_loader)) ** 0.5

        return avg_validation_loss

    def fit(self, train_loader, val_loader, epochs=100):
        """Train for ``epochs`` epochs, printing train/validation loss after each one.

        Args:
            train_loader: Iterable of ``(data, label)`` batches used for optimisation.
            val_loader: Iterable of ``(data, label)`` batches used for evaluation.
            epochs: Number of passes over ``train_loader`` (defaults to 100).
        """
        for epoch in range(epochs):
            train_loss = self.train_step(train_loader)
            val_loss = self.validation_step(val_loader)

            print(f"Epoch [{epoch + 1}/{epochs}]"
                  f"Training Loss: {train_loss:.7f} "
                  f"Validation Loss: {val_loss:.7f} ")


# Load the three raw CSV inputs in one pass; the names are bound in the same
# order as the paths are listed.
train, test, international_trade = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'./data/train.csv',
        r'./data/test.csv',
        r'./data/international_trade.csv',
    )
)
def group_season(df):
    """Label each row of ``df`` with a season derived from its ``month`` column.

    Writes the labels into ``df['season']`` in place (rows whose month matches
    none of the groups are left untouched) and returns that column.
    """
    # (label, months) pairs, applied in this order.
    season_months = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
        ('겨울', (12, 1, 2)),
    )
    for label, (first, second, third) in season_months:
        in_season = (df['month'] == first) | (df['month'] == second) | (df['month'] == third)
        df.loc[in_season, 'season'] = label
    return df['season']

def holiday(df):
    """Flag each row of ``df`` as 'holiday' or 'non-holiday'.

    Each ``timestamp`` value is tested for membership in ``holidays.KR()``.
    The labels are written to ``df['holiday']`` in place and that column is
    returned.
    """
    calendar = holidays.KR()

    def _label(stamp):
        # Membership in the calendar decides the label.
        if stamp in calendar:
            return 'holiday'
        return 'non-holiday'

    df['holiday'] = df.timestamp.apply(_label)
    return df['holiday']

# --- Feature engineering ---------------------------------------------------
# Derive the same calendar features for train and test in a single loop so the
# two pipelines cannot drift apart.
for frame in (train, test):
    # Timestamps are sliced by position: [0:4] year, [5:7] month, [8:10] day
    # (assumes 'YYYY-MM-DD'-style strings — TODO confirm against the CSV).
    frame['year'] = frame['timestamp'].apply(lambda ts: int(ts[0:4])) - 2019  # years since 2019
    frame['month'] = frame['timestamp'].apply(lambda ts: int(ts[5:7]))
    frame['day'] = frame['timestamp'].apply(lambda ts: int(ts[8:10]))
    frame['Weekday'] = pd.to_datetime(frame['timestamp']).dt.weekday
    frame['is_weekend'] = frame['Weekday'].apply(lambda day: 1 if day >= 5 else 0)
    frame['season'] = group_season(frame)
    frame['holiday'] = holiday(frame)

# --- Feature / target split ------------------------------------------------
x = train.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
y = train['price(원/kg)']

x_test = test.drop(columns=['ID', 'timestamp'])

# --- Categorical encoding --------------------------------------------------
qual_col = ['item', 'corporation', 'location', 'season', 'holiday']

for col in qual_col:
    # Fit on the training column only; the test column reuses the same mapping.
    le = LabelEncoder()
    x[col] = le.fit_transform(x[col])
    x_test[col] = le.transform(x_test[col])

# --- Train / validation split ----------------------------------------------
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1103)

x_train = x_train.values
x_val = x_val.values
y_train = y_train.values
y_val = y_val.values

train_dataset = TabularDataset(x_train, y_train)
val_dataset = TabularDataset(x_val, y_val)

# Seed torch so weight initialisation, dropout and loader shuffling are
# repeatable from run to run (same value as the split's random_state).
torch.manual_seed(1103)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
# Validation needs no shuffling; a fixed order keeps the batch-averaged
# metric identical between runs.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

# --- Model and training ----------------------------------------------------
# One entry per column of ``x``. TabularModel builds an embedding table for
# every entry but the last and a linear projection for the last column.
input_dims = [6, 7, 3, 6, 13, 32, 8, 3, 5, 3]
model = TabularModel(input_dims, 10)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)
trainer = Trainer(model, criterion, optimizer)
# NOTE(review): the trained weights are not written to disk here, although a
# later cell loads 'best_model.pt' — confirm where that checkpoint comes from.
trainer.fit(train_loader, val_loader)



In [34]:
# Rebuild the architecture and restore the weights stored in 'best_model.pt'.
# map_location='cpu' lets a checkpoint written on a CUDA device load on a
# CPU-only machine; the model is used on CPU tensors below.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model = TabularModel(input_dims, 10)
model.load_state_dict(torch.load(r'best_model.pt', map_location='cpu'))
model.eval()

TabularModel(
  (embeddings): ModuleList(
    (0): Embedding(6, 10)
    (1): Embedding(7, 10)
    (2): Embedding(3, 10)
    (3): Embedding(6, 10)
    (4): Embedding(13, 10)
    (5): Embedding(32, 10)
    (6): Embedding(8, 10)
    (7): Embedding(3, 10)
    (8): Embedding(5, 10)
  )
  (linear_embedding): Linear(in_features=1, out_features=10, bias=True)
  (dropouts): ModuleList(
    (0): Dropout(p=0.2, inplace=False)
    (1): Dropout(p=0.2, inplace=False)
    (2): Dropout(p=0.2, inplace=False)
    (3): Dropout(p=0.2, inplace=False)
    (4): Dropout(p=0.2, inplace=False)
  )
  (gates): ModuleList(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Linear(in_features=100, out_features=100, bias=True)
    (4): Linear(in_features=100, out_features=100, bias=True)
  )
  (linear_layers): ModuleList(
    (0): Linear(in_features=200, out_features=100, bi

In [37]:
class TestDataset(Dataset):
    """Map-style dataset over unlabeled feature rows.

    ``x`` must expose a ``.values`` NumPy array (e.g. a pandas DataFrame); the
    array is wrapped with ``torch.from_numpy`` once at construction time.
    """

    def __init__(self, x) -> None:
        features = x.values
        self.x = torch.from_numpy(features)

    def __len__(self):
        # Number of rows available for inference.
        return len(self.x)

    def __getitem__(self, idx):
        # Only features are returned; there is no target at inference time.
        return self.x[idx]

# Wrap the encoded test features; the order is kept fixed (no shuffling) so
# predictions line up with the rows of the test frame.
test_dataset = TestDataset(x_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [39]:
# Run the model over the test loader batch by batch and collect every
# prediction into one flat Python list.
model.eval()
pred_list = []
with torch.inference_mode():
    for batch in test_loader:
        batch_pred = model(batch)
        # Flatten the batch output to 1-D and append its values in order.
        pred_list.extend(batch_pred.cpu().numpy().reshape(-1).tolist())

In [45]:
# Fill the sample submission's 'answer' column with the predictions and save it.
# The path uses forward slashes like the other data paths in this notebook;
# a backslash separator would only resolve on Windows.
submission = pd.read_csv(r'./data/sample_submission.csv')
submission['answer'] = pred_list
submission.to_csv('./dnn_submission.csv', index=False)

[4163.40380859375,
 52.35978317260742,
 3680.00390625,
 2989.71630859375,
 3166.8359375,
 3895.82275390625,
 3625.686279296875,
 3725.36083984375,
 27.183135986328125,
 3132.2109375,
 2987.64892578125,
 2757.85009765625,
 3794.86328125,
 4034.00146484375,
 4214.7099609375,
 67.74739837646484,
 3914.81396484375,
 3624.79443359375,
 3253.265625,
 3367.41650390625,
 4468.67138671875,
 4294.029296875,
 81.33121490478516,
 3650.317138671875,
 3568.981689453125,
 3377.181640625,
 3713.868408203125,
 4694.921875,
 5123.9228515625,
 23.13399887084961,
 4001.16015625,
 4023.90771484375,
 4653.0908203125,
 4757.75244140625,
 4513.33935546875,
 4772.8857421875,
 2.264517307281494,
 3769.37255859375,
 4355.52880859375,
 4357.6376953125,
 4788.43896484375,
 4885.44970703125,
 5209.1376953125,
 91.00202178955078,
 4940.111328125,
 4924.40869140625,
 5045.0888671875,
 4775.5068359375,
 5518.1494140625,
 5610.52001953125,
 304.92205810546875,
 4749.4853515625,
 5446.34130859375,
 5346.4013671875,
 492