In [59]:
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, DataLoader2

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch.optim as optim


In [51]:

class TabularModel(nn.Module):
    def __init__(self, start_neurons):
        super(TabularModel, self).__init__()
        
        # Embedding layers
        input_dims = [5, 6, 2, 12, 31]
        self.embeddings = nn.ModuleList([nn.Embedding(dim, start_neurons) for dim in input_dims])
        self.last_dense = nn.Linear(1, start_neurons)
        
        # Main layers
        self.dropouts = nn.ModuleList([nn.Dropout(0.2) for _ in range(5)])
        self.gates = nn.ModuleList([nn.Linear(start_neurons, start_neurons) for _ in range(5)])
        self.main_denses = nn.ModuleList([nn.Linear(2 * start_neurons, 20 * start_neurons) for _ in range(5)])
        
        # Output layer
        self.output = nn.Linear(20 * start_neurons, 1)

    def forward(self, x):
        embeddings = []
        for i, e in enumerate(self.embeddings):
            embeddings.append(e(x[:, i]))
        
        # Last feature as dense
        embeddings.append(self.last_dense(x[:, -1].float().unsqueeze(-1)))
        
        all_layer = torch.cat(embeddings, 1)
        
        for i in range(5):
            all_layer_d = self.dropouts[i](all_layer)
            all_layer_d_gate = torch.sigmoid(self.gates[i](all_layer_d))
            all_layer_ = all_layer * all_layer_d_gate
            all_layer_c = torch.cat([all_layer, all_layer_], 1)
            all_layer += F.relu(self.main_denses[i](all_layer_c))
            
        output = self.output(all_layer).squeeze(-1)
        
        return output


In [52]:
train = pd.read_csv(r'./data/train.csv')
test = pd.read_csv(r'./data/test.csv')
international_trade = pd.read_csv(r'./data/international_trade.csv')

train['month'] = train['timestamp'].apply(lambda x : int(x[5:7]))
train['day'] = train['timestamp'].apply(lambda x : int(x[8:10]))

test['month'] = test['timestamp'].apply(lambda x : int(x[5:7]))
test['day'] = test['timestamp'].apply(lambda x : int(x[8:10]))

x = train.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
y = train['price(원/kg)']

x_test = test.drop(columns=['ID', 'timestamp'])

qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    x[i]=le.fit_transform(x[i])
    x_test[i]=le.transform(x_test[i])

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1103)



In [53]:
x_train = x_train.values
x_val = x_val.values
y_train = y_train.values
y_val = y_val.values

In [54]:
class TabularDataset(Dataset):
    def __init__(self, x, y) -> None:
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

train_dataset = TabularDataset(x_train, y_train)
val_dataset = TabularDataset(x_val, y_val)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=128)

In [55]:
class Trainer:
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def train_step(self, train_loader):
        self.model.train()
        self.model
        train_loss = 0
        for data, label in train_loader:
            data = data
            self.optimizer.zero_grad()

            output = self.model(data)
            loss = self.criterion(output, label)

            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        return avg_train_loss

    def validation_step(self, validation_loader):
        val_loss = 0
        with torch.no_grad():
            for data in iter(validation_loader):
                data = data
                prediction = self.model(data)
                loss = self.criterion(prediction, data)
                val_loss += loss.item()
            avg_validation_loss = val_loss / len(validation_loader)

        return avg_validation_loss

    def fit(self, train_loader, val_loader):
        for epoch in range(30):
            train_loss = self.train_step(train_loader)
            val_loss = self.validation_step(val_loader)

            print(f"Epoch [{epoch + 1}/{30}]"
                  f"Training Loss: {train_loss:.7f} "
                  f"Validation Loss: {val_loss:.7f} "
                  )

In [62]:
model = TabularModel(x.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)
trainer = Trainer(model, criterion, optimizer)

In [64]:
trainer.fit(train_loader, val_loader)

IndexError: index out of range in self