In [None]:
# Mount Google Drive; every data, checkpoint and submission path used below
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import

In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameter Setting

In [None]:
# Central configuration shared by the data builders, the model and the training loop.
CFG = {
    'TRAIN_WINDOW_SIZE': 90,  # number of consecutive days fed to the model per sample
    'PREDICT_SIZE': 21,       # number of future days predicted (also the model's output width)
    'EPOCHS': 20,             # number of passes over the training loader in train()
    'LEARNING_RATE': 1e-4,    # learning rate handed to the Adam optimizer
    'BATCH_SIZE': 4096,       # batch size of the train / validation / test DataLoaders
    'SEED': 41,               # seed passed to seed_everything()
    'enc_in':7                # features per day: 4 category codes + price + brand keyword count + sales
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which undoes the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed for the whole notebook

# Data

### Load dataset

In [None]:
# Training frame: the 'ID' and '제품' columns are dropped right after loading.
train_data = pd.read_csv('/content/drive/MyDrive/data/train.csv', encoding='utf-8').drop(columns=['ID', '제품'])
# Price table: its first CSV column becomes the index.
price = pd.read_csv('/content/drive/MyDrive/data/price.csv', encoding='utf-8', index_col=0)
# Brand keyword counts: one row per brand, looked up through the '브랜드' column.
b_keyword = pd.read_csv('/content/drive/MyDrive/data/brand_keyword_cnt.csv', encoding='utf-8')

### Data preprocessing

In [None]:
def df_minmax_norm(df):
    """Min-max scale every row of `df` independently.

    Args:
        df: DataFrame of numeric values; each row is scaled on its own.

    Returns:
        A tuple `(scaled, min_by_row, max_by_row)` where `scaled` is a new
        DataFrame and the two dicts map each row label to that row's
        minimum / maximum (used later to undo the scaling).
    """
    row_min = df.min(axis=1)
    row_max = df.max(axis=1)

    # A constant row has zero range; divide by 1 so it scales to all zeros
    # instead of producing NaN.
    row_range = row_max - row_min
    row_range[row_range == 0] = 1

    scaled = df.sub(row_min, axis=0).div(row_range, axis=0)
    return scaled, row_min.to_dict(), row_max.to_dict()

In [None]:
def make_train_data(data, price, brand, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], enc_in=CFG['enc_in']):
    """Build the flat per-day feature table and the forecast targets.

    For every row of `data` (one series) the function writes

    * `series_len - predict_size` consecutive rows into the input table, one
      per day, laid out as `[4 category codes, price, brand keyword count, sales]`;
    * `series_len - (train_size + predict_size) + 1` target rows, each holding
      the `predict_size` sales values that follow a `train_size`-day window.

    `CustomDataset` later slices `train_size` consecutive input rows for each
    target row.

    Args:
        data: DataFrame whose first 4 columns are the encoded category codes
            (including a '브랜드' column) and whose remaining columns are the
            sales series. May be a row slice of the full frame.
        price: DataFrame of per-day prices. If it has exactly as many rows as
            `data` it is used positionally; otherwise `data.index` is
            interpreted as row positions into `price` (which is what an
            `.iloc` row slice of a default-indexed frame provides).
        brand: DataFrame whose first column '브랜드' holds the brand code and
            whose remaining columns hold per-day keyword counts.
        train_size: Length of the input window in days.
        predict_size: Number of days to forecast.
        enc_in: Number of features per day; must equal 7 (4 codes + 3 series).

    Returns:
        `(input_data, Y)` as float NumPy arrays of shape
        `(num_rows * (series_len - predict_size), enc_in)` and
        `(num_rows * (series_len - window_size + 1), predict_size)`.

    Raises:
        ValueError: If `enc_in` does not match the feature layout or the
            series are shorter than one full window.
    """
    n_meta = 4  # leading category-code columns in `data`
    num_rows = len(data)
    window_size = train_size + predict_size
    series_len = len(data.columns) - n_meta
    n_steps = series_len - predict_size        # input rows written per series
    n_windows = series_len - window_size + 1   # target rows written per series

    if enc_in != n_meta + 3:
        raise ValueError(f'enc_in must be {n_meta + 3} (4 codes + price + brand + sales), got {enc_in}')
    if n_windows < 1:
        raise ValueError(f'series length {series_len} is shorter than the window size {window_size}')

    # Keep the price rows aligned with the rows of `data`. Indexing `price`
    # with the loop counter alone would pair a sliced frame (e.g. the
    # validation split) with the prices of the first rows of the full frame.
    if len(price) == num_rows:
        price_rows = price
    else:
        price_rows = price.iloc[data.index.to_numpy()]

    meta = data.iloc[:, :n_meta].to_numpy(dtype=float)
    sales = data.iloc[:, n_meta:].to_numpy(dtype=float)
    prices = price_rows.to_numpy(dtype=float)
    # Brand codes come from `data` itself (not a global frame) so that the
    # lookup follows the rows actually being processed.
    brand_codes = data['브랜드'].to_numpy()

    input_data = np.empty((num_rows * n_steps, enc_in))
    Y = np.empty((num_rows * n_windows, predict_size))

    for i in tqdm(range(num_rows)):
        # First matching brand row, every column after the code column.
        brand_counts = np.asarray(brand[brand['브랜드'] == brand_codes[i]].iloc[0, 1:], dtype=float)

        rows = input_data[i * n_steps:(i + 1) * n_steps]
        rows[:, :n_meta] = meta[i]
        rows[:, n_meta] = prices[i, :n_steps]
        rows[:, n_meta + 1] = brand_counts[:n_steps]
        rows[:, n_meta + 2] = sales[i, :n_steps]

        # Target j holds sales[j + train_size : j + window_size].
        Y[i * n_windows:(i + 1) * n_windows] = np.lib.stride_tricks.sliding_window_view(
            sales[i, train_size:], predict_size)

    return input_data, Y

In [None]:
def make_predict_data(data, price, brand, train_size=CFG['TRAIN_WINDOW_SIZE'], enc_in=CFG['enc_in']):
    """Build one inference window per row from the last `train_size` days.

    Args:
        data: DataFrame whose first 4 columns are the encoded category codes
            (including a '브랜드' column) and whose remaining columns are the
            sales series.
        price: DataFrame of per-day prices. If it has exactly as many rows as
            `data` it is used positionally; otherwise `data.index` is
            interpreted as row positions into `price`.
        brand: DataFrame with a '브랜드' code column and per-day keyword counts.
        train_size: Length of the input window in days.
        enc_in: Number of features per day.

    Returns:
        Float NumPy array of shape `(num_rows, train_size, enc_in)` laid out as
        `[4 category codes, price, brand keyword count, sales]` per day.
    """
    num_rows = len(data)

    # Keep price rows aligned with the rows of `data` even when `data` is a slice.
    if len(price) == num_rows:
        price_rows = price
    else:
        price_rows = price.iloc[data.index.to_numpy()]

    # Read brand codes from `data` and keyword counts from the `brand`
    # argument instead of relying on notebook-level globals.
    brand_codes = data['브랜드'].to_numpy()

    input_data = np.empty((num_rows, train_size, enc_in))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        price_data = np.array(price_rows.iloc[i, -train_size:])
        brand_data = np.array(brand[brand['브랜드'] == brand_codes[i]].iloc[0, -train_size:])
        sales_data = np.array(data.iloc[i, -train_size:])

        # Repeat the static codes for every day and append the three daily series.
        input_data[i] = np.column_stack((np.tile(encode_info, (train_size, 1)), price_data, brand_data, sales_data))

    return input_data

In [None]:
# Min-max normalization, applied per row.
# The sales columns (column 4 onward) are overwritten in place; the per-row
# min/max dicts are kept so predictions can be scaled back later.
# NOTE: re-running this cell rescales already-scaled data and overwrites the
# saved min/max values, so run it exactly once per kernel session.
train_data.iloc[:, 4:], scale_min_dict, scale_max_dict = df_minmax_norm(train_data.iloc[:, 4:])
price, p_min_dict, p_max_dict = df_minmax_norm(price)

In [None]:
# text label -> float
# For each of the three category levels: sum the (already normalized) sales of
# columns 4..423 over all rows of a category, average across those columns,
# min-max scale the result across categories and round to 5 decimals.
sales_data = pd.concat([train_data.loc[:, '대분류'], train_data.iloc[:, 4:424]], axis=1)
df = sales_data.groupby('대분류').sum().mean(axis=1)
df = round((df-df.min())/(df.max()-df.min()), 5)
cat_L = {k: df[k] for k in df.index}
# print(len(set(cat_L.values())), cat_L)

sales_data = pd.concat([train_data.loc[:, '중분류'], train_data.iloc[:, 4:424]], axis=1)
df = sales_data.groupby('중분류').sum().mean(axis=1)
df = round((df-df.min())/(df.max()-df.min()), 5)
cat_M = {k: df[k] for k in df.index}
# print(len(set(cat_M.values())), cat_M)

sales_data = pd.concat([train_data.loc[:, '소분류'], train_data.iloc[:, 4:424]], axis=1)
df = sales_data.groupby('소분류').sum().mean(axis=1)
df = round((df-df.min())/(df.max()-df.min()), 5)
cat_S = {k: df[k] for k in df.index}
# print(len(set(cat_S.values())), cat_S)

# Brands are encoded by rank instead: sort brands by total sales (ties broken
# by brand name) and assign (rank + 1) / number_of_brands, rounded to 5 decimals.
sales_data = pd.concat([train_data.loc[:, '브랜드'], train_data.iloc[:, 4:424]], axis=1)
df = sales_data.groupby('브랜드').sum().sum(axis=1)
df_ = [[df.loc[k], k] for k in df.index]
df_ = sorted(df_, key=lambda x: (x[0], x[1]))
cat_B = {df_[i][1]: round(1/len(df_)*(i+1), 5) for i in range(len(df_))}
# print(len(set(cat_B.values())), cat_B)

In [None]:
# Replace the text labels in the four category columns with their float codes.
# This overwrites the columns in place, so the cell can only run once per
# kernel session (a second run would look up floats in the label dicts).
train_data['대분류'] = train_data['대분류'].apply(lambda x: cat_L[x])
train_data['중분류'] = train_data['중분류'].apply(lambda x: cat_M[x])
train_data['소분류'] = train_data['소분류'].apply(lambda x: cat_S[x])
train_data['브랜드'] = train_data['브랜드'].apply(lambda x: cat_B[x])
train_data.head()

In [None]:
# Encode the brand names in the keyword table with the same codes as
# train_data so the two frames can be matched on '브랜드'; missing counts become 0.
b_keyword['브랜드'] = b_keyword['브랜드'].apply(lambda x: cat_B[x])
b_keyword = b_keyword.fillna(0.0)
b_keyword.head()

### Dataset split

In [None]:
# Train / Validation Split
# Rows are split by position: the first 80% train, the remaining 20% validate.
data_len = len(train_data)
train_idx = int(data_len*0.8)  # rows [0, train_idx) form the training split
train_input, train_target = make_train_data(train_data.iloc[:train_idx, :], price, b_keyword)
val_input, val_target = make_train_data(train_data.iloc[train_idx:, :], price, b_keyword)
# Inference windows are built from every row of the full frame.
test_input = make_predict_data(train_data, price, b_keyword)

  0%|          | 0/12712 [00:00<?, ?it/s]

  0%|          | 0/3178 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [None]:
# Sanity check of the array shapes produced by the split above.
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

### Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset over the arrays produced by make_train_data / make_predict_data.

    Training / validation mode (`Y` given): `X` is the flat per-day table with
    `n2` rows per series and `Y` holds `n1` target rows per series. Item
    `index` returns the `train_size` consecutive rows of `X` belonging to that
    target together with the target itself.

    Inference mode (`Y is None`): `X` already holds complete windows and item
    `index` returns `X[index]` only.

    Args:
        X: Input array (flat per-day table, or ready-made windows for inference).
        Y: Target array, or None for inference.
        train_size: Length of the input window in days.
        predict_size: Number of forecast days.
        series_len: Number of days in every series. Defaults to 459, the value
            that used to be hard-coded here; it must match the data that
            produced `X` and `Y`.
    """

    def __init__(self, X, Y, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], series_len=459):
        self.X = X
        self.Y = Y
        self.train_size = train_size
        # Target rows per series (349 with the default settings).
        self.n1 = series_len - train_size - predict_size + 1
        # Input rows per series (438 with the default settings).
        self.n2 = series_len - predict_size

    def __getitem__(self, index):
        if self.Y is None:
            return torch.Tensor(self.X[index])

        # index // n1 selects the series, index % n1 the window offset inside
        # it; together they give the first row of the window in the flat table.
        start = (index // self.n1) * self.n2 + index % self.n1
        window = self.X[start:start + self.train_size]
        return torch.Tensor(window), torch.Tensor(self.Y[index])

    def __len__(self):
        if self.Y is None:
            return len(self.X)
        return len(self.Y)

### DataLoader

In [None]:
# Training batches are shuffled every epoch.
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation keeps its original order.
val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# Y=None makes the dataset yield inputs only; order is preserved so that
# predictions line up with the rows of train_data.
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# Model

### LSTM (baseline)

In [None]:
class LSTM_Model(nn.Module):
    """LSTM encoder followed by a small MLP head that emits the whole forecast.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=CFG['enc_in'], hidden_size=512, output_size=CFG['PREDICT_SIZE'], num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, num_layers=num_layers)

        half_hidden = hidden_size // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(half_hidden, output_size),
        )

        # Final activation clamps the predictions to be non-negative.
        self.actv = nn.ReLU()

    def init_hidden(self, batch_size, device):
        """Return zero-filled (hidden, cell) states for one batch."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        return (torch.zeros(*state_shape, device=device),
                torch.zeros(*state_shape, device=device))

    def forward(self, x):
        """Predict `output_size` values from a batch of windows.

        Shapes:
            x:        (batch, seq_len, input_size)
            lstm_out: (batch, seq_len, hidden_size)
            state:    two tensors of (num_layers, batch, hidden_size)
            return:   (batch, output_size)
        """
        initial_state = self.init_hidden(x.size(0), x.device)
        lstm_out, _ = self.lstm(x, initial_state)

        # Only the representation of the last time step feeds the head.
        head_out = self.fc(lstm_out[:, -1, :])

        # squeeze(1) only has an effect when output_size == 1.
        return self.actv(head_out).squeeze(1)

# Train & Inference

### Train & Validation

In [None]:
def train(model, optimizer, train_loader, val_loader, device):
    """Train `model` with MSE loss and return it with the best weights loaded.

    After every epoch the model is evaluated with `validation`. Whenever the
    validation loss improves, a snapshot of the weights is kept in memory and a
    checkpoint (epoch, model and optimizer state) is written to
    /content/drive/MyDrive/checkpoint/.

    Args:
        model: Network to train; moved to `device` in place.
        optimizer: Optimizer already bound to `model`'s parameters.
        train_loader: Iterable of `(X, Y)` training batches.
        val_loader: Iterable of `(X, Y)` validation batches.
        device: Device on which to run.

    Returns:
        `model` with the weights of the epoch that achieved the lowest
        validation loss, or None if no epoch produced a comparable loss
        (for example when the loss is NaN).
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_state = None

    # torch.save does not create missing directories.
    checkpoint_dir = '/content/drive/MyDrive/checkpoint'
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # state_dict() returns references to the live parameters, so the
            # tensors must be cloned; otherwise later epochs would silently
            # overwrite the "best" weights.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save({
                'epoch': epoch,
                'model_state_dict': best_state,
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{checkpoint_dir}/epoch{epoch}_model.pth')
            print('Model Saved')

    if best_state is None:
        return None

    # Roll the model back to the best epoch before handing it to the caller.
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of `model` over `val_loader`.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: Network to evaluate.
        val_loader: Iterable of `(X, Y)` batches.
        criterion: Loss function called as `criterion(output, Y)`.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for inputs, targets in tqdm(iter(val_loader)):
            inputs, targets = inputs.to(device), targets.to(device)
            batch_loss = criterion(model(inputs), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

### Run

In [None]:
# Two stacked LSTM layers (every other model setting keeps its default),
# optimized with Adam at the configured learning rate.
model = LSTM_Model(num_layers=2)
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# The object returned by train() is the model used for inference below.
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.01967] Val Loss : [0.01922]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.01788] Val Loss : [0.01780]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.01753] Val Loss : [0.01767]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.01728] Val Loss : [0.01736]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.01714] Val Loss : [0.01733]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.01706] Val Loss : [0.01722]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.01700] Val Loss : [0.01717]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.01696] Val Loss : [0.01716]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.01691] Val Loss : [0.01708]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.01686] Val Loss : [0.01701]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [11] Train Loss : [0.01681] Val Loss : [0.01713]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [12] Train Loss : [0.01674] Val Loss : [0.01704]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [13] Train Loss : [0.01667] Val Loss : [0.01706]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [14] Train Loss : [0.01660] Val Loss : [0.01702]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [15] Train Loss : [0.01653] Val Loss : [0.01700]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [16] Train Loss : [0.01645] Val Loss : [0.01695]
Model Saved


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [17] Train Loss : [0.01637] Val Loss : [0.01717]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [18] Train Loss : [0.01628] Val Loss : [0.01729]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [19] Train Loss : [0.01617] Val Loss : [0.01753]


  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

Epoch : [20] Train Loss : [0.01605] Val Loss : [0.01748]


### Inference

In [None]:
def inference(model, test_loader, device):
    """Run `model` over `test_loader` and collect the predictions.

    Args:
        model: Trained network; moved to `device` and switched to evaluation
            mode in place.
        test_loader: Iterable of input batches (no targets).
        device: Device on which to run.

    Returns:
        NumPy array with one prediction row per input sample, in loader order.
    """
    # Do not rely on whatever ran last to have left the model on the right
    # device and in eval mode; dropout must be disabled for prediction.
    model.to(device)
    model.eval()

    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            # Move the model output to the CPU and convert it to a NumPy array.
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [None]:
# Predict the next PREDICT_SIZE days for every row of the full data set.
pred = inference(infer_model, test_loader, device)

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Undo the per-row min-max scaling so predictions are back in sales units.
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

# Post-processing: round to whole numbers and cast to int.
pred = np.round(pred, 0).astype(int)

In [None]:
# Expect one row per product and PREDICT_SIZE columns.
pred.shape

(15890, 21)

### Submission

In [None]:
# Load the submission template; only its columns after the first are replaced below.
submit = pd.read_csv('/content/drive/MyDrive/data/sample_submission.csv')
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Overwrite every column after the first one with the predictions (row order
# follows the order of train_data).
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,0,0,0,0,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,3,0,0,0,0,1,1,2,2,2,...,2,2,3,3,3,3,3,3,3,3
4,4,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,1


In [None]:
# Write the submission file without the DataFrame index.
submit.to_csv('/content/drive/MyDrive/LSTM_nlayer2_hs512.csv', index=False)