In [None]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.arima.model import ARIMA

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# Global experiment settings read by the windowing, model and training cells.
CFG = {
    # Tune TRAIN_WINDOW_SIZE only as far as it does not cause overfitting.
    'TRAIN_WINDOW_SIZE':120, # length of each input window: train on 120 days of history
    'PREDICT_SIZE':21, # length of each target window: predict 21 days
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':512,
    'SEED':42
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's legacy global RNG and PyTorch (CPU and every CUDA device), and
    configures cuDNN for deterministic execution.

    seed : integer seed applied to all generators
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Both CUDA calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # can differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # fix all RNG seeds using the configured value

In [None]:
# Load the training table and drop the 'ID' and '제품' columns.
train_data = pd.read_csv('./train.csv').drop(columns=['ID', '제품'])
# Alternative input, kept disabled:
# train_data = pd.read_csv("./meta.csv")
# train_data.drop("Unnamed: 0", axis = 1, inplace = True)
# train_data

In [None]:
# Load the brand keyword-count table; later cells use its first column as the row key.
meta_data = pd.read_csv("./brand_keyword_cnt.csv")

In [None]:
# Overlay one series from each table for a quick visual comparison.
import matplotlib.pyplot as plt
x = train_data.iloc[2, 4:]  # row at position 2 of train_data, column 4 onward
y = meta_data.iloc[2, 1:]  # row at position 2 of meta_data, column 1 onward
# NOTE(review): both rows are picked by position; nothing here checks that they
# refer to the same brand, and the two series may be on different scales.
plt.plot(x)
plt.plot(y)
plt.show()

In [None]:
# Keys (first-column values) of the meta_data rows that contain at least one NaN.
nan_row = meta_data.isna().any(axis = 1).to_numpy()
index_nan = [meta_data.iloc[pos, 0] for pos in range(len(nan_row)) if nan_row[pos]]
index_nan

In [None]:

# Row-wise min-max scaling of meta_data, applied in place to every column after
# the first one.
for idx in tqdm(range(len(meta_data))):
    maxi = np.max(meta_data.iloc[idx,1:])
    mini = np.min(meta_data.iloc[idx,1:])
    
    if maxi == mini :
        # Constant row: store zeros instead of dividing by zero.
        meta_data.iloc[idx,1:] = 0
    else:
        meta_data.iloc[idx,1:] = (meta_data.iloc[idx,1:] - mini) / (maxi - mini)
    
#     scale_max_dict[idx] = maxi
#     scale_min_dict[idx] = mini

In [None]:
# # Data Scaling
# Scale first: row-wise min-max scaling of train_data, applied in place to
# column 4 onward.
# Planned order: scaling -> then blend the (weighted-average) meta data into it.
# NOTE(review): the per-row max/min stored here are taken from the data as loaded,
# but both dicts are re-created and refilled by the later "Data Scaling" cell.
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,4:])
    mini = np.min(train_data.iloc[idx,4:])
    
    if maxi == mini :
        # Constant row: store zeros instead of dividing by zero.
        train_data.iloc[idx,4:] = 0
    else:
        train_data.iloc[idx,4:] = (train_data.iloc[idx,4:] - mini) / (maxi - mini)
    
    # Keyed by row position.
    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini

In [None]:
# Display train_data after the row-wise min-max scaling.
train_data

In [None]:
# Use the values of meta_data's first column as its row index, then drop the
# "브랜드" column so that only value columns remain.
# NOTE(review): presumably the first column *is* "브랜드" — confirm.
# This cell mutates meta_data in place and is not safe to run twice.
ind = meta_data.iloc[:, 0]
meta_data.set_index(ind, inplace = True)
meta_data.drop(["브랜드"], axis = 1, inplace = True)
meta_data

In [None]:
# Add the scaled meta_data row of each brand onto that row's daily values in
# train_data (column 4 onward), skipping brands whose meta row contains NaN.
# NOTE(review): int() truncates every sum toward zero. Both operands were min-max
# scaled by earlier cells, so this collapses the values to a few small integers —
# confirm the truncation is intended.
# NOTE(review): data[n] / c[n] use plain integer keys; presumably these are meant
# as positional lookups on label-indexed Series — confirm, pandas deprecates that
# fallback.
for idx in range(len(train_data)):
    brand = train_data.iloc[idx, 3]  # presumably the '브랜드' column — TODO confirm
    if brand in index_nan:
        print("brand {} doesn't apply meta data".format(brand))
        continue
    else:
        print("meta data apply on {}th row".format(idx))
        # meta_data is indexed by brand; the brand must exist in that index.
        c = meta_data.loc[brand, :]
        data = train_data.iloc[idx, 4:]
        # Only the first len(c) daily values are modified.
        for n in range(len(c)):
            data[n] = int(data[n]+c[n])
        train_data.iloc[idx, 4:] = data
train_data

In [None]:
# Column-wise sums of train_data, as a quick inspection of the values.
train_data.sum()

In [None]:
# 만약 성능이 따로 좋게 나오지 않는다면? 한 행이 전부 0인지 체크
# 만약 한 행이 전부 0이면 그 때는 해당 행 제외하고만 곱해서 진행

In [None]:
# Inspect the values from column 4 onward for the row at position 1.
train_data.iloc[1, 4:]

### IQR을 이용하여 가볍게 이상치 처리
- 우선 1차적인 이상 탐지는 가볍게 IQR을 이용해서 도출해본다.
- 행 별로 이상 탐지를 진행한다
- 상한 보다 큰 경우는 상한 값으로 대체한다.

In [None]:
# # 우선 1차적인 이상 탐지는 가볍게 IQR을 이용해서 도출해본다.
# # 행 별로 이상 탐지를 진행한다
# # 상한 보다 큰 경우는 상한 값으로 대체한다.
# def IQR_processing(df):
#     for idx in range(len(df)):
#         data = df.iloc[idx, 4:]
#         Q1 = np.percentile(data, 25)
#         Q3 = np.percentile(data, 75)
#         IQR = Q3 - Q1
#         # 3 x IQR fence (not a 3-sigma rule)
#         outlier = 3*IQR
#         print("{}번째의 Outlier의 기준은 {}이다".format(idx, outlier))
#         if outlier == 0:
#             print("{}번째 step은 건너뛴다".format(idx))
#             continue
#         for i in range(len(data)):
#             if data[i] > Q3 + outlier:
#                 data[i] = np.NaN
#         MAX = np.max(data)
#         print("Outlier외의 최댓값은 {}이다".format(MAX))
#         data.fillna(MAX, inplace = True)
#         df.iloc[idx, 4:] = data
#     return df

# train_data = IQR_processing(train)
# train_data

### train_data 분석
1. 우선 해당 데이터셋은 브랜드별 - 대분류, 중분류, 소분류로 이뤄져 있다.
2. column은 5번째 이후로는 일자별로 브랜드별로 팔리는 정도를 나타낸다.
3. 시계열 데이터의 정상성 check, anomaly detection 진행.

In [None]:
# # # ADF test
# # # 제품 별 ADF 테스트 진행
# # # 귀무가설: 시계열은 정상성이 아니다. 대립가설: 시계열은 정상성이다.
# from statsmodels.tsa.stattools import adfuller
# check_diff = []
# def ADF(v, idx):
#     result = adfuller(v) 
#     print("{}th p-value: {}".format(idx, result[1]))
#     if result[1] > 0.05:
#         check_diff.append(idx)
#         print("{}th index should be manipulated since p-value is {}".format(idx, result[1]))
#     return check_diff

In [None]:
# # KPSS 테스트
# # KPSS 테스트는 ADF 테스트와 귀무가설과 대립가설이 정반대이다.
# # p-value가 0.05 이상이면 귀무가설 채택
# # 귀무가설: 정상 시계열이다, 대립가설: 정상 시계열이 아니다.
# from statsmodels.tsa.stattools import kpss
# check_kpss = []
# nan_idx = []
# def KPSS(v, idx):
#     result = kpss(v)
#     print("{}th p-value: {}".format(idx, result[1]))
#     if result[1] < 0.05:
#         print("{}th index should be manipulated since p-value is {}".format(idx, result[1]))
#         check_kpss.append(idx)
#     return check_kpss

In [None]:
# ## log scale
# check_train = train_data.iloc[:, 4:]
# for i in range(len(check_train)):
#     vec = check_train.iloc[i, :]
#     vec = np.array(vec)
#     for n in range(len(vec)):
#         if vec[n] == 0:
#             vec[n] += 1
#     check_train.iloc[i, :] = np.log(vec)
# train_data = pd.concat([train_data.iloc[:, :4], check_train], axis = 1)
# train_data

In [None]:
# # 1차적으로 raw data에 대하여 ADF 정상성 체크
# check_train = train_data.iloc[:, 4:]
# for idx in range(len(check_train)):
#     vec = check_train.iloc[idx, :]
#     res_ADF = ADF(vec, idx)

In [None]:
# # 1차적으로 raw data에 대하여 KPSS 정상성 체크
# check_train = train_data.iloc[:, 4:]
# for idx in range(len(check_train)):
#     vec = check_train.iloc[idx, :]
#     res_KPSS = KPSS(vec, idx)

In [None]:
# # 각 제품군 중 정상성을 만족하지 않는 경우 
# # 차분을 통해서 우선 해결해본다.
# # 다음은 시각화 코드이다.
# ex = train_data.iloc[0, :]
# data_diff = ex.diff(periods = 1).dropna()
# plt.figure(figsize=(12, 6))
# plt.subplot(2, 1, 1)
# plt.plot(ex)
# plt.title('Original Time Series Data')

# plt.subplot(2, 1, 2)
# plt.plot(data_diff)
# plt.title('1st Order Differenced Data')
# plt.tight_layout()
# plt.show()

In [None]:
# def difference(df, res):
#     for i in res:
#         df.iloc[i, :] = df.iloc[i, :].diff(periods = 1)
#     return df
# first_diff_train = difference(check_train, res_ADF)
# first_diff_train.fillna(0, inplace = True)

In [None]:
# cate_data = train_data.iloc[:, 0:4]
# train_data = pd.concat([cate_data, first_diff_train], axis = 1)

In [None]:
# check_kpss_train = train_data.iloc[:, 4:]
# cols = np.array(check_kpss_train.columns)
# for idx in range(len(cols)):
#     vec = first_diff_train.iloc[:, idx]
#     res_KPSS = KPSS(vec, idx)

In [None]:
# # Data Scaling
# Row-wise min-max scaling of train_data (column 4 onward), applied in place.
# This repeats the scaling loop that already ran earlier in the notebook, now on
# the values left by the preceding cells.
# NOTE(review): scale_max_dict / scale_min_dict are re-created here, so the
# max/min recorded by the first scaling cell are discarded and the final
# inverse-scaling step uses the statistics computed in this cell — confirm that
# this is intended.
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,4:])
    mini = np.min(train_data.iloc[idx,4:])
    
    if maxi == mini :
        # Constant row: store zeros instead of dividing by zero.
        train_data.iloc[idx,4:] = 0
    else:
        train_data.iloc[idx,4:] = (train_data.iloc[idx,4:] - mini) / (maxi - mini)
    
    # Keyed by row position.
    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini

In [None]:
# Display train_data after the second scaling pass.
train_data

In [None]:
# Label Encoding
# Replace each categorical column with integer codes. One encoder object is
# reused and refit for every column.
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']
label_encoder = LabelEncoder()

for col in categorical_columns:
    train_data[col] = label_encoder.fit(train_data[col]).transform(train_data[col])

In [None]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    Build (input window, target window) pairs by sliding a window over each row.

    data : DataFrame whose first 4 columns are the encoded categorical features
           and whose remaining columns are the daily values in time order
    train_size : number of days in each input window
    predict_size : number of days in each target window

    Returns (input_data, target_data):
      input_data  : float array of shape (num_rows * windows_per_row, train_size, 5);
                    the last axis holds the 4 categorical codes followed by the daily value
      target_data : float array of shape (num_rows * windows_per_row, predict_size)
    Windows are laid out row by row: all windows of row 0 first, then row 1, and so on.

    Raises ValueError when there are fewer daily columns than train_size + predict_size.
    '''
    n_meta = 4  # number of leading categorical columns
    num_rows = len(data)
    window_size = train_size + predict_size

    # Count windows on the daily columns only. Counting the categorical columns as
    # well would allocate 4 extra slots per row that the loop below never writes,
    # leaving uninitialised np.empty memory inside the training set.
    windows_per_row = (len(data.columns) - n_meta) - window_size + 1
    if windows_per_row < 1:
        raise ValueError(
            'need at least {} daily columns, got {}'.format(window_size, len(data.columns) - n_meta)
        )

    input_data = np.empty((num_rows * windows_per_row, train_size, n_meta + 1))
    target_data = np.empty((num_rows * windows_per_row, predict_size))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :n_meta])
        sales_data = np.array(data.iloc[i, n_meta:])
        # The categorical block is identical for every window of this row.
        tiled_info = np.tile(encode_info, (train_size, 1))
        base = i * windows_per_row

        for j in range(windows_per_row):
            window = sales_data[j : j + window_size]
            input_data[base + j] = np.column_stack((tiled_info, window[:train_size]))
            target_data[base + j] = window[train_size:]

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    Build the model inputs used to forecast beyond the end of each row.

    data : DataFrame whose first 4 columns are the encoded categorical features,
           followed by the daily values
    train_size : number of trailing daily values taken from each row

    Returns a float array of shape (num_rows, train_size, 5): per row, the 4
    categorical codes repeated for every step plus the last train_size daily values.
    '''
    row_count = len(data)
    feature_count = len(data.iloc[0, :4]) + 1

    input_data = np.empty((row_count, train_size, feature_count))

    for row_idx in tqdm(range(row_count)):
        codes = np.array(data.iloc[row_idx, :4])
        recent = np.array(data.iloc[row_idx, -train_size:])[-train_size : ]

        repeated_codes = np.tile(codes, (train_size, 1))
        input_data[row_idx] = np.column_stack((repeated_codes, recent[:train_size]))

    return input_data

In [None]:
# Sliding-window training pairs built from every row of train_data.
train_input, train_target = make_train_data(train_data)
# Inference inputs come from the same frame: the trailing window of each row.
test_input = make_predict_data(train_data)

In [None]:
# Train / Validation Split
# Hold out the trailing 20% of the window array for validation. make_train_data
# lays windows out row by row, so the held-out part comes from the last rows.
# The split point is computed once: slicing with -0 (when the 20% size rounds
# down to 0) would otherwise put everything into validation and leave training empty.
# This cell rebinds train_input / train_target, so it must not be run twice.
data_len = len(train_input)
split_at = data_len - int(data_len*0.2)
val_input = train_input[split_at:]
val_target = train_target[split_at:]
train_input = train_input[:split_at]
train_target = train_target[:split_at]

In [None]:
# Shapes of the training, validation and inference arrays.
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

In [None]:
class CustomDataset(Dataset):
    """Index-based dataset over an input array and an optional target array.

    With targets, each item is an ``(input, target)`` pair of tensors; when
    ``Y`` is None each item is the input tensor alone.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        # The number of samples is defined by the inputs.
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [None]:
# Training batches are reshuffled every epoch.
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation batches keep their original order.
val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

모델은 Conv1d 레이어와 Bi-Directional GRU를 결합하여 이용

In [None]:
import torch
import torch.nn as nn

class Conv1dGRU(nn.Module):
    """Pointwise Conv1d followed by a bidirectional GRU and an MLP head.

    input_size  : number of features per time step (size of the last input axis)
    hidden_size : GRU hidden size per direction
    output_size : number of values predicted per sample
    window_size : length of the input window; used as the Conv1d channel count

    Input  : (B, window_size, input_size)
    Output : (B, output_size), non-negative because of the final ReLU
    """

    def __init__(self, input_size=5, hidden_size=512, output_size=CFG['PREDICT_SIZE'],
                 window_size=CFG['TRAIN_WINDOW_SIZE']):
        super(Conv1dGRU, self).__init__()
        self.hidden_size = hidden_size
        # The window axis is used as the channel axis: a kernel_size=1 convolution
        # mixes the window_size time steps into 64 channels and leaves the
        # feature axis (length input_size) untouched.
        self.conv1d = nn.Sequential(
            nn.Conv1d(in_channels=window_size, out_channels=64, kernel_size=1),
            nn.ReLU()
        )
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size, output_size)
        )
        self.actv = nn.ReLU()
        torch.nn.init.kaiming_normal_(self.fc[0].weight)
        torch.nn.init.kaiming_normal_(self.fc[3].weight)

    def forward(self, x):
        # x: (B, window_size, input_size) -> (B, 64, input_size)
        x = self.conv1d(x)

        # The GRU treats the 64 conv channels as the sequence axis:
        # gru_out is (B, 64, 2 * hidden_size).
        gru_out, _ = self.gru(x)

        # Only the last sequence position is used.
        # NOTE(review): at this position the backward direction has processed a
        # single step only — confirm that this is acceptable.
        last_output = gru_out[:, -1, :]

        # Fully connected head, clamped to be non-negative.
        output = self.actv(self.fc(last_output))

        # squeeze(1) only has an effect when output_size == 1.
        return output.squeeze(1)


In [None]:
def train(model, optimizer, train_loader, val_loader, device, scheduler, patience=10):
    """Train ``model`` with MSE loss and restore the weights of the best epoch.

    model        : network to train; moved to ``device`` and updated in place
    optimizer    : optimizer already bound to ``model``'s parameters
    train_loader : yields (X, Y) training batches
    val_loader   : yields (X, Y) validation batches
    device       : device used for the model and the batches
    scheduler    : optional scheduler stepped with the validation loss once per
                   epoch; pass None to disable
    patience     : stop after this many consecutive epochs without improvement

    Runs for at most CFG["EPOCHS"] epochs.

    Returns ``model`` loaded with the weights of the epoch that had the lowest
    validation loss, or None if no epoch ever improved on the initial value
    (for example when the validation loss is NaN).
    """
    import copy  # local import: only needed to snapshot the best weights

    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_state = None
    stale_epochs = 0
    for epoch in range(1, CFG["EPOCHS"]+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(train_loader):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)

        if best_loss > val_loss:
            best_loss = val_loss
            # Keep a copy of the weights: holding a reference to the live model
            # would always end up pointing at the last epoch, not the best one.
            best_state = copy.deepcopy(model.state_dict())
            stale_epochs = 0
            print('Model Saved')
        else:
            stale_epochs += 1
        if stale_epochs >= patience:
            print("Early Stopping")
            break

    if best_state is None:
        return None
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    model      : network to evaluate; switched to eval mode and left in it
    val_loader : yields (X, Y) batches
    criterion  : loss function called as ``criterion(output, Y)``
    device     : device the batches are moved to

    The result is the unweighted mean of the batch losses, so a smaller final
    batch counts as much as a full one.
    """
    model.eval()
    val_loss = []

    with torch.no_grad():
        for X, Y in tqdm(val_loader):
            X = X.to(device)
            Y = Y.to(device)
            loss = criterion(model(X), Y)
            val_loss.append(loss.item())
    return np.mean(val_loss)

In [None]:
# Build the model, optimizer and LR scheduler, then run training.
model = Conv1dGRU()
# model = torch.load("./lastmodel_0819.pth")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# Halves the learning rate (down to 1e-6 at the lowest) when the value passed to
# scheduler.step() stops decreasing; train() passes the validation loss.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.5, patience = 5, min_lr = 1e-6, verbose = True)
infer_model = train(model, optimizer, train_loader, val_loader, device, scheduler = scheduler)

In [None]:
# No targets at inference time: with Y=None the dataset yields inputs only.
test_dataset = CustomDataset(test_input, None)
# shuffle=False keeps predictions in the same order as the rows of test_input.
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect its outputs.

    model       : trained network; moved to ``device`` and put in eval mode
    test_loader : yields input batches only (no targets)
    device      : device used for the model and the batches

    Returns a NumPy array with one entry per input sample, in loader order.
    """
    model.to(device)
    # Dropout must be disabled for inference; do not rely on the mode a previous
    # call happened to leave the model in.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(test_loader):
            X = X.to(device)

            output = model(X)

            # Move the batch output to the CPU and collect it sample by sample.
            predictions.extend(output.cpu().numpy())

    return np.array(predictions)

In [None]:
# Forecast for every row of test_input.
# NOTE(review): this passes `model`, not the `infer_model` returned by train().
pred = inference(model, test_loader, device)

In [None]:
# Undo the min-max scaling of the predictions, row by row, using the per-row
# max/min stored in scale_max_dict / scale_min_dict.
# NOTE(review): those dicts were last filled by the second "Data Scaling" cell,
# i.e. from already-transformed values rather than from the data as loaded —
# confirm that the predictions really end up in the original units.
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# Post-processing: round to the nearest whole number and cast to int.
pred = np.round(pred, 0).astype(int)

In [None]:
# Load the submission template and preview its first rows.
submit = pd.read_csv('./sample_submission.csv')
submit.head()

In [None]:
# Write the forecasts into the template before saving; without this assignment
# the file written below is just an unmodified copy of the sample submission.
# Assumes column 0 of the template is the row identifier and the remaining
# columns line up with the columns of `pred` — TODO confirm. A shape mismatch
# raises here instead of silently producing a wrong file.
submit.iloc[:, 1:] = pred
submit.to_csv('./baseline_submit_GRU.csv', index=False)