In [10]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

##
from DLinear import LTSF_DLinear
from DLinear import moving_avg
from utils import Data

In [11]:
data = pd.read_csv("data/train.csv")

# 상품, 회사, 지역 별로 분할
ID_split = data['ID'].str.split('_')
code = []
for l in ID_split:
    code.append(f"{l[0]}_{l[1]}_{l[2]}")
unique_code = set(code)

# df_list에 저장
df_list = {}
for code in unique_code:
    code_parts = code.split('_')
    code_item, code_corporation, code_location = code_parts[0], code_parts[1], code_parts[2]
    
    condition = (data['item'] == code_item) & (data['corporation'] == code_corporation) & (data['location'] == code_location)
    
    df_list[f"data_{code}"] = data[condition]


## prepare dataset for training
train_df = df_list['data_BC_C_J']
test_df = df_list['data_BC_C_J']

train_df = train_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)
test_df = test_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

In [12]:
window_size = 14
forcast_size= 7
batch_size = 32
targets = 'price(원/kg)'
date = 'timestamp'
not_col = 'timestamp'

In [16]:
def standardization(train_df, test_df, not_col, target):
    train_df_ = train_df.copy()
    test_df_ = test_df.copy()
    col =  [col for col in list(train_df.columns) if col not in [not_col]]
    print(col)
    mean_list = []
    std_list = []
    for x in col:
        mean, std = train_df_.agg(["mean", "std"]).loc[:,x]
        mean_list.append(mean)
        std_list.append(std)
        train_df_.loc[:, x] = (train_df_[x] - mean) / std
        test_df_.loc[:, x] = (test_df_[x] - mean) / std
    return train_df_, test_df_, mean_list[col.index(target)], std_list[col.index(target)]

In [17]:
train_df_fe, test_df_fe, mean_, std_ = standardization(train_df, test_df, not_col, targets)

['supply(kg)', 'price(원/kg)']


In [24]:
def time_slide_df(df, window_size, forcast_size, date, target):
    df_ = df.copy()
    data_list = []
    dap_list = []
    date_list = []
    for idx in range(0, df_.shape[0]-window_size-forcast_size+1):
        print(df_.loc[idx:idx+window_size-1, target].values)
        x = df_.loc[idx:idx+window_size-1, target].values.reshape(window_size, 1)
        y = df_.loc[idx+window_size:idx+window_size+forcast_size-1, target].values
        date_ = df_.loc[idx+window_size:idx+window_size+forcast_size-1, date].values
        data_list.append(x)
        dap_list.append(y)
        date_list.append(date_)
    return np.array(data_list, dtype='float32'), np.array(dap_list, dtype='float32'), np.array(date_list)

In [32]:
for idx in range(0, 1503):
    x = train_df_fe.loc[idx:idx+window_size-1, targets]
    print(x)

Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtype: float64)
Series([], Name: price(원/kg), dtyp

In [25]:
train_x, train_y, train_date = time_slide_df(train_df_fe, window_size, forcast_size, date, targets)

[]


ValueError: cannot reshape array of size 0 into shape (14,1)

In [1]:
from datetime import datetime, timedelta

def date_range_to_numeric(start_date, end_date):
    # 시작 날짜와 종료 날짜를 datetime 객체로 변환
    start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
    end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')
    
    # 날짜 범위 계산
    date_list = []
    current_date = start_date_obj
    
    while current_date <= end_date_obj:
        date_list.append(int(current_date.strftime('%Y%m%d')))
        current_date += timedelta(days=1)
    
    return date_list

# 예제 사용
start_date = '2023-11-01'
end_date = '2023-11-05'

numeric_dates = date_range_to_numeric(start_date, end_date)

print(numeric_dates)

[20231101, 20231102, 20231103, 20231104, 20231105]


In [13]:
import torch
# 예시로 생성한 Tensor
my_tensor = torch.Tensor([[[57.9432], [-63.0979], [69.1012]], [[-64.9360], [61.8511], [-0.6243]]])

# 뒤에서부터 61개의 값 추출
last_61_values = my_tensor.reshape(1,-1)

print(last_61_values)
print(last_61_values[:,-3:])


tensor([[ 57.9432, -63.0979,  69.1012, -64.9360,  61.8511,  -0.6243]])
tensor([[-64.9360,  61.8511,  -0.6243]])


In [15]:
time = 0
if 20230305 in [20230305, 20230312, 20230319, 20230326]:
    time += 1

print(time)

1


In [None]:
## 
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import json

##
from DLinear import LTSF_DLinear
from DLinear import moving_avg
from utils import standardization
from utils import time_slide_df
from utils import Data
from utils import data_split
from utils import date_range_to_numeric



# data_loading
data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# 상품, 회사, 지역 별로 분할하여 df_list에 저장
# unique code = {'CB_E_J', 'TG_A_S', 'TG_D_S', 'CB_A_S', 'RD_C_S', 'BC_E_J', 'CR_D_S', 'BC_C_J', 'RD_D_S', 'BC_A_J', 'BC_C_S', 'TG_C_J', 'TG_D_J', 
#                'TG_C_S', 'BC_D_J', 'TG_E_S', 'RD_D_J', 'CR_B_J', 'RD_A_J', 'TG_B_J', 'BC_A_S', 'CR_A_J', 'TG_B_S', 'CR_E_S', 'TG_E_J', 'CR_D_J', 
#                'TG_A_J', 'RD_E_J', 'BC_B_S', 'RD_F_J', 'CR_E_J', 'CB_A_J', 'BC_B_J', 'CR_C_J', 'CB_F_J', 'RD_E_S', 'RD_A_S', 'CB_D_J', 'BC_E_S'}

data_list = data_split(data)
test_list = data_split(test)

unique_code = ['BC_C_J', 'TG_B_J', 'CR_B_J', 'RD_E_S', 'BC_A_J', 'CB_F_J', 'RD_D_J', 'TG_A_S', 'BC_E_S', 'CR_D_J', 'BC_A_S', 'BC_B_S', 'TG_E_J', 
               'CR_E_S', 'RD_F_J', 'BC_E_J', 'TG_A_J', 'CR_C_J', 'CR_D_S', 'TG_C_J', 'CB_A_S', 'TG_D_J', 'CR_E_J', 'RD_C_S', 'BC_C_S', 'CB_E_J', 
               'RD_E_J', 'BC_D_J', 'CR_A_J', 'TG_E_S', 'TG_C_S', 'TG_D_S', 'RD_A_S', 'RD_A_J', 'RD_D_S', 'TG_B_S', 'CB_D_J', 'CB_A_J', 'BC_B_J']


pred = {}

# for i in tqdm(range(len(unique_code))):
for i in tqdm(range(len(unique_code))):

    ## prepare dataset for training
    dataset_code = unique_code[i]

    train_df = data_list[f'data_{dataset_code}'].reset_index(drop=True)
    test_df = test_list[f'data_{dataset_code}'].reset_index(drop=True)
    print(len(train_df))

    # train_df = train_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)
    # test_df = test_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)



    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
    print("Dataset prepared")
    print()

    ## paramaters
    window_size = 7
    forcast_size= 1
    batch_size = 32
    targets = 'price(원/kg)'
    date = 'timestamp'
    not_col = 'timestamp'

    print("Start preprocessing")
    print()
    # train_df_fe, test_df_fe, mean_, std_ = standardization(train_df, test_df, not_col, targets)
    # train_x, train_y, train_date = time_slide_df(train_df_fe, window_size, forcast_size, date, targets)
    # test_x, test_y, test_date = time_slide_df(test_df_fe, window_size, forcast_size, date, targets)

    train_x, train_y, train_date = time_slide_df(train_df, window_size, forcast_size, date, targets)
    # test_x, test_y, test_date = time_slide_df(test_df, window_size, forcast_size, date, targets)


    train_ds = Data(train_x[:1400], train_y[:1400])
    valid_ds = Data(train_x[1400:], train_y[1400:])
    # test_ds = Data(test_x, test_y)

    train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle=True,)
    valid_dl = DataLoader(valid_ds, batch_size = train_x[1400:].shape[0], shuffle=False)
    # test_dl  = DataLoader(test_ds,  batch_size = test_x.shape[0], shuffle=False)
    print("Success preprocessing")


    print("Paramater for training")
    print()
    train_loss_list = []
    valid_loss_list = []
    test_loss_list = []
    epochs = 50
    lr = 0.001
    DLinear_model = LTSF_DLinear(
                                window_size=window_size,
                                forcast_size=forcast_size,
                                kernel_size=25,
                                individual=False,
                                feature_size=1,
                                )
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
    max_loss = 999999999

    print("strat training")
    print()


    for epoch in tqdm(range(1, epochs+1)):
        loss_list = []
        DLinear_model.train()
        for batch_eeeeidx, (data, target) in enumerate(train_dl):
            optimizer.zero_grad()
            output = DLinear_model(data)
            loss = criterion(output, target.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            loss_list.append(np.sqrt(loss.item()))  
        train_loss_list.append(np.sqrt(np.mean(loss_list)))

        DLinear_model.eval()
        with torch.no_grad():
            for data, target in valid_dl:
                output = DLinear_model(data)
                # if epoch == 100:
                #     temp = output.reshape(1,-1)
                #     temp1 = temp[:,-61:].reshape(1,61,1)
                #     pred[dataset_code] = DLinear_model(temp1).reshape(1,-1).numpy()
                valid_loss = np.sqrt(criterion(output, target.unsqueeze(-1)).item())
                valid_loss_list.append(np.sqrt(valid_loss.item()))
            
        if epoch == 50:
            data = train_y[-7:].reshape(1,-1)
            pred_value = np.array([])
            for i in range(28):
                pred_input = torch.Tensor(data).reshape(1,7,1)
                pred_output = DLinear_model(pred_input)
                data = np.append(data,pred_output.item())
                pred_value = np.append(pred_value,pred_output.item())
                data = data[1:]

            pred[dataset_code] = pred_value
            

            # for data, target in test_dl:
            #     output = DLinear_model(data)
            #     test_loss = criterion(output, target.unsqueeze(-1))
            #     test_loss_list.append(test_loss)

        # if valid_loss < max_loss:
        #     torch.save(DLinear_model, f'DLinear_model_{dataset_code}.pth')
        #     max_loss = valid_loss
        #     # print("valid_loss={:.3f}, test_los{:.3f}, Model Save".format(valid_loss, test_loss))
        #     print("valid_loss={:.3f}, Model Save".format(valid_loss))
        #     dlinear_best_epoch = epoch
        #     dlinear_best_train_loss = np.mean(loss_list)
        #     dlinear_best_valid_loss = np.mean(valid_loss.item())

        # print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}, test_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss, test_loss))
        
        if epoch % 20 == 0:
            print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss))





In [None]:
## 
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import json

##
from DLinear import LTSF_DLinear
from DLinear import moving_avg
from NLinear import LTSF_NLinear
from utils import standardization
from utils import time_slide_df
from utils import Data
from utils import data_split
from utils import date_range_to_numeric



# data_loading
data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# 상품, 회사, 지역 별로 분할하여 df_list에 저장
# unique code = {'CB_E_J', 'TG_A_S', 'TG_D_S', 'CB_A_S', 'RD_C_S', 'BC_E_J', 'CR_D_S', 'BC_C_J', 'RD_D_S', 'BC_A_J', 'BC_C_S', 'TG_C_J', 'TG_D_J', 
#                'TG_C_S', 'BC_D_J', 'TG_E_S', 'RD_D_J', 'CR_B_J', 'RD_A_J', 'TG_B_J', 'BC_A_S', 'CR_A_J', 'TG_B_S', 'CR_E_S', 'TG_E_J', 'CR_D_J', 
#                'TG_A_J', 'RD_E_J', 'BC_B_S', 'RD_F_J', 'CR_E_J', 'CB_A_J', 'BC_B_J', 'CR_C_J', 'CB_F_J', 'RD_E_S', 'RD_A_S', 'CB_D_J', 'BC_E_S'}

data_list = data_split(data)
test_list = data_split(test)

unique_code = ['BC_C_J', 'TG_B_J', 'CR_B_J', 'RD_E_S', 'BC_A_J', 'CB_F_J', 'RD_D_J', 'TG_A_S', 'BC_E_S', 'CR_D_J', 'BC_A_S', 'BC_B_S', 'TG_E_J', 
               'CR_E_S', 'RD_F_J', 'BC_E_J', 'TG_A_J', 'CR_C_J', 'CR_D_S', 'TG_C_J', 'CB_A_S', 'TG_D_J', 'CR_E_J', 'RD_C_S', 'BC_C_S', 'CB_E_J', 
               'RD_E_J', 'BC_D_J', 'CR_A_J', 'TG_E_S', 'TG_C_S', 'TG_D_S', 'RD_A_S', 'RD_A_J', 'RD_D_S', 'TG_B_S', 'CB_D_J', 'CB_A_J', 'BC_B_J']


pred = {}

# for i in tqdm(range(len(unique_code))):
for i in tqdm(range(len(unique_code))):

    ## prepare dataset for training
    dataset_code = unique_code[i]

    train_df = data_list[f'data_{dataset_code}'].reset_index(drop=True)
    test_df = test_list[f'data_{dataset_code}'].reset_index(drop=True)
    print(len(train_df))

    # train_df = train_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)
    # test_df = test_df.drop(['ID', 'item', 'corporation', 'location'], axis=1)



    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
    print("Dataset prepared")
    print()

    ## paramaters
    window_size = 7
    forcast_size= 1
    batch_size = 32
    targets = 'price(원/kg)'
    date = 'timestamp'
    not_col = 'timestamp'

    print("Start preprocessing")
    print()
    # train_df_fe, test_df_fe, mean_, std_ = standardization(train_df, test_df, not_col, targets)
    # train_x, train_y, train_date = time_slide_df(train_df_fe, window_size, forcast_size, date, targets)
    # test_x, test_y, test_date = time_slide_df(test_df_fe, window_size, forcast_size, date, targets)

    train_x, train_y, train_date = time_slide_df(train_df, window_size, forcast_size, date, targets)
    # test_x, test_y, test_date = time_slide_df(test_df, window_size, forcast_size, date, targets)


    train_ds = Data(train_x[:1400], train_y[:1400])
    valid_ds = Data(train_x[1400:], train_y[1400:])
    # test_ds = Data(test_x, test_y)

    train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle=True,)
    valid_dl = DataLoader(valid_ds, batch_size = train_x[1400:].shape[0], shuffle=False)
    # test_dl  = DataLoader(test_ds,  batch_size = test_x.shape[0], shuffle=False)
    print("Success preprocessing")


    print("Paramater for training")
    print()
    train_loss_list = []
    valid_loss_list = []
    test_loss_list = []
    epochs = 100
    lr = 0.001
    NLinear_model = LTSF_NLinear(
                                window_size = window_size, 
                                forcast_size = forcast_size, 
                                individual = True, 
                                feature_size = 1
                                )
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(NLinear_model.parameters(), lr=lr)
    max_loss = 999999999

    print("strat training")
    print()


    for epoch in tqdm(range(1, epochs+1)):
        loss_list = []
        NLinear_model.train()
        for batch_eeeeidx, (data, target) in enumerate(train_dl):
            optimizer.zero_grad()
            output = NLinear_model(data)
            loss = criterion(output, target.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            loss_list.append(np.sqrt(loss.item()))  
        train_loss_list.append(np.sqrt(np.mean(loss_list)))

        NLinear_model.eval()
        with torch.no_grad():
            for data, target in valid_dl:
                output = NLinear_model(data)
                valid_loss = np.sqrt(criterion(output, target.unsqueeze(-1)).item())
                valid_loss_list.append(np.sqrt(valid_loss.item()))
            
        if epoch == epochs:
            data = train_y[-7:].reshape(1,-1)
            pred_value = np.array([])
            for i in range(28):
                pred_input = torch.Tensor(data).reshape(1,7,1)
                pred_output = NLinear_model(pred_input)
                data = np.append(data,pred_output.item())
                pred_value = np.append(pred_value,pred_output.item())
                data = data[1:]

            pred[dataset_code] = pred_value

            # for data, target in test_dl:
            #     output = NLinear_model(data)
            #     test_loss = criterion(output, target.unsqueeze(-1))
            #     test_loss_list.append(test_loss)

        # if valid_loss < max_loss:
        #     torch.save(NLinear_model, f'NLinear_model_{dataset_code}.pth')
        #     max_loss = valid_loss
        #     # print("valid_loss={:.3f}, test_los{:.3f}, Model Save".format(valid_loss, test_loss))
        #     print("valid_loss={:.3f}, Model Save".format(valid_loss))
        #     dlinear_best_epoch = epoch
        #     dlinear_best_train_loss = np.mean(loss_list)
        #     dlinear_best_valid_loss = np.mean(valid_loss.item())

        # print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}, test_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss, test_loss))
        
        if epoch % 20 == 0:
            print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss))

In [6]:
## Storing result

date_col = []
date = 20230304
for i in range(28):
    date_col.append(date + i)
print(date_col)



result = []
for i in range(len(unique_code)):
    pred_adj = pred[unique_code[i]]
    for j in range(28):
        code = unique_code[i]
        final = pred_adj[j]
        if date_col[j] in [20230305, 20230312, 20230319, 20230326]:
            final = 0
        else:
            final = final

        code_time = f"{unique_code[i]}_{date_col[j]}"
        result.append([code_time, final])
    
    
result_df = pd.DataFrame(result, columns = ['ID', 'pred'])
submission = pd.read_csv("data/sample_submission.csv")


# 'submission' 데이터프레임과 'result_df' 데이터프레임을 'ID'를 기준으로 병합
final_submission = submission.merge(result_df, on='ID', how='left')

# 'pred' 값을 'answer' 열에 복사
final_submission['answer'] = final_submission['pred']

# 'pred' 열 삭제
final_submission.drop('pred', axis=1, inplace=True)

# 마이너스인 경우 0으로 대체
final_submission['answer'] = final_submission['answer'].apply(lambda x: max(x, 0))

# 결과 데이터프레임 출력
print(final_submission.head())


# 업데이트된 데이터프레임을 새로운 CSV 파일로 저장
final_submission.to_csv('csv/NLinear_7_1_vanila.csv', index=False)

[20230304, 20230305, 20230306, 20230307, 20230308, 20230309, 20230310, 20230311, 20230312, 20230313, 20230314, 20230315, 20230316, 20230317, 20230318, 20230319, 20230320, 20230321, 20230322, 20230323, 20230324, 20230325, 20230326, 20230327, 20230328, 20230329, 20230330, 20230331]
                ID       answer
0  TG_A_J_20230304  2806.498047
1  TG_A_J_20230305     0.000000
2  TG_A_J_20230306  2859.436523
3  TG_A_J_20230307  3250.026855
4  TG_A_J_20230308  3188.229248


In [35]:
# CR_D_S, CB_A_S, BC_B_S, BC_C_S, CR_E_S, RD_C_S

submission = pd.read_csv("csv/DLinear_7_1_vanila.csv")
submission.loc[submission['ID'].str.contains('CR_D_S'), 'answer'] = 0
submission.loc[submission['ID'].str.contains('CB_A_S'), 'answer'] = 0
submission.loc[submission['ID'].str.contains('BC_B_S'), 'answer'] = 0
submission.loc[submission['ID'].str.contains('BC_C_S'), 'answer'] = 0
submission.loc[submission['ID'].str.contains('CR_E_S'), 'answer'] = 0
submission.loc[submission['ID'].str.contains('RD_C_S'), 'answer'] = 0

submission.to_csv('csv/DLinear_7_1_vanila_rm0.csv',index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# 모델 초기화
model = Net()

# 손실 함수 초기화
criterion = nn.CrossEntropyLoss()

# 옵티마이저 초기화
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# 조기 종료 변수 초기화
early_stopping_epochs = 5
best_loss = float('inf')
early_stop_counter = 0

# 학습 루프
for epoch in range(100):
    # 학습
    model.train()
    train_loss = 0.0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)

    # 검증
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for data, target in valid_loader:
            output = model(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # 검증 데이터셋의 손실이 이전보다 증가하는 경우
    if valid_loss > best_loss:
        early_stop_counter += 1
    else:
        best_loss = valid_loss
        early_stop_counter = 0

    # 조기 종료 조건 확인
    if early_stop_counter >= early_stopping_epochs:
        print("Early Stopping!")
        break
