## Import

In [336]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [337]:
# import torch

# # GPU 메모리 비우기
# torch.cuda.empty_cache()

# # 새로운 GPU 컨텍스트 생성
# with torch.cuda.device(0):
#     torch.cuda.empty_cache()


In [338]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [339]:
import torch

In [340]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3090'

In [341]:
torch.cuda.is_available()

True

In [342]:
torch.__version__

'2.0.1+cu118'

In [343]:
# torch.cuda.empty_cache()

## Hyperparameter Setting

In [344]:
CFG = {
    'TRAIN_WINDOW_SIZE':105, # 90일치로 학습  초기는 120일이였음 
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':150,
    'LEARNING_RATE':1e-2,
    'BATCH_SIZE':4096,
    'SEED':41
}

In [345]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

### 데이터 불러오기

In [346]:
import pandas as pd
train_data = pd.read_csv('E:/LG/LG_data/train.csv').drop(columns=['ID', '제품'])

In [347]:
train_data.head(2)

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0


In [348]:
import pandas as pd

# Load the 'brand_keyword_cnt.csv' file
train_data = pd.read_csv('E:/LG/LG_data/train.csv')

# Filter rows where the '브랜드명' column is '89232-ED'
filtered_rows = train_data[train_data['브랜드'] == 'B002-00003']

# Print the filtered rows
filtered_rows.head()


Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
4,4,B002-00003-00001,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,B002-00003-00002,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,B002-00003-00003,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0
7,7,B002-00003-00004,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,B002-00003-00005,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,15,...,0,0,0,0,0,0,0,0,0,1


In [349]:
import pandas as pd
brand_keyword_cnt = pd.read_csv('E:/LG/LG_data/brand_keyword_cnt.csv')

In [350]:
brand_keyword_cnt.head(10)

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-00001,0.84131,0.91383,1.45053,2.42239,1.87119,1.58108,1.23295,1.17493,1.14592,...,0.31911,0.39164,0.37713,0.49318,0.07252,0.2901,0.31911,0.23208,0.33362,0.44966
1,B002-00002,12.64868,20.2785,15.33217,12.75021,13.56251,13.70757,11.93791,15.56425,14.08471,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
2,B002-00003,0.33362,0.43516,0.36263,0.17406,0.21758,0.46417,0.42065,0.2901,0.37713,...,0.53669,0.69625,0.44966,0.39164,1.02988,0.49318,0.91383,0.79779,1.01537,0.88482
3,B002-00005,1.07339,1.71163,2.01624,1.9147,1.98723,2.14679,1.68262,1.378,1.42152,...,2.21932,2.50942,2.87206,2.37888,2.03075,1.53756,1.34899,1.26196,2.32085,2.30635
4,B002-00006,0.0,0.0,0.188558,0.246574,0.246574,0.246574,0.377139,0.087012,0.261084,...,0.072526,0.290103,0.087012,0.0,0.130542,0.0,0.0,0.072526,0.217577,0.0
5,B002-00007,0.88482,1.624588,1.914691,1.697114,1.595591,1.711623,2.364356,1.334484,1.711623,...,0.710748,1.42152,1.885695,2.509407,1.668117,0.928326,0.928326,0.88482,1.3925,1.203943
6,B002-00008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,B002-00009,0.232064,0.246574,0.464151,0.377139,0.580207,0.768764,0.797784,0.377139,0.768764,...,0.522191,0.696239,0.8558,0.667242,0.957346,0.667242,0.261084,0.348119,0.812294,0.696239
8,B002-00010,0.33362,0.44966,0.5512,0.52219,0.47867,0.47867,0.37713,0.36263,0.44966,...,0.30461,0.53669,0.68175,1.02988,0.2756,2.61096,2.50942,0.94284,0.92834,1.84218
9,B002-00011,4.3371,6.38236,6.61444,6.29532,6.19379,6.07774,6.04873,6.7595,6.68697,...,6.48389,8.13751,8.123,7.71685,6.46939,6.16478,6.62895,7.25268,7.74586,6.94807


In [351]:
train_data.describe()

Unnamed: 0,ID,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
count,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,...,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0
mean,7944.5,12.887476,10.41888,9.01309,9.190938,11.204216,12.486281,12.933103,12.832599,13.326935,...,0.501699,0.48326,0.473694,9.498175,9.370044,7.706293,7.123033,6.975456,9.04034,10.031026
std,4587.192224,183.612376,149.663362,95.82452,86.274138,92.072773,108.478567,135.615709,233.900622,242.761978,...,8.853648,8.911203,8.567765,93.854552,90.632349,68.413621,68.62935,59.826757,73.637191,103.256072
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3972.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7944.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,11916.75,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,15889.0,15056.0,14320.0,6064.0,4470.0,6370.0,8210.0,9712.0,24512.0,24032.0,...,700.0,660.0,660.0,4308.0,4596.0,3960.0,4224.0,2700.0,3045.0,6048.0


In [352]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15890 entries, 0 to 15889
Columns: 465 entries, ID to 2023-04-04
dtypes: int64(460), object(5)
memory usage: 56.4+ MB


In [353]:
brand_keyword_cnt.head(5)

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-00001,0.84131,0.91383,1.45053,2.42239,1.87119,1.58108,1.23295,1.17493,1.14592,...,0.31911,0.39164,0.37713,0.49318,0.07252,0.2901,0.31911,0.23208,0.33362,0.44966
1,B002-00002,12.64868,20.2785,15.33217,12.75021,13.56251,13.70757,11.93791,15.56425,14.08471,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
2,B002-00003,0.33362,0.43516,0.36263,0.17406,0.21758,0.46417,0.42065,0.2901,0.37713,...,0.53669,0.69625,0.44966,0.39164,1.02988,0.49318,0.91383,0.79779,1.01537,0.88482
3,B002-00005,1.07339,1.71163,2.01624,1.9147,1.98723,2.14679,1.68262,1.378,1.42152,...,2.21932,2.50942,2.87206,2.37888,2.03075,1.53756,1.34899,1.26196,2.32085,2.30635
4,B002-00006,0.0,0.0,0.188558,0.246574,0.246574,0.246574,0.377139,0.087012,0.261084,...,0.072526,0.290103,0.087012,0.0,0.130542,0.0,0.0,0.072526,0.217577,0.0


In [354]:
import pandas as pd

# Load the 'brand_keyword_cnt.csv' file
df_brand_keyword = pd.read_csv('E:/LG/LG_data/brand_keyword_cnt.csv')

# Fill NaN values with 0
df_brand_keyword = df_brand_keyword.fillna(0) 

# Convert float values (excluding the first column) to integers
for col in df_brand_keyword.columns[1:]:
    df_brand_keyword[col] = df_brand_keyword[col].astype(str).str.split('.').str[0].astype(int)

# Divide all values by 10
df_brand_keyword.iloc[:, 1:] = df_brand_keyword.iloc[:, 1:]

# Save the modified DataFrame back to the CSV file
df_brand_keyword.to_csv('brand_keyword_cnt_int.csv', index=False)


In [355]:
brand_keyword_cnt_int = pd.read_csv('E:/LG/brand_keyword_cnt_int.csv')

In [356]:
brand_keyword_cnt_int.tail(10)

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
3160,B002-03789,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3161,B002-03790,14,17,19,17,17,18,16,17,16,...,3,3,3,3,3,3,3,3,3,4
3162,B002-03791,2,3,4,6,5,4,4,3,4,...,3,4,5,4,4,3,2,2,4,4
3163,B002-03792,1,1,1,1,1,1,1,1,1,...,2,4,3,2,2,2,1,3,3,2
3164,B002-03793,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3165,B002-03794,2,2,3,4,3,3,3,2,3,...,2,2,2,2,2,1,1,1,2,1
3166,B002-03795,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3167,B002-03796,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3168,B002-03798,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3169,B002-03799,4,5,6,6,7,6,5,5,6,...,5,6,6,5,4,5,3,4,5,5


In [357]:
import pandas as pd

# 더 작은 값을 추가하여 분모가 0인 경우 방지하는 상수 정의
EPSILON = 1e-8

numeric_colss = brand_keyword_cnt_int.columns[1:]


# 칵 column의 min 및 max 계산
min_valuess = brand_keyword_cnt_int[numeric_colss].min(axis=1)
max_valuess = brand_keyword_cnt_int[numeric_colss].max(axis=1)

# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 EPSILON 더해주기
ranges = max_valuess - min_valuess + EPSILON

# min-max scaling 수행
scaled_data = (brand_keyword_cnt_int[numeric_colss].subtract(min_valuess, axis=0)).div(ranges, axis=0)

# 스케일링된 데이터로 업데이트
brand_keyword_cnt_int[numeric_colss] = scaled_data

# max와 min 값을 dictionary 형태로 저장
scale_min_dicts = min_valuess.to_dict()
scale_max_dicts =  max_valuess.to_dict()



print(brand_keyword_cnt_int.head())

brand_keyword_cnt_int.to_csv('preprocessed_data.csv', index=False)



          브랜드  2022-01-01  2022-01-02  2022-01-03  2022-01-04  2022-01-05  \
0  B002-00001    0.000000    0.000000    0.333333    0.666667    0.333333   
1  B002-00002    0.218750    0.468750    0.312500    0.218750    0.250000   
2  B002-00003    0.000000    0.000000    0.000000    0.000000    0.000000   
3  B002-00005    0.142857    0.142857    0.285714    0.142857    0.142857   
4  B002-00006    0.000000    0.000000    0.000000    0.000000    0.000000   

   2022-01-06  2022-01-07  2022-01-08  2022-01-09  ...  2023-03-26  \
0    0.333333    0.333333    0.333333    0.333333  ...    0.000000   
1    0.250000    0.187500    0.312500    0.281250  ...    0.156250   
2    0.000000    0.000000    0.000000    0.000000  ...    0.000000   
3    0.285714    0.142857    0.142857    0.142857  ...    0.285714   
4    0.000000    0.000000    0.000000    0.000000  ...    0.000000   

   2023-03-27  2023-03-28  2023-03-29  2023-03-30  2023-03-31  2023-04-01  \
0    0.000000    0.000000    0.000000  

In [358]:
brand_keyword_cnt_pro = pd.read_csv('E:/LG/preprocessed_data.csv')
brand_keyword_cnt_pro.tail()

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
3165,B002-03794,0.076923,0.076923,0.153846,0.230769,0.153846,0.153846,0.153846,0.076923,0.153846,...,0.076923,0.076923,0.076923,0.076923,0.076923,0.0,0.0,0.0,0.076923,0.0
3166,B002-03795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3167,B002-03796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3168,B002-03798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3169,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,0.25,0.25,0.375,...,0.25,0.375,0.375,0.25,0.125,0.25,0.0,0.125,0.25,0.25


In [359]:
import pandas as pd
import numpy as np

# Load the two CSV files
df_train = pd.read_csv('E:/LG/LG_data/train.csv').drop(columns=['ID', '제품' ]) # '소분류', '중분류', '대분류'
brand_keyword_cnt_pro = pd.read_csv('preprocessed_data.csv')


# Create a dictionary to store the brand keyword values
brand_keywords = brand_keyword_cnt_pro.set_index('브랜드').to_dict('index')



# Iterate through each row in 'df_train' and perform calculations
result_data = []
for idx, row in df_train.iterrows():
    brand = row['브랜드']
    if brand in brand_keywords:
        keyword_values = brand_keywords[brand]
        multiplied_values = []
        for col in df_train.columns[1:]:
            value = row[col]
            if isinstance(value, (int, float)):  # Check if the value is numeric
                multiplied_values.append(int(value) + keyword_values.get(col, 1))
            else:
                multiplied_values.append(value)  # Handle non-numeric values differently
        result_data.append([brand] + multiplied_values)
    else:
        result_data.append(row.tolist())

# Create a new DataFrame from the result data
result_columns = df_train.columns.tolist()
result_df = pd.DataFrame(result_data, columns=result_columns)

# Save the result to a new CSV file
result_df.to_csv('resultss.csv', index=False)


In [360]:
import pandas as pd
result_data = pd.read_csv('E:/LG/resultss.csv')
result_data.tail()

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
15885,B002-03799,B002-C002-0008,B002-C003-0042,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,...,0.25,0.375,0.375,0.25,0.125,0.25,0.0,0.125,0.25,0.25
15886,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,...,0.25,0.375,0.375,3.25,0.125,2.25,4.0,1.125,1.25,3.25
15887,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,...,0.25,0.375,0.375,0.25,0.125,0.25,0.0,0.125,0.25,0.25
15888,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,...,0.25,0.375,0.375,0.25,0.125,0.25,0.0,0.125,0.25,2.25
15889,B002-03799,B002-C002-0004,B002-C003-0020,B002-03799,0.125,0.25,0.375,0.375,0.5,0.375,...,0.25,0.375,0.375,0.25,0.125,0.25,0.0,0.125,0.25,0.25


### 데이터 전처리

In [361]:
import pandas as pd

# 더 작은 값을 추가하여 분모가 0인 경우 방지하는 상수 정의
EPSILON = 1e-8

numeric_cols = result_data.columns[4:]


# 칵 column의 min 및 max 계산
min_values = result_data[numeric_cols].min(axis=1)
max_values = result_data[numeric_cols].max(axis=1)

# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 EPSILON 더해주기
ranges = max_values - min_values + EPSILON

# min-max scaling 수행
scaled_data = (result_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)

# 스케일링된 데이터로 업데이트
result_data[numeric_cols] = scaled_data

# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()


result_data.tail()



Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
15885,B002-03799,B002-C002-0008,B002-C003-0042,B002-03799,0.000801,0.001601,0.002402,0.002402,0.003203,0.002402,...,0.001601,0.002402,0.002402,0.001601,0.000801,0.001601,0.0,0.000801,0.001601,0.001601
15886,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.001016,0.002033,0.003049,0.003049,0.004065,0.003049,...,0.002033,0.003049,0.003049,0.026423,0.001016,0.018293,0.03252,0.009146,0.010163,0.026423
15887,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.004785,0.009569,0.014354,0.014354,0.019139,0.014354,...,0.009569,0.014354,0.014354,0.009569,0.004785,0.009569,0.0,0.004785,0.009569,0.009569
15888,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.008772,0.017544,0.026316,0.026316,0.035088,0.026316,...,0.017544,0.026316,0.026316,0.017544,0.008772,0.017544,0.0,0.008772,0.017544,0.157895
15889,B002-03799,B002-C002-0004,B002-C003-0020,B002-03799,0.1,0.2,0.3,0.3,0.4,0.3,...,0.2,0.3,0.3,0.2,0.1,0.2,0.0,0.1,0.2,0.2


In [362]:
result_data.tail()

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
15885,B002-03799,B002-C002-0008,B002-C003-0042,B002-03799,0.000801,0.001601,0.002402,0.002402,0.003203,0.002402,...,0.001601,0.002402,0.002402,0.001601,0.000801,0.001601,0.0,0.000801,0.001601,0.001601
15886,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.001016,0.002033,0.003049,0.003049,0.004065,0.003049,...,0.002033,0.003049,0.003049,0.026423,0.001016,0.018293,0.03252,0.009146,0.010163,0.026423
15887,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.004785,0.009569,0.014354,0.014354,0.019139,0.014354,...,0.009569,0.014354,0.014354,0.009569,0.004785,0.009569,0.0,0.004785,0.009569,0.009569
15888,B002-03799,B002-C002-0008,B002-C003-0044,B002-03799,0.008772,0.017544,0.026316,0.026316,0.035088,0.026316,...,0.017544,0.026316,0.026316,0.017544,0.008772,0.017544,0.0,0.008772,0.017544,0.157895
15889,B002-03799,B002-C002-0004,B002-C003-0020,B002-03799,0.1,0.2,0.3,0.3,0.4,0.3,...,0.2,0.3,0.3,0.2,0.1,0.2,0.0,0.1,0.2,0.2


In [363]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류','브랜드'] # '대분류', '중분류', '소분류',

for col in categorical_columns:
    label_encoder.fit(result_data[col])
    result_data[col] = label_encoder.transform(result_data[col])

In [364]:
class CustomDataset(Dataset):
    def __init__(self, data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], is_inference=False):
        self.data = data.values # convert DataFrame to numpy array
        self.train_size = train_size
        self.predict_size = predict_size
        self.window_size = self.train_size + self.predict_size
        self.is_inference = is_inference

    def __len__(self):
        if self.is_inference:
            return len(self.data)
        else:
            return self.data.shape[0] * (self.data.shape[1] - self.window_size - 3)

    def __getitem__(self, idx):
        if self.is_inference:
            # 추론 시
            encode_info = self.data[idx, :4]
            window = self.data[idx, -self.train_size:]
            input_data = np.column_stack((np.tile(encode_info, (self.train_size, 1)), window))
            return input_data
        else:
            # 학습 시
            row = idx // (self.data.shape[1] - self.window_size - 3)
            col = idx % (self.data.shape[1] - self.window_size - 3)
            encode_info = self.data[row, :4]
            sales_data = self.data[row, 4:]
            window = sales_data[col : col + self.window_size]
            input_data = np.column_stack((np.tile(encode_info, (self.train_size, 1)), window[:self.train_size]))
            target_data = window[self.train_size:]
            return input_data, target_data

In [365]:
# def make_result_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
#     '''
#     학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
#     data : 일별 판매량
#     train_size : 학습에 활용할 기간
#     predict_size : 추론할 기간
#     '''
#     num_rows = len(data)
#     window_size = train_size + predict_size
    
#     input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1))
#     target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
#     for i in tqdm(range(num_rows)):
#         encode_info = np.array(data.iloc[i, :4])
#         sales_data = np.array(data.iloc[i, 4:])
        
#         for j in range(len(sales_data) - window_size + 1):
#             window = sales_data[j : j + window_size]
#             temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
#             input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
#             target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
#     return input_data, target_data

In [366]:
# def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
#     '''
#     평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
#     data : 일별 판매량
#     train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
#     '''
#     num_rows = len(data)
    
#     input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1))
    
#     for i in tqdm(range(num_rows)):
#         encode_info = np.array(data.iloc[i, :4])
#         sales_data = np.array(data.iloc[i, -train_size:])
        
#         window = sales_data[-train_size : ]
#         temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
#         input_data[i] = temp_data
    
#     return input_data

In [367]:
from torch.utils.data import Dataset, DataLoader, random_split
# CustomDataset 인스턴스 생성
dataset = CustomDataset(result_data)

# 전체 데이터셋의 크기
total_size = len(dataset)

# 분리할 데이터셋의 크기 계산
train_size = int(total_size * 0.9)
val_size = total_size - train_size

# random_split 함수를 사용해 데이터셋 분리
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoader 인스턴스 생성
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [368]:
result_data.tail()

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
15885,3169,7,41,3169,0.000801,0.001601,0.002402,0.002402,0.003203,0.002402,...,0.001601,0.002402,0.002402,0.001601,0.000801,0.001601,0.0,0.000801,0.001601,0.001601
15886,3169,7,43,3169,0.001016,0.002033,0.003049,0.003049,0.004065,0.003049,...,0.002033,0.003049,0.003049,0.026423,0.001016,0.018293,0.03252,0.009146,0.010163,0.026423
15887,3169,7,43,3169,0.004785,0.009569,0.014354,0.014354,0.019139,0.014354,...,0.009569,0.014354,0.014354,0.009569,0.004785,0.009569,0.0,0.004785,0.009569,0.009569
15888,3169,7,43,3169,0.008772,0.017544,0.026316,0.026316,0.035088,0.026316,...,0.017544,0.026316,0.026316,0.017544,0.008772,0.017544,0.0,0.008772,0.017544,0.157895
15889,3169,3,19,3169,0.1,0.2,0.3,0.3,0.4,0.3,...,0.2,0.3,0.3,0.2,0.1,0.2,0.0,0.1,0.2,0.2


In [369]:
# train_input, train_target = make_result_data(result_data)
# test_input = make_predict_data(result_data)

In [370]:
# data_len = len(train_input)
# val_ratio = 0.1
# test_ratio = 0.1

# val_len = int(data_len * val_ratio)
# test_len = int(data_len * test_ratio)

# val_input = train_input[-val_len:]
# val_target = train_target[-val_len:]


# train_input = train_input[:-val_len - test_len]
# train_target = train_target[:-val_len - test_len]


In [371]:
# train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

### Custom Dataset

In [372]:
# class CustomDataset(Dataset):
#     def __init__(self, X, Y):
#         self.X = X
#         self.Y = Y
        
#     def __getitem__(self, index):
#         if self.Y is not None:
#             return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
#         return torch.Tensor(self.X[index])
    
#     def __len__(self):
#         return len(self.X)

In [373]:
# train_dataset = CustomDataset(train_input, train_target)
# train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# val_dataset = CustomDataset(val_input, val_target)
# val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

### 모델 선언

## GRU 이용

In [374]:
# import torch.nn as nn
# import torch

# class BaseModel(nn.Module):
#     def __init__(self, input_size=5, hidden_size=512, num_layers=2, output_size=CFG['PREDICT_SIZE']):
#         super(BaseModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_size, hidden_size // 2),
#             nn.ReLU(),
#             nn.Dropout(),
#             nn.Linear(hidden_size // 2, output_size)
#         )

#         self.actv = nn.ReLU()

#     def forward(self, x):
#         # x shape: (B, TRAIN_WINDOW_SIZE, 5)
#         batch_size = x.size(0)
#         hidden = self.init_hidden(batch_size, x.device)

#         # GRU layer
#         gru_out, hidden = self.gru(x, hidden)

#         # Only use the last output sequencea
#         last_output = gru_out[:, -1, :]

#         # Fully connected layer
#         output = self.actv(self.fc(last_output))

#         return output.squeeze(1)

#     def init_hidden(self, batch_size, device):
#         # Initialize hidden state for all GRU layers
#         return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)


In [375]:
import torch.nn as nn

class ImprovedLSTMModel(nn.Module):
    def __init__(self, input_size=5, hidden_size=512, num_layers=2, output_size=CFG['PREDICT_SIZE'], dropout_prob=0.2):
        super(ImprovedLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.ln = nn.LayerNorm(hidden_size)  # Layer Normalization
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, output_size)
        )
        self.actv = nn.ReLU()

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)

        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = self.ln(lstm_out)  # Applying Layer Normalization

        last_output = lstm_out[:, -1, :]

        output = self.actv(self.fc(last_output))

        return output.squeeze(1)
    
    def init_hidden(self, batch_size, device):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device),
                torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device))


## 개선된 gru

In [376]:
# class ImprovedModel(nn.Module):
#     def __init__(self, input_size=5, hidden_size=512, num_layers=2, output_size=CFG['PREDICT_SIZE'], dropout_prob=0.2):
#         super(ImprovedModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
#         self.ln = nn.LayerNorm(hidden_size)  # Layer Normalization
#         self.dropout = nn.Dropout(dropout_prob)
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_size, hidden_size // 2),
#             nn.ReLU(),
#             nn.Linear(hidden_size // 2, output_size)
#         )
#         self.actv = nn.ReLU()

#     def forward(self, x):
#         batch_size = x.size(0)
#         hidden = self.init_hidden(batch_size, x.device)

#         gru_out, hidden = self.gru(x, hidden)
#         gru_out = self.dropout(gru_out)
#         gru_out = self.ln(gru_out)  # Applying Layer Normalization

#         last_output = gru_out[:, -1, :]

#         output = self.actv(self.fc(last_output))

#         return output.squeeze(1)
#     def init_hidden(self, batch_size, device):
#         return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)



In [377]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
# from torch.nn import Transformer

# # 트랜스포머 모델 정의
# class TransformerModel(nn.Module):
#     def __init__(self, input_size = 5 , d_model = 512 , nhead =8, num_layers= 2, 
#                 output_size= CFG['PREDICT_SIZE'], dropout_prob = 0.2):
#         super(TransformerModel, self).__init__()
#         self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers)
#         self.ln = nn.LayerNorm(d_model)  # Layer Normalization
#         self.fc = nn.Sequential(
#             nn.Linear(d_model, d_model // 2),
#             nn.ReLU(),
#             nn.Linear(d_model // 2, output_size)
#         )
#         self.actv = nn.ReLU()
#         self.dropout = nn.Dropout(dropout_prob)

#     def forward(self, x):
#         x = self.transformer(x, x)  # Self-attention
#         x = self.ln(x)  # Applying Layer Normalization
#         x = x.mean(dim=1)  # 평균 풀링
#         x = self.dropout(x)
#         output = self.actv(self.fc(x))
#         return output.squeeze(1)

## lstm으로 만든것

In [378]:
# import torch.nn as nn
# import torch

# class ImprovedModel(nn.Module):
#     def __init__(self, input_size=5, hidden_size=512, num_layers=1, output_size=CFG['PREDICT_SIZE']):
#         super(ImprovedModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)  # Using nn.LSTM instead of nn.GRU
#         self.dropout = nn.Dropout(0.2)  # Adding dropout after LSTM
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_size, hidden_size // 2),
#             nn.ReLU(),
#             nn.Linear(hidden_size // 2, output_size)
#         )
#         self.actv = nn.ReLU()  # Using LeakyReLU activation

#     def forward(self, x):
#         batch_size = x.size(0)
#         hidden, cell = self.init_hidden(batch_size, x.device)  # Initializing hidden and cell states for LSTM

#         lstm_out, (hidden, cell) = self.lstm(x, (hidden, cell))  # Using LSTM instead of GRU
#         lstm_out = self.dropout(lstm_out)  # Applying dropout

#         last_output = lstm_out[:, -1, :]

#         output = self.actv(self.fc(last_output))

#         return output.squeeze(1)

#     def init_hidden(self, batch_size, device):
#         return (torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device),
#                 torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device))  # Initializing hidden and cell states for LSTM


In [379]:
def train(model, optimizer, train_loader, val_loader, device, scheduler=None):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_model = None
    early_stopping_counter = 0
    max_early_stopping = 30  # Max number of consecutive epochs with increasing validation loss
    
    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []
        train_mae = []

        for X, Y in tqdm(iter(train_loader)):
            # X = X.to(device)
            # Y = Y.to(device)
            X = X.float().to(device)
            Y = Y.float().to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())


        
        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)  # Update learning rate based on scheduler's policy
        
        if val_loss < best_loss:
            best_loss = val_loss
            best_model = model
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= max_early_stopping:
                print(f'Early stopping triggered at epoch {epoch}')
                break

    return best_model


In [380]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            # X = X.to(device)
            # Y = Y.to(device)
            X = X.float().to(device)
            Y = Y.float().to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
            
    return np.mean(val_loss)

## Run !!

In [381]:
import torch.optim.lr_scheduler as lr_scheduler

model = ImprovedLSTMModel() # BaseModel() 
optimizer = torch.optim.RAdam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, factor=0.7, verbose=True) #  120일로 조정 0.7로
infer_model = train(model, optimizer, train_loader, val_loader, device, scheduler)

  0%|          | 0/1167 [00:00<?, ?it/s]

  1%|          | 7/1167 [01:37<4:30:26, 13.99s/it]


KeyboardInterrupt: 

# LSTM
# RADAM
# 둘 다 합쳐서
# # lr 원래로 복귀  이후 0.7 로 변경  또는 그냐으 0.3으로 진행  -> 와우 lr조정했는데 괜찮은데??

In [None]:
## 다음에는 LSTM으로 해보자

## 메타 데이터를 이용해서 학습한 것을 이용해서 해보고 -> 이후 에포크 증가 시켜보고 -> 이후 전처리 방식을 그 전에 전처리 한것을 더하고 전처리 하는 방식도 해보자 -> 그리고 괜찮은거 같으면 그것을 바탕으로 미세조정해보자

## 0.7 -> 0.2
## 0.2 -> 0.5 

## 모델 추론

In [None]:
# test_dataset = CustomDataset(test_input, None)
# test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
test_dataset = CustomDataset(data = result_data, is_inference=True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            # X = X.to(device)
            X = X.float().to(device)
            
            output = model(X)
            
            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [None]:
# pred = inference(infer_model, test_loader, device)
pred = inference(infer_model, test_loader, device)

100%|██████████| 4/4 [00:01<00:00,  3.71it/s]


In [None]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :]  * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# 결과 후처리
pred = np.round(pred, 0).astype(int)

In [None]:
# # 추론 결과를 inverse scaling 및 후처리
# for idx in range(len(pred)):
#     pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

# # 결과 반올림 및 정수 변환
# pred = np.round(pred, 0).astype(int)

# # 중간 코드 적용: 값에서 1을 빼고, 0보다 작으면 0으로 처리
# for idx in range(len(pred)):
#     pred[idx, :] = pred[idx, :]   # 10 빼기
#     pred[idx, :] = np.where(pred[idx, :] > 10, pred[idx, :] - 10, pred[idx, :])  # 값이 10보다 크면 10 빼기
#     pred[idx, :] = np.where(pred[idx, :] < 10, pred[idx, :] - 1, pred[idx, :])  # 값이 10보다 작으면 1 빼기
#     pred[idx, :] = np.maximum(pred[idx, :], 0)  # 0보다 작으면 0으로 처리


In [None]:
pred.shape

(15890, 21)

## Submission

In [None]:
submit = pd.read_csv('E:/LG/LG_data/sample_submission.csv')
submit.tail(40)

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
15850,15850,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15851,15851,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15852,15852,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15853,15853,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15854,15854,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15855,15855,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15856,15856,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15857,15857,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15858,15858,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15859,15859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 값이 10 보다 크면 10빼고 10보다 작으면 5 빼기로 만들어 보지

In [None]:
submit.iloc[:,1:] = pred
submit.tail(40)

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
15850,15850,10,10,9,8,8,8,8,8,8,...,7,7,7,8,8,8,7,7,7,8
15851,15851,1,2,2,2,2,2,2,2,2,...,2,2,2,2,2,3,2,2,2,2
15852,15852,4,5,6,5,5,5,6,7,7,...,6,6,7,8,8,8,8,7,7,7
15853,15853,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
15854,15854,19,19,16,15,15,14,13,13,13,...,13,12,12,11,11,12,12,12,11,11
15855,15855,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1
15856,15856,7,8,9,9,9,9,8,9,9,...,9,9,9,9,10,10,10,9,9,9
15857,15857,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15858,15858,5,6,7,7,6,5,6,7,7,...,6,6,6,7,7,7,7,6,6,6
15859,15859,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
submit.to_csv('E:/LG/LG_data/models/baseline_submit_4096_512_RAdam_02_LAYER_1_gru_0.7_0.2_105_1e-2_03.csv', index=False)