In [1]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### 하이퍼파라미터 설정

In [2]:
# Mount Google Drive at '/content/drive' (only works inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the training CSV from the mounted Drive folder, then discard the
# 'ID' and '제품' columns so they are not used in later steps.
train_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/DACON/LG_Aimers/train.csv'
).drop(['ID', '제품'], axis=1)

### 숫자 부분을 0~1 범위로 바꾸기
각 행(제품)별로 min-max 스케일링을 적용하여, 판매량 규모가 서로 다른 제품들도 같은 0~1 범위로 학습에 반영되도록 합니다.

In [4]:
# Row-wise min-max scaling of the numeric columns.
# 'ID' and '제품' were dropped at load time, so the numeric columns start at
# position 5; the five leading columns are left untouched.
numeric_cols = train_data.columns[5:]
# Minimum and maximum of each ROW (axis=1), i.e. one pair of values per row.
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# Per-row range (max - min); a range of 0 is replaced by 1 so the division
# below never divides by zero.
ranges = max_values - min_values
ranges = ranges.mask(ranges == 0, 1)
# Apply the scaling: (value - row_min) / row_range.
train_data[numeric_cols] = train_data[numeric_cols].sub(min_values, axis=0).div(ranges, axis=0)
# Keep each row's min and max (keyed by row index) so predictions can be
# mapped back to the original scale later.
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

### 문자 데이터를 숫자형 라벨로 변환

In [5]:
# One LabelEncoder instance is reused for every categorical column.
label_encoder = LabelEncoder()

# Columns whose string categories are converted to integer labels.
categorical_columns = ['대분류', '중분류', '소분류', '브랜드', '쇼핑몰']

# The encoder is refitted on each column in turn and the column is overwritten
# with its integer codes. After the loop the encoder only remembers the
# classes of the last column, so earlier mappings are not retained.
for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

# 데이터 프레임 완성

In [6]:
# Remove the '브랜드' column from train_data.
train_data = train_data.drop(columns=['브랜드'])
# Preview the first rows of the resulting frame.
train_data.head()

Unnamed: 0,대분류,중분류,소분류,쇼핑몰,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-04-15,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24
0,1,6,37,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,7,43,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.111111,0.0,0.111111,0.111111,0.055556,0.0,0.0,0.0
2,2,7,43,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,7,43,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,7,43,9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017606,0.010563,0.007042,0.007042,0.007042,0.0,0.0,0.0,0.0,0.014085


In [7]:
def make_train_data_for_xgb(data, train_size, predict_size):
    '''
    Build (input, target) training pairs by sliding a window over every row.

    data : DataFrame whose first 4 columns are the encoded categorical
           features and whose remaining columns are the daily sales series
    train_size : number of consecutive days used as model input
    predict_size : number of days directly after the input span used as target

    Returns (input_data, target_data), both 2-D arrays:
      input_data  - one row per window: the 4 encoded features followed by
                    train_size sales values
      target_data - the predict_size sales values that follow each input span
    Windows are emitted row by row, and chronologically within a row.
    If a series is shorter than train_size + predict_size no window fits; the
    arrays then have zero rows but still the expected number of columns.
    '''
    n_meta_cols = 4
    window_size = train_size + predict_size  # e.g. 300 + 21 = 321

    # Convert to NumPy once instead of calling .iloc for every row.
    values = data.to_numpy()
    meta = values[:, :n_meta_cols]
    sales = values[:, n_meta_cols:]
    meta_width = meta.shape[1]

    num_rows = len(data)  # number of rows (one sales series per row)
    # Number of window start positions per row; never negative.
    n_windows = max(sales.shape[1] - window_size + 1, 0)

    # Preallocate the outputs so no per-window Python list is needed.
    input_data = np.empty((num_rows * n_windows, meta_width + train_size), dtype=values.dtype)
    target_data = np.empty((num_rows * n_windows, predict_size), dtype=values.dtype)

    for i in tqdm(range(num_rows)):
        if n_windows == 0:
            continue
        # All windows of this row at once: shape (n_windows, window_size).
        windows = np.lib.stride_tricks.sliding_window_view(sales[i], window_size)
        block = slice(i * n_windows, (i + 1) * n_windows)

        # Input: encoded features (broadcast to every window) + first train_size days.
        input_data[block, :meta_width] = meta[i]
        input_data[block, meta_width:] = windows[:, :train_size]

        # Target: the predict_size days that follow the input span.
        target_data[block] = windows[:, train_size:]

    return input_data, target_data

In [8]:
def make_predict_data_for_xgb(data, train_size):
    '''
    Build the model inputs used to forecast the evaluation (test) period.

    data : DataFrame whose first 4 columns are the encoded categorical
           features and whose remaining columns are the daily sales series
    train_size : number of most recent days taken from each series
                 (same length as the input span used for training)

    Returns an array with one entry per row of data: the 4 encoded features
    followed by the last train_size sales values of that row.
    '''
    def build_row(row_idx):
        # Encoded features + the trailing train_size days of this row.
        encoded = data.iloc[row_idx, :4].values
        history = data.iloc[row_idx, 4:].values
        return np.concatenate((encoded, history[-train_size:]))

    return np.array([build_row(row_idx) for row_idx in tqdm(range(len(data)))])

In [9]:
# Window configuration: days of history fed to the model, and days to forecast.
train_size, predict_size = 300, 21

# Sliding-window (input, target) pairs for training.
train_input, train_target = make_train_data_for_xgb(
    data=train_data, train_size=train_size, predict_size=predict_size
)

# Inference inputs: the most recent train_size days of every row.
test_input = make_predict_data_for_xgb(data=train_data, train_size=train_size)

  0%|          | 0/28894 [00:00<?, ?it/s]

  0%|          | 0/28894 [00:00<?, ?it/s]

In [10]:
# Shapes of the training inputs, training targets and inference inputs.
tuple(arr.shape for arr in (train_input, train_target, test_input))

((12395526, 34), (12395526, 21), (28894, 34))

In [27]:
# Inspect a single training window by position.
# NOTE(review): the saved output below has 34 values (4 + 30), which does not
# match train_size = 300 set earlier (4 + 300 = 304) — re-run to refresh it.
train_input[123955]

array([ 1.,  5., 33.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [94]:
# Train / Validation Split
# The last 40% of the windows become the validation set; the rest is kept for
# training. Windows are stored row by row (see make_train_data_for_xgb), so the
# validation set consists of the windows of the last rows of train_data.
#
# NOTE: this cell rebinds train_input / train_target, so it is not idempotent:
# running it twice shrinks the training set again. Rebuild the windows first
# if the cell has to be re-run.
val_ratio = 0.4

# Total number of training windows before the split.
data_len = len(train_input)

# A single explicit split index keeps the four slices consistent and also
# behaves correctly when the validation size rounds down to 0
# (x[-0:] would return everything and x[:-0] nothing).
split_idx = data_len - int(data_len * val_ratio)

val_input = train_input[split_idx:]      # tail -> validation inputs
val_target = train_target[split_idx:]    # tail -> validation targets
train_input = train_input[:split_idx]    # head -> training inputs
train_target = train_target[:split_idx]  # head -> training targets

In [95]:
# Shapes after the train/validation split (train, test and validation arrays).
tuple(arr.shape for arr in (train_input, train_target, test_input, val_input, val_target))

((2756488, 304), (2756488, 21), (28894, 304), (1837658, 304), (1837658, 21))

In [101]:
# XGBoost model training
# (xgboost is already imported as `xgb` in the notebook's import cell.)
dtrain = xgb.DMatrix(train_input, label=train_target)
dval = xgb.DMatrix(val_input, label=val_target)

params = {
    'objective': 'reg:squarederror',  # regression task
    'eval_metric': 'rmse',
    # 'eta' is an alias of 'learning_rate'; it is specified only once here to
    # avoid two entries for the same setting (both were 0.01).
    'learning_rate': 0.01,
    'max_depth': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'alpha': 0.1,
    'lambda': 1,
    'gamma': 0.1,
    'min_child_weight': 3,
    'max_delta_step': 3,
    'scale_pos_weight': 3,
    'booster': 'gbtree',  # tree-based boosting
    'verbosity': 1,
    'seed': 4,
    # NOTE(review): newer XGBoost releases may warn about 'gpu_hist' / 'gpu_id'
    # — confirm against the installed version before changing them.
    'tree_method': 'gpu_hist',
    'gpu_id': 0
}

# Both sets are evaluated every round; the validation set is listed last on
# purpose. NOTE(review): early stopping is expected to monitor the last entry
# — confirm against the xgboost.train documentation.
evals = [(dtrain, 'train'), (dval, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=500, evals=evals, early_stopping_rounds=10)

# Predictions on the validation set, restricted to the best iteration found by
# early stopping so that rounds trained after the best score are not used.
best_iteration = getattr(bst, 'best_iteration', None)
if best_iteration is not None:
    y_pred = bst.predict(dval, iteration_range=(0, best_iteration + 1))
else:
    y_pred = bst.predict(dval)

[0]	train-rmse:0.45508	eval-rmse:0.45684
[1]	train-rmse:0.45087	eval-rmse:0.45260
[2]	train-rmse:0.44670	eval-rmse:0.44841
[3]	train-rmse:0.44257	eval-rmse:0.44426
[4]	train-rmse:0.43849	eval-rmse:0.44015
[5]	train-rmse:0.43445	eval-rmse:0.43609
[6]	train-rmse:0.43045	eval-rmse:0.43208
[7]	train-rmse:0.42650	eval-rmse:0.42810
[8]	train-rmse:0.42258	eval-rmse:0.42417
[9]	train-rmse:0.41871	eval-rmse:0.42028
[10]	train-rmse:0.41488	eval-rmse:0.41643
[11]	train-rmse:0.41110	eval-rmse:0.41262
[12]	train-rmse:0.40735	eval-rmse:0.40885
[13]	train-rmse:0.40364	eval-rmse:0.40513
[14]	train-rmse:0.39998	eval-rmse:0.40144
[15]	train-rmse:0.39635	eval-rmse:0.39779
[16]	train-rmse:0.39276	eval-rmse:0.39418
[17]	train-rmse:0.38920	eval-rmse:0.39061
[18]	train-rmse:0.38569	eval-rmse:0.38708
[19]	train-rmse:0.38222	eval-rmse:0.38359
[20]	train-rmse:0.37878	eval-rmse:0.38013
[21]	train-rmse:0.37538	eval-rmse:0.37671
[22]	train-rmse:0.37201	eval-rmse:0.37333
[23]	train-rmse:0.36869	eval-rmse:0.36999
[2

In [102]:
# Load the sample submission file; its layout is reused for the final output.
submit = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/DACON/LG_Aimers/sample_submission.csv'
)
# Preview the first five rows.
submit.head(5)

Unnamed: 0,ID,2023-04-25,2023-04-26,2023-04-27,2023-04-28,2023-04-29,2023-04-30,2023-05-01,2023-05-02,2023-05-03,...,2023-05-06,2023-05-07,2023-05-08,2023-05-09,2023-05-10,2023-05-11,2023-05-12,2023-05-13,2023-05-14,2023-05-15
0,SAMPLE_00000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SAMPLE_00001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,SAMPLE_00002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SAMPLE_00003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SAMPLE_00004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Predicting the test dataset
dtest = xgb.DMatrix(test_input)

# Use only the trees up to the best early-stopping iteration when it is known.
best_iteration = getattr(bst, 'best_iteration', None)
if best_iteration is not None:
    test_predictions = bst.predict(dtest, iteration_range=(0, best_iteration + 1))
else:
    test_predictions = bst.predict(dtest)

# Undo the row-wise min-max scaling: value * (row_max - row_min) + row_min.
# The per-row min/max are looked up by row position and broadcast over all
# forecast columns at once (computed in float64) instead of a Python loop.
row_ids = range(len(test_predictions))
scale_min = np.array([scale_min_dict[idx] for idx in row_ids], dtype=float)[:, None]
scale_max = np.array([scale_max_dict[idx] for idx in row_ids], dtype=float)[:, None]
test_predictions = test_predictions * (scale_max - scale_min) + scale_min

# Post-processing: round to whole numbers.
# NOTE(review): predictions are not clipped, so negative values remain
# possible — consider np.clip(..., 0, None) if sales counts must be >= 0.
test_predictions = np.round(test_predictions, 0).astype(int)

# Fill every column after 'ID' with the forecasts.
submit.iloc[:, 1:] = test_predictions

submit.head()

Unnamed: 0,ID,2023-04-25,2023-04-26,2023-04-27,2023-04-28,2023-04-29,2023-04-30,2023-05-01,2023-05-02,2023-05-03,...,2023-05-06,2023-05-07,2023-05-08,2023-05-09,2023-05-10,2023-05-11,2023-05-12,2023-05-13,2023-05-14,2023-05-15
0,SAMPLE_00000,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,1
1,SAMPLE_00001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,SAMPLE_00002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
3,SAMPLE_00003,0,0,0,0,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
4,SAMPLE_00004,15,18,17,28,33,24,47,30,29,...,40,59,40,37,33,32,35,28,30,33


In [104]:
# Write the submission to a CSV file in the current working directory, without the index column.
# NOTE(review): the filename mentions "700번" while the training cell uses
# num_boost_round=500 — confirm the name matches the run that produced it.
submit.to_csv('./13차 val:0.4_xgboost_304(쇼핑몰 안 합치기)700번.csv', index=False)

In [105]:
# Frequency of each predicted value on the first forecast date, as a quick sanity check.
submit['2023-04-25'].value_counts()

0      13356
1       4397
2       1826
3       1192
4        824
       ...  
765        1
396        1
495        1
398        1
316        1
Name: 2023-04-25, Length: 556, dtype: int64