In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import datetime
from datetime import timedelta
# !pip install holidays
import holidays

In [2]:
# Switch the working directory to the project folder so the relative data
# path below resolves.
# NOTE(review): hard-coded absolute Colab/Drive path — presumably Drive must be mounted first; confirm.
os.chdir("/content/drive/MyDrive/3. Grad School/LG Aimers")

# Load the raw training data.
data = pd.read_csv("DATA/train/train.csv")

#### 파생변수 생성

In [3]:
# Lookup table: month number (1-12) -> season name.
# Built from (season, months) groups so each season is listed exactly once.
month_to_season = {
    month: season
    for season, months in (
        ("Winter", (1, 2, 12)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
    )
    for month in months
}

In [4]:
# Lookup table: month number (1-12) -> weight, listed January through December.
monthly_weights = dict(zip(
    range(1, 13),
    (1.5, 1.5, 0.5,
     0.8, 0.8, 0.8,
     1.2, 1.2, 0.8,
     0.8, 0.8, 1.2),
))

In [5]:
# Lookup table: English weekday name -> weight.
weekly_weights = dict(
    Monday = 0.8,
    Tuesday = 0.8,
    Wednesday = 0.8,
    Thursday = 0.8,
    Friday = 1.2,
    Saturday = 1.7,
    Sunday = 1.5,
)

In [26]:
class Make_Variables():
    """Feature-engineering helpers for the daily sales data.

    Works on long-format frames that carry the columns '영업일자' (business
    date), '영업장명_메뉴명' (store_menu key) and, for training data,
    '매출수량' (quantity sold). The public entry points are
    ``make_variables_train`` (features for historical rows) and
    ``make_variables_test`` (features for the days to be predicted).
    """

    def __init__(self, data = None, date = None, predict = 7, month_to_season = None, monthly_weights = None, weekly_weights = None):
        """Store the optional configuration on the instance.

        The feature-building methods take their inputs as arguments; the
        attributes kept here are not read by them.
        """
        self.data = data
        self.date = date
        self.predict = predict
        self.month_to_season = month_to_season
        self.monthly_weights = monthly_weights
        self.weekly_weights = weekly_weights

    def update_kor_holidays(self):
        """Return the Korean holiday calendar for 2023-2025 plus manually added days."""
        kor_holidays = holidays.KR(years = [2023, 2024, 2025])
        kor_holidays.update({
            datetime.date(2024,10,1) : "Temporary Holiday", # Armed Forces Day temporary holiday
            datetime.date(2025,1,27) : "Temporary Holiday", # Lunar New Year temporary holiday
            datetime.date(2025,3,3) : "Temporary Holiday", # substitute holiday for March 1st
            datetime.date(2025, 5, 29) : "Election Period",
            datetime.date(2025, 5, 30) : "Election Period",
            datetime.date(2025, 6, 3) : "Presidential Election Day"})
        return kor_holidays

    def check_holidays(self, date, kor_holidays):
        """Tell whether a date (or each date in a Series) is a holiday or a weekend.

        Args:
            date: a single timestamp-like object exposing ``.date()`` and
                ``.weekday()``, or a datetime ``pd.Series``.
            kor_holidays: holiday calendar supporting ``in`` lookups by date.

        Returns:
            A bool for a single date, or a boolean Series for a Series input.
        """
        if isinstance(date, pd.Series):
            check_holiday = date.dt.date.isin(kor_holidays)
            check_weekend = date.dt.weekday >= 5
        else:
            check_holiday = date.date() in kor_holidays
            check_weekend = date.weekday() >= 5
        is_holiday = (check_holiday | check_weekend)
        return is_holiday

    def get_sandwich_score(self, idx, is_holiday_col) -> int:
        """Score how "sandwiched" position ``idx`` is, using a 0/1 holiday column.

        Returns 3 when both direct neighbours are holidays; otherwise looks
        two positions each way (excluding ``idx`` itself) and returns 2 for
        three holidays, 1 for two holidays, else 0. Positions too close to
        either end of the column to have the needed neighbours score 0.
        """
        score = 0
        # Both the previous and the next position are holidays -> 3 points.
        if idx > 0 and idx < len(is_holiday_col) - 1:
            if (is_holiday_col[idx - 1] == 1) and (is_holiday_col[idx + 1] == 1):
                score = 3
        # Two positions each way: 3 holidays -> 2 points, 2 holidays -> 1 point.
        if score == 0 and idx > 1 and idx < len(is_holiday_col) - 2:
            start_idx = idx - 2
            end_idx = idx + 2
            nearby_holidays = is_holiday_col[start_idx : end_idx + 1].sum() - is_holiday_col[idx]
            if nearby_holidays == 3:
                score = 2
            elif nearby_holidays == 2:
                score = 1
        return score

    def get_sandwich_score_for_dates(self, date, kor_holidays) -> int:
        """Compute the sandwich score for one calendar date.

        Same scoring as ``get_sandwich_score`` (3 / 2 / 1 / 0), but the
        neighbouring days are derived from the calendar instead of from
        positions in a column, so it also works at the edges of a data set.
        Whether ``date`` itself is a holiday is not considered.
        """
        # One day each way.
        prev_date, next_date = date - timedelta(days = 1), date + timedelta(days = 1)
        prev_hol, next_hol = self.check_holidays(prev_date, kor_holidays), self.check_holidays(next_date, kor_holidays)
        if prev_hol and next_hol:
            return 3
        days_offsets = [-2, -1, 1, 2]
        nearby_holidays = sum(self.check_holidays(date + timedelta(days = d), kor_holidays) for d in days_offsets)
        if nearby_holidays == 3:
            return 2
        elif nearby_holidays == 2:
            return 1
        else:
            return 0

    def get_month_weights(self, data = None, monthly_weights = monthly_weights):
        """Add a 'month_weight' column by mapping 'month' through ``monthly_weights``.

        Returns the (mutated) frame, or None when ``data`` is None.
        """
        if data is not None:
            data['month_weight'] = data['month'].map(monthly_weights)
            return data

    def get_week_weights(self, data = None, weekly_weights = weekly_weights):
        """Add a 'week_weight' column by mapping 'weekday' through ``weekly_weights``.

        Returns the (mutated) frame, or None when ``data`` is None.
        """
        if data is not None:
            data['week_weight'] = data['weekday'].map(weekly_weights)
            return data

    def get_prev_days_mean(self, data, test_df = None, date = None, menu = None, howmany = 7):
        """Write the mean quantity of the preceding ``howmany`` days into one week of rows.

        For a Sunday ``date`` the mean of '매출수량' over
        ``[date - howmany days, date - 1 day]`` for ``menu`` is computed and
        stored in column ``prev_avg_{howmany}`` for that menu's rows of the
        week ``[date, date + 6 days]`` in ``data``. A missing mean (no earlier
        rows) is stored as 0.

        Args:
            data: frame that receives the column (mutated in place).
            test_df: optional frame to read the past sales from. When None the
                past sales are read from ``data`` itself (training case); when
                given, ``data`` holds the future rows and ``test_df`` the
                observed history (prediction case).
            date: the Sunday that starts the target week.
            menu: value of '영업장명_메뉴명' to restrict to.
            howmany: number of preceding days to average.

        Returns:
            ``data``. For a non-Sunday ``date`` the frame is returned
            unchanged (previously ``np.nan`` was returned, which replaced the
            caller's frame when assigned back).
        """
        # Weeks are anchored on Sunday (weekday() == 6); ignore anything else.
        if date.weekday() != 6:
            return data

        # History comes from the frame itself unless a separate one is supplied.
        source = data if test_df is None else test_df

        prev_start = date - timedelta(days = howmany)
        prev_end = date - timedelta(days = 1)
        prev_data = source[(source['영업일자'] >= prev_start) & (source['영업일자'] <= prev_end) & (source['영업장명_메뉴명'] == menu)]
        prev_avg = prev_data['매출수량'].mean()
        # No history (e.g. the very first week) -> use 0.
        if pd.isna(prev_avg):
            prev_avg = 0

        week_end = date + timedelta(days = 6)
        curr_mask = (data['영업일자'] >= date) & (data['영업일자'] <= week_end) & (data['영업장명_메뉴명'] == menu)
        colname = f"prev_avg_{howmany}"
        data.loc[curr_mask, colname] = prev_avg
        return data

    # Shared by train and test.
    def make_fund_variables(self, data, month_to_season = month_to_season):
        """Add the basic calendar and holiday features to ``data`` (mutated in place).

        Adds year / month / day / weekday columns, season, a year offset from
        2023, sin/cos encodings of month, day and weekday, a 0/1 'is_holiday'
        flag (public holiday or weekend) and 'holiday_name'.
        """
        # '영업일자' -> datetime
        data['영업일자'] = pd.to_datetime(data['영업일자'])

        # Split into year, month, day, weekday.
        data['year'] = data['영업일자'].dt.year
        data['month'] = data['영업일자'].dt.month
        data['day'] = data['영업일자'].dt.day
        data['weekday'] = data['영업일자'].dt.day_name()
        data['weekday_enc'] = data['영업일자'].dt.weekday

        # Season.
        data['season'] = data['month'].map(month_to_season)

        # Years elapsed since 2023.
        data['year_enc'] = data['year'] - 2023

        # Cyclical encodings of month, day and weekday.
        data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
        data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

        data['day_sin'] = np.sin(2 * np.pi * data['day'] / 31)
        data['day_cos'] = np.cos(2 * np.pi * data['day'] / 31)

        data['weekday_sin'] = np.sin(2 * np.pi * data['weekday_enc'] / 7)
        data['weekday_cos'] = np.cos(2 * np.pi * data['weekday_enc'] / 7)

        # Holiday flag: public holiday or weekend.
        kor_holidays = self.update_kor_holidays()
        check_holiday = data['영업일자'].dt.date.isin(kor_holidays)
        check_weekend = data['weekday'].isin(['Saturday', 'Sunday'])
        data['is_holiday'] = (check_holiday | check_weekend).astype(int)
        data['holiday_name'] = data['영업일자'].dt.date.map(kor_holidays)

        return data

    # Features for the training (input) data.
    def make_variables_train(self, data):
        """Build every training feature and clean negative quantities.

        Steps, in order: basic calendar features, sandwich-day flags, month
        and weekday weights, previous-7/14-day means per menu, store / menu
        name split, and finally the negative-quantity correction.

        Returns the processed frame (the input frame is mutated).
        """
        data = self.make_fund_variables(data)
        kor_holidays = self.update_kor_holidays()

        ### Sandwich days
        # Row-wise neighbours: a non-holiday row whose previous and next rows
        # are both holidays. NOTE(review): this compares adjacent ROWS, so it
        # presumably relies on rows being sorted by menu then date with no
        # gaps; np.roll also wraps around at the ends — confirm. The rows of
        # the first and last date are overwritten below with a calendar-based
        # score (0-3) rather than a 0/1 flag.
        check_sandwich = data['is_holiday'].astype(int).values
        prev_flag, next_flag = np.roll(check_sandwich, 1), np.roll(check_sandwich, -1)
        sandwich = (prev_flag == 1) & (check_sandwich == 0) & (next_flag == 1)
        data['is_sandwich'] = sandwich.astype(int)

        # Sandwich score for the first date.
        first = data['영업일자'].min()
        data.loc[data['영업일자'] == first, 'is_sandwich'] = self.get_sandwich_score_for_dates(first, kor_holidays)

        # Sandwich score for the last date.
        last = data['영업일자'].max()
        data.loc[data['영업일자'] == last, 'is_sandwich'] = self.get_sandwich_score_for_dates(last, kor_holidays)

        # Holiday flag that also counts sandwich days.
        data['is_holiday_sandwich'] = data['is_holiday'] | (data['is_sandwich'] > 0).astype(int)

        ### Month weights
        data = self.get_month_weights(data, monthly_weights)

        ### Weekday weights
        data = self.get_week_weights(data, weekly_weights)

        ### Mean of the preceding 7 / 14 days, anchored on each Sunday
        # NOTE(review): these means are computed before the negative-quantity
        # correction further down, so they are based on the uncorrected values.
        sundays = data[data['weekday'] == "Sunday"][["영업일자", "영업장명_메뉴명"]].copy()
        for _, row in sundays.iterrows():
            date = row['영업일자']
            menu = row['영업장명_메뉴명']
            data = self.get_prev_days_mean(data = data, date = date, menu = menu, howmany = 7)
            data = self.get_prev_days_mean(data = data, date = date, menu = menu, howmany = 14)

        ### Split store name and menu name
        if '영업장명_메뉴명' in data.columns:
            # n = 1: split on the first underscore only, so a menu name that
            # itself contains '_' still yields exactly two columns.
            data[['영업장명', '메뉴명']] = data['영업장명_메뉴명'].str.split('_', n = 1, expand = True)

        ### Negative quantities
        negative = data[data['매출수량'] < 0]

        for idx, row in negative.iterrows():
            num = row['매출수량']
            if num < -10:
                date = row['영업일자']
                menu = row['영업장명_메뉴명']
                prev_date = pd.to_datetime(date) - pd.Timedelta(days = 1)
                prev_row = data[(data['영업일자'] == prev_date) & (data['영업장명_메뉴명'] == menu)]

                # Nothing to offset against when the previous day is absent
                # (e.g. the negative value falls on the first date).
                if prev_row.empty:
                    continue

                # Offset the large negative value against the previous day's
                # quantity when that day sold at least as much.
                if prev_row.iloc[0]["매출수량"] >= abs(num):
                    data.loc[prev_row.index[0], '매출수량'] += num

        # Every remaining negative quantity becomes 0.
        data.loc[data['매출수량'] < 0, '매출수량'] = 0

        return data

    # Features for the days to be predicted.
    def make_variables_test(self, date, test_df, predict):
        """Build the feature frame for the ``predict`` days following ``date``.

        Args:
            date: last observed date (the final day of the input window).
            test_df: observed data used as history; supplies the menu keys and
                the past sales for the previous-week means.
            predict: number of future days to generate.

        Returns:
            A frame with one row per (future date, menu) and the same feature
            columns as the training features. 'prev_avg_*' is only filled for
            rows from a Sunday inside the future window onwards.
        """
        date = pd.to_datetime(date)
        future_dates = [date + timedelta(days = i + 1) for i in range(predict)]
        future_df = pd.DataFrame({'영업일자' : future_dates})

        menus_df = (test_df[['영업장명_메뉴명']].drop_duplicates().reset_index(drop = True))
        future_df = future_df.merge(menus_df, how='cross')

        kor_holidays = self.update_kor_holidays()

        # Basic calendar / holiday features.
        future_df = self.make_fund_variables(future_df)

        future_df['영업일자'] = pd.to_datetime(future_df['영업일자']).dt.normalize()

        # Sandwich score per date.
        future_df['is_sandwich'] = future_df['영업일자'].apply(lambda d: self.get_sandwich_score_for_dates(d, kor_holidays))

        # Holiday flag that also counts sandwich days.
        future_df['is_holiday_sandwich'] = future_df['is_holiday'].astype(int) | (future_df['is_sandwich'] > 0).astype(int)

        # Month weights.
        future_df = self.get_month_weights(future_df, monthly_weights)

        # Weekday weights.
        future_df = self.get_week_weights(future_df, weekly_weights)

        # Previous-week means, read from the observed test data.
        sundays =  future_df.loc[future_df['weekday'] == "Sunday", ['영업일자', '영업장명_메뉴명']].copy()
        for _, row in sundays.iterrows():
            date = row['영업일자']
            menu = row['영업장명_메뉴명']
            future_df = self.get_prev_days_mean(data = future_df, test_df = test_df, date = date, menu = menu, howmany = 7)
            future_df = self.get_prev_days_mean(data = future_df, test_df = test_df, date = date, menu = menu, howmany = 14)

        return future_df

In [None]:
# Build all derived training features in one call.
# NOTE(review): `data` is overwritten with the processed frame, so re-running
# this cell feeds already-processed data back into make_variables_train.
mv = Make_Variables()
data = mv.make_variables_train(data)

In [None]:
# Print the column sums of the holiday flag, the sandwich column, and the
# combined holiday-or-sandwich flag as a quick sanity check.
print("기존 공휴일 : ", data['is_holiday'].sum().item())
print("샌드위치 : ", data['is_sandwich'].sum().item())
print("샌드위치 포함 공휴일 : ", data['is_holiday_sandwich'].sum().item())

기존 공휴일 :  33389
샌드위치 :  579
샌드위치 포함 공휴일 :  33968


In [None]:
import pickle
# Persist the processed training frame; the next section reads this file back.
# NOTE(review): hard-coded absolute Drive path.
data.to_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data.pickle")

#### 저장된 데이터 불러오기

In [7]:
# Load the processed training frame saved earlier.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created or trust.
data = pd.read_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data.pickle")

In [27]:
# Preview the first two rows of the loaded frame.
data.head(2)

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,year,month,day,weekday,weekday_enc,season,year_enc,...,is_holiday,holiday_name,is_sandwich,is_holiday_sandwich,month_weight,week_weight,prev_avg_7,prev_avg_14,영업장명,메뉴명
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,2023,1,1,Sunday,6,Winter,0,...,1,New Year's Day,0,1,1.5,1.5,0.0,0.0,느티나무 셀프BBQ,1인 수저세트
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,2023,1,2,Monday,0,Winter,0,...,0,,0,0,1.5,0.8,0.0,0.0,느티나무 셀프BBQ,1인 수저세트


In [36]:
# Feature set using the raw calendar columns.
# NOTE(review): another cell further down also assigns cols / enc_cols /
# num_cols; whichever cell runs last wins, so confirm which set is intended.
cols = (
    ["year", "month", "day", "weekday", "season"]      # calendar
    + ["is_holiday", "is_holiday_sandwich"]            # holiday flags
    + ["month_weight", "week_weight"]                  # weights
    + ["prev_avg_7", "prev_avg_14"]                    # previous-days means
)
# Columns handed to the categorical encoders.
enc_cols = (
    ["year", "month", "weekday", "season"]
    + ["is_holiday", "is_holiday_sandwich"]
)
# month_weight, week_weight and prev_avg_7 go in without encoding
# (for classification and regression).
# For the LSTM it is safer to min-max scale them.
# NOTE(review): prev_avg_14 is in cols but not in num_cols, so it is not
# scaled for the LSTM — confirm whether that is intended.
num_cols = ["month_weight", "week_weight"] + ["prev_avg_7"]

In [21]:
# Feature set using the cyclical (sin/cos) calendar encodings.
# NOTE(review): enc_cols lists "month" and "weekday", which are not in cols —
# confirm whether that is intended.
cols = (
    ["year_enc"]
    + ["month_sin", "month_cos"]
    + ["day_sin", "day_cos"]
    + ["weekday_sin", "weekday_cos"]
    + ["season", "is_holiday", "is_holiday_sandwich"]
    + ["month_weight", "week_weight"]
    + ["prev_avg_7", "prev_avg_14"]
)
# Columns handed to the categorical encoders.
enc_cols = (
    ["year_enc", "month", "weekday", "season"]
    + ["is_holiday", "is_holiday_sandwich"]
)
# For the LSTM it is safer to min-max scale these numeric columns.
num_cols = ["month_weight", "week_weight"] + ["prev_avg_7", "prev_avg_14"]

#### Time Series CV

- 분류
    - 방법 : 시간 순으로 과거 데이터로 모델 적합 -> 미래 데이터로 성능 측정
    - 메트릭 : PR-AUC, F1
    - 목적 : threshold 튜닝
- 회귀
    - 방법은 똑같이
    - 메트릭 : MAE + SMAPE
    - 목적 : TODO (미정)

In [8]:
# For now, just a plain hold-out validation split.
# Distinct values of the store-name column.
stores = data['영업장명'].unique()

def make_validation(data, val_ratio = 0.2):
    """Split a frame into train / validation parts by date.

    The most recent ``val_ratio`` share of the distinct '영업일자' values
    (rounded down) becomes the validation set; all earlier dates form the
    training set.

    Args:
        data: DataFrame with an '영업일자' column.
        val_ratio: fraction of distinct dates to hold out (default 0.2).

    Returns:
        (train, validation) — two row subsets of ``data``. When there are too
        few distinct dates for even one validation date, validation is empty
        and train holds every row.
    """
    dates = sorted(data['영업일자'].unique())
    n = int(len(dates) * val_ratio)
    # Guard n == 0: dates[-0:] is the WHOLE list, which would put every row
    # into validation and leave train empty.
    val_dates = dates[-n : ] if n > 0 else []
    val_mask = data['영업일자'].isin(val_dates)
    train = data[~val_mask]
    validation = data[val_mask]
    return train, validation

#### 매출 여부 (분류)

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
! pip install category_encoders
from category_encoders import TargetEncoder
from collections import defaultdict
import pickle
import joblib

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [10]:
class ClassificationModel():
    """Per-menu binary classifiers predicting whether a menu sells at all.

    One XGBClassifier and one TargetEncoder are fitted for every value of
    '영업장명_메뉴명'; the target column is '매출_여부'.
    """

    def __init__(self, data = None, cols = None, enc_cols = None, model_path = None):
        """Store the optional configuration on the instance."""
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        # Previously accepted but dropped; keep it so it can be read back.
        self.model_path = model_path

    def fit_model_by_menu(self, train_cls, validation_cls, cols, enc_cols):
        """Fit one encoder + classifier per menu.

        Args:
            train_cls: training frame with '영업장명_메뉴명', '매출_여부' and
                every column named in ``cols`` / ``enc_cols``.
            validation_cls: validation frame. Currently unused — no
                validation metric is computed here; kept so existing calls
                keep working.
            cols: feature columns fed to the model.
            enc_cols: columns passed through the target encoder.

        Returns:
            dict mapping menu key -> {"model": fitted XGBClassifier,
            "encoder": fitted TargetEncoder}. Menus whose target has a single
            class are skipped (a message is printed).
        """
        models = {}

        for menu, group_df in train_cls.groupby("영업장명_메뉴명"):

            # A target with a single class cannot be learned.
            if group_df['매출_여부'].nunique() < 2:
                print(f"{menu} 학습 불가")
                continue

            # Work on a private copy: the group is a slice of the caller's
            # frame, and assigning into it triggers SettingWithCopyWarning.
            group_df = group_df.copy()

            # Encode the categorical columns against the target.
            target_encoder = TargetEncoder()
            group_df[enc_cols] = target_encoder.fit_transform(group_df[enc_cols], group_df['매출_여부'])

            # Split features and target.
            x_train = group_df[cols]
            y_train = group_df["매출_여부"]

            # Fixed seed for reproducible fits.
            xgb_model = XGBClassifier(random_state = 1471)
            xgb_model.fit(x_train, y_train)

            models[menu] = {
                "model" : xgb_model,
                "encoder" : target_encoder
            }

        return models

    def save_cls_model(self, models, model_path):
        """Serialize the fitted model bundle to ``model_path`` with joblib."""
        joblib.dump(models, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        """Load a model bundle written by ``save_cls_model``.

        NOTE: joblib files are pickles — only load files from a trusted source.
        """
        models = joblib.load(model_path)
        return models

In [37]:
# Train and save the per-menu classifiers.
classification = ClassificationModel()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/cls_models.pkl'

# Binary target: 1 when the quantity sold is positive, else 0.
data_zero = data.copy()
data_zero['매출_여부'] = data_zero['매출수량'].apply(lambda x:1 if x > 0 else 0)
train_cls, validation_cls = make_validation(data_zero) # TODO: to be revised later

# The training split is replaced by the full data set, so the models below
# are also fitted on the rows that make up validation_cls.
train_cls = data_zero

models = classification.fit_model_by_menu(train_cls, validation_cls, cols, enc_cols)
classification.save_cls_model(models, model_path)

모델 저장 완료!


In [38]:
# Reload the saved classification models.
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/cls_models.pkl'
classification = ClassificationModel()
models_class = classification.load_saved_model(model_path)

#### 매출 예측 (회귀)

In [12]:
from xgboost import XGBRegressor
# ! pip install category_encoders
from category_encoders import TargetEncoder
from collections import defaultdict
import pickle
import joblib

In [13]:
class RegressionModel():
    """Per-menu regressors predicting the quantity sold.

    One XGBRegressor and one TargetEncoder are fitted for every value of
    '영업장명_메뉴명'; the target column is '매출수량'.
    """

    def __init__(self, data = None, cols = None, enc_cols = None, model_path = None):
        """Store the optional configuration on the instance."""
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        # Previously accepted but dropped; keep it so it can be read back.
        self.model_path = model_path

    def fit_model_by_menu(self, train_reg, validation_reg, cols, enc_cols):
        """Fit one encoder + regressor per menu.

        Args:
            train_reg: training frame with '영업장명_메뉴명', '매출수량' and
                every column named in ``cols`` / ``enc_cols``.
            validation_reg: validation frame. Currently unused — no
                validation metric is computed here; kept so existing calls
                keep working.
            cols: feature columns fed to the model.
            enc_cols: columns passed through the target encoder.

        Returns:
            dict mapping menu key -> {"model": fitted XGBRegressor,
            "encoder": fitted TargetEncoder}. Menus with fewer than 10 rows
            are skipped (a message is printed).
        """
        models = {}

        for menu, group_df in train_reg.groupby("영업장명_메뉴명"):

            # Too few rows to fit anything meaningful.
            if len(group_df) < 10:
                print(f"{menu} 학습 불가")
                continue

            # Work on a private copy: the group is a slice of the caller's
            # frame, and assigning into it triggers SettingWithCopyWarning.
            group_df = group_df.copy()

            # Encode the categorical columns against the target.
            target_encoder = TargetEncoder()
            group_df[enc_cols] = target_encoder.fit_transform(group_df[enc_cols], group_df['매출수량'])

            # Split features and target.
            x_train = group_df[cols]
            y_train = group_df["매출수량"]

            # Fixed seed for reproducible fits.
            xgb_model = XGBRegressor(random_state = 1471)
            xgb_model.fit(x_train, y_train)

            models[menu] = {
                "model" : xgb_model,
                "encoder" : target_encoder
            }

        return models

    def save_reg_model(self, models, model_path):
        """Serialize the fitted model bundle to ``model_path`` with joblib."""
        joblib.dump(models, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        """Load a model bundle written by ``save_reg_model``.

        NOTE: joblib files are pickles — only load files from a trusted source.
        """
        models = joblib.load(model_path)
        return models

In [39]:
# Train and save the per-menu regressors.
# NOTE(review): this path uses the "Trial 2" folder while the classification
# models are saved under "Trial 3" — confirm the folders are intended.
regression = RegressionModel()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/reg_models.pkl'

# Regress only on rows with a positive quantity.
data_notzero = data[data['매출수량'] > 0]
train_reg, validation_reg = make_validation(data_notzero)

# The training split is replaced by the full positive-quantity data, so the
# models below are also fitted on the rows that make up validation_reg.
train_reg = data_notzero

models = regression.fit_model_by_menu(train_reg, validation_reg, cols, enc_cols)
regression.save_reg_model(models, model_path)

모델 저장 완료!


In [40]:
# Reload the saved regression models.
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/reg_models.pkl'
regression = RegressionModel()
models_reg = regression.load_saved_model(model_path)

#### 매출 예측 (시계열)

In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import random
import glob
import joblib

import torch
import torch.nn as nn
from tqdm import tqdm

In [16]:
### Random Seed & Parameters
def set_seed(seed = 1471):
    """Seed Python, NumPy and PyTorch RNGs and pin the hash seed variable.

    When CUDA is available the CUDA generators are seeded as well and cuDNN
    is switched to deterministic, non-benchmarking mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python's random, NumPy's legacy global RNG, and the torch CPU generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure without CUDA.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(1471)

In [17]:
# All LSTM preprocessing lives here.
class Preprocess_LSTM():
    """Preprocessing helpers for the LSTM: label encoding and target scaling."""

    def __init__(self, data = None, cols = None, enc_cols = None, scaler = None):
        """Store the optional configuration on the instance."""
        self.data = data
        self.enc_cols = enc_cols
        self.cols = cols
        self.scaler = scaler

    # Categorical columns are label-encoded.
    def label_encoding_lstm(self, data, enc_cols):
        """Label-encode every object / bool / category column listed in ``enc_cols``.

        Columns of any other dtype are left untouched. ``data`` is mutated in
        place and returned. The fitted encoders are not kept.
        """
        for col in enc_cols:
            if data[col].dtype == 'object' or data[col].dtype.name == 'bool' or data[col].dtype.name == 'category':
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
        return data

    # The sales quantity is min-max scaled.
    def minmax_scaling_target(self, data, scaler):
        """Min-max scale the '매출수량' column.

        Args:
            data: frame containing '매출수량' (mutated in place).
            scaler: None to create and fit a new MinMaxScaler; an unfitted
                scaler to fit on this data; or an already fitted scaler,
                which is applied as-is.

        Returns:
            (data, scaler) — the frame and the scaler that was used.
        """
        if scaler is None:
            scaler = MinMaxScaler()

        # A fitted scaler exposes `scale_`. Reuse its parameters instead of
        # refitting, otherwise a scaler learned on training data would be
        # silently overwritten with the statistics of the new data.
        if hasattr(scaler, "scale_"):
            data['매출수량'] = scaler.transform(data[['매출수량']])
        else:
            data['매출수량'] = scaler.fit_transform(data[['매출수량']])

        return data, scaler

In [18]:
class MultiOutputLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that emits ``output_dim`` values at once."""

    def __init__(self, input_dim = 1, hidden_dim = 128, num_layers = 4, output_dim = 7):
        """Build the network; ``output_dim`` is the number of values predicted per sequence (7 by default)."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size = input_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
            batch_first = True,
        )
        self.fc = nn.Linear(in_features = hidden_dim, out_features = output_dim)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the final time step through the linear head."""
        sequence_out, _state = self.lstm(x)
        # Only the last time step feeds the head -> (batch, output_dim).
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [19]:
class LSTMModel():
    """Trains, saves and loads one MultiOutputLSTM per '영업장명_메뉴명' group."""

    def __init__(self, data = None, cols = None, enc_cols = None, num_cols = None, scaler = None, lookback = 28, predict = 7, device = "cuda", epochs = 200, batch_size = 16):
        """Store the configuration.

        ``lookback`` is the number of past days in each input window and
        ``predict`` the number of future days predicted from it.
        """
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        # Previously accepted but dropped; keep it so it can be read back.
        self.num_cols = num_cols
        self.scaler = scaler
        self.lookback = lookback
        self.predict = predict
        self.device = torch.device(device)
        self.epochs = epochs
        self.batch_size = batch_size

    # enc_cols are label-encoded.
    def label_encoding_lstm(self, data, enc_cols):
        """Label-encode every object / bool / category column listed in ``enc_cols``.

        Returns:
            (data, encoders) — the mutated frame and a dict mapping each
            encoded column name to its fitted LabelEncoder.
        """
        encoders = {}
        for col in enc_cols:
            if data[col].dtype == 'object' or data[col].dtype.name == 'bool' or data[col].dtype.name == 'category':
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
                encoders[col] = le
        return data, encoders

    def minmax_scaling_features(self, data, num_cols):
        """Fit a MinMaxScaler on ``num_cols`` and scale them in place; returns (data, scaler)."""
        scaler = MinMaxScaler()
        data[num_cols] = scaler.fit_transform(data[num_cols])
        return data, scaler

    # The sales quantity is min-max scaled.
    def minmax_scaling_target(self, data):
        """Fit a MinMaxScaler on '매출수량' and scale it in place; returns (data, scaler)."""
        scaler = MinMaxScaler()
        data['매출수량'] = scaler.fit_transform(data[['매출수량']])
        return data, scaler

    def train_lstm(self, train_df, cols, enc_cols, num_cols, device, epochs, batch_size):
        """Train one LSTM per store_menu group.

        Each group is sorted by date, label-encoded, min-max scaled (features
        and target separately) and cut into sliding windows of
        ``self.lookback`` input days followed by ``self.predict`` target days.

        Args:
            train_df: frame with '영업장명_메뉴명', '영업일자', '매출수량' and
                every column in ``cols``.
            cols: feature columns fed to the network.
            enc_cols: columns to label-encode.
            num_cols: columns to min-max scale.
            device: torch device the tensors and model are placed on.
            epochs: number of passes over the windows.
            batch_size: mini-batch size.

        Returns:
            dict mapping group key -> {'model', 'encoders', 'features_scaler',
            'target_scaler', 'last_sequence'}. Groups with fewer than
            ``lookback + predict`` rows are skipped.
        """
        trained_models = {}

        # NOTE(review): grouping by a one-element list makes recent pandas
        # versions yield 1-tuples as keys, so trained_models may be keyed by
        # ('store_menu',) rather than the plain string — confirm against the
        # code that looks models up.
        for store_menu, group in tqdm(train_df.groupby(["영업장명_메뉴명"]), desc = "Training LSTM"):

            # Sort by date; skip groups too short for a single window.
            store_train = group.sort_values("영업일자").copy()
            if len(store_train) < self.lookback + self.predict:
                continue

            # x - label-encode the categorical columns.
            store_train, encoders = self.label_encoding_lstm(store_train, enc_cols)

            # x - scale the numeric columns.
            store_train, features_scaler = self.minmax_scaling_features(store_train, num_cols)

            # y - scale the sales quantity.
            store_train, target_scaler = self.minmax_scaling_target(store_train)

            train_vals = store_train[cols].values
            target_vals = store_train["매출수량"].values

            # Sliding windows: `lookback` days of features -> next `predict` targets.
            n_windows = len(train_vals) - self.lookback - self.predict + 1
            x_windows = [train_vals[i : i + self.lookback] for i in range(n_windows)]
            y_windows = [target_vals[i + self.lookback : i + self.lookback + self.predict] for i in range(n_windows)]

            # Stack into one float32 array first: building a tensor from a
            # Python list of ndarrays is very slow and emits a warning.
            x_train = torch.tensor(np.asarray(x_windows, dtype = np.float32)).to(device)
            y_train = torch.tensor(np.asarray(y_windows, dtype = np.float32)).to(device)

            # A fresh model per store_menu.
            model = MultiOutputLSTM(input_dim = len(cols), output_dim = self.predict).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
            criterion = nn.MSELoss()

            model.train()

            for epoch in range(epochs):
                # Visit the windows in a new random order every epoch.
                idx = torch.randperm(len(x_train))
                for i in range(0, len(x_train), batch_size):
                    batch_idx = idx[i:i+batch_size]
                    x_batch, y_batch = x_train[batch_idx], y_train[batch_idx]
                    output = model(x_batch)
                    loss = criterion(output, y_batch)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            trained_models[store_menu] = {
                'model': model.eval(),
                'encoders' : encoders,
                'features_scaler' : features_scaler,
                'target_scaler': target_scaler,
                'last_sequence': train_vals[-self.lookback:]  # shape (lookback, len(cols))
            }

        return trained_models

    def save_lstm_model_gpu(self, models, model_path):
        """Serialize the model bundles as they are (on their current device) with joblib."""
        joblib.dump(models, model_path)
        print("GPU 버전 모델 저장 완료!")

    def save_lstm_model_cpu(self, models, model_path):
        """Serialize CPU copies of the model bundles with joblib.

        The networks are deep-copied before being moved: ``Module.to`` moves
        a module in place, so moving the originals would silently leave the
        caller's in-memory models on the CPU.
        """
        import copy

        cpu_models = {}
        for k, bundle in models.items():
            cpu_models[k] = {
                'model': copy.deepcopy(bundle['model']).to('cpu').eval(),  # only the network needs moving
                'encoders': bundle['encoders'],
                'features_scaler': bundle['features_scaler'],
                'target_scaler': bundle['target_scaler'],
                'last_sequence': bundle['last_sequence']
            }
        joblib.dump(cpu_models, model_path)
        print("CPU 버전 모델 저장 완료!")


    def load_saved_model(self, model_path):
        """Load model bundles written by one of the save methods.

        NOTE: joblib files are pickles — only load files from a trusted source.
        """
        models = joblib.load(model_path)
        return models

In [41]:
# Prepare the data: feature columns plus target, date and group key.
features = cols + ["매출수량", "영업일자", "영업장명_메뉴명"]
dataset_lstm = data[features]

lstm = LSTMModel(lookback = 28, predict = 7)
trainset_lstm, validationset_lstm = make_validation(dataset_lstm)

# The training split is replaced by the full data set, so the models below
# are also trained on the rows that make up validationset_lstm.
trainset_lstm = dataset_lstm

# NOTE: lookback / predict here are plain variables; the values the model
# actually uses are the ones passed to LSTMModel(...) above.
lookback, predict, batch_size, epochs = 28, 7, 16, 200
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_lstm = lstm.train_lstm(train_df = trainset_lstm, cols = cols, enc_cols = enc_cols, num_cols = num_cols,
                               device = device, epochs = epochs, batch_size = batch_size)

# Save the models once as trained and once as a CPU version.
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/lstm_models_gpu.pkl'
lstm.save_lstm_model_gpu(trained_lstm, model_path)

model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/lstm_models_cpu.pkl'
lstm.save_lstm_model_cpu(trained_lstm, model_path)

Training LSTM: 100%|██████████| 193/193 [1:10:33<00:00, 21.93s/it]


GPU 버전 모델 저장 완료!
CPU 버전 모델 저장 완료!


In [42]:
# Load the saved models - GPU version.
lookback, predict, batch_size, epochs = 28, 7, 16, 200
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/lstm_models_gpu.pkl'
lstm = LSTMModel(lookback = 28, predict = 7)
trained_lstm = lstm.load_saved_model(model_path)

In [None]:
# Load the saved models - CPU version.
# NOTE: this rebinds `trained_lstm`, replacing whatever the GPU-loading cell put there.
lookback, predict, batch_size, epochs = 28, 7, 16, 200
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 2/lstm_models_cpu.pkl'
lstm = LSTMModel(lookback = 28, predict = 7)
trained_lstm = lstm.load_saved_model(model_path)

#### 예측하기

In [43]:
class PredictionFunctions():
    """Per store/menu prediction helpers for the classifier, regressor and LSTM models.

    Every ``predict_*`` method groups ``test_df`` by the '영업장명_메뉴명' column,
    skips groups that have no trained model, and returns one row per
    (prediction label, store/menu) pair, where the label is
    ``f"{test_prefix}+{k}일"`` for ``k = 1 .. predict``.
    """

    def __init__(self, test_df = None, trained_models = None, test_prefix = None, cols = None, enc_cols = None, lookback = 28, predict = 7):
        # Stored for convenience only: the predict_* methods take all of their
        # inputs as explicit arguments and do not read these attributes.
        self.test_df = test_df
        self.trained_models = trained_models
        self.test_prefix = test_prefix
        self.cols = cols
        self.enc_cols = enc_cols
        self.lookback = lookback
        self.predict = predict

    @staticmethod
    def _lookup_bundle(trained_models, store_menu):
        """Return the trained bundle for ``store_menu`` or None when there is none.

        Accepts mappings keyed by the plain name as well as by the 1-tuple
        ``(name,)`` that ``groupby([...])`` produces on recent pandas versions.
        """
        for key in (store_menu, (store_menu,)):
            if key in trained_models:
                return trained_models[key]
        return None

    @staticmethod
    def _model_device(model):
        """Return the device of the model's first parameter (CPU if it has none)."""
        parameters = getattr(model, "parameters", None)
        if callable(parameters):
            first_param = next(iter(parameters()), None)
            if first_param is not None:
                return first_param.device
        return torch.device("cpu")

    def _predict_tabular(self, test_df, trained_models, test_prefix, cols, enc_cols, predict, value_col):
        """Shared body of predict_class / predict_reg; predictions go into ``value_col``."""
        results = []
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

        # Grouping by the column name (not a one-element list) always yields
        # scalar group keys, whatever the pandas version.
        for store_menu, store_test in test_df.groupby('영업장명_메뉴명'):
            # Only continue for menus that have a trained model.
            bundle = self._lookup_bundle(trained_models, store_menu)
            if bundle is None:
                continue

            model = bundle["model"]
            encoder = bundle["encoder"]

            # Work on a copy so the caller's frame (and the groupby slice) is
            # not mutated by the datetime conversion.
            store_test = store_test.copy()
            store_test['영업일자'] = pd.to_datetime(store_test['영업일자'])
            last_date = store_test.sort_values('영업일자')['영업일자'].iloc[-1]

            # Build the feature rows for the days following the last observed date.
            mv = Make_Variables()
            future_df = mv.make_variables_test(date = last_date, test_df = store_test, predict = predict)
            encoded = encoder.transform(future_df[enc_cols])
            future_df[enc_cols] = pd.DataFrame(encoded, columns = enc_cols, index = future_df.index)
            future_df = future_df[cols]

            for d, val in zip(pred_dates, model.predict(future_df)):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    value_col: val
                })

        # Explicit columns keep the (possibly empty) result concat/merge friendly.
        return pd.DataFrame(results, columns = ['영업일자', '영업장명_메뉴명', value_col])

    def predict_class(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, lookback = 28, predict = 7):
        """
        Predict the sales flag for the next ``predict`` days of every store/menu.

        Input : test_df - test data, trained_models - mapping menu -> {"model", "encoder"},
                test_prefix - label prefix (e.g. "TEST_00"), cols - model input columns,
                enc_cols - columns passed through the encoder,
                lookback - unused, kept for signature compatibility
        Output : [영업일자, 영업장명_메뉴명, 매출여부] DataFrame
        """
        return self._predict_tabular(test_df, trained_models, test_prefix, cols, enc_cols, predict, '매출여부')

    def predict_reg(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, lookback = 28, predict = 7):
        """
        Predict the sales quantity for the next ``predict`` days of every store/menu.

        Input : test_df - test data, trained_models - mapping menu -> {"model", "encoder"},
                test_prefix - label prefix (e.g. "TEST_00"), cols - model input columns,
                enc_cols - columns passed through the encoder,
                lookback - unused, kept for signature compatibility
        Output : [영업일자, 영업장명_메뉴명, 매출수량] DataFrame
        """
        return self._predict_tabular(test_df, trained_models, test_prefix, cols, enc_cols, predict, '매출수량')

    def predict_lstm(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, num_cols : list, lookback = 28, predict = 7, device = None):
        """
        Predict the sales quantity with the per-menu LSTM models.

        Input : test_df - test data,
                trained_models - mapping menu -> {"model", "encoders", "features_scaler", "target_scaler"}
                                 (keys may be plain names or 1-tuples),
                test_prefix - label prefix (e.g. "TEST_00"), cols - model input columns,
                enc_cols - categorical columns, num_cols - columns scaled by features_scaler,
                lookback - number of most recent rows fed to the model,
                predict - number of days to predict,
                device - torch device for inference; when None it is inferred from the model
        Output : [영업일자, 영업장명_메뉴명, 매출수량(lstm)] DataFrame; 영업장명_메뉴명 is always the
                 plain name and negative predictions are clipped to 0
        Raises : ValueError when a model returns fewer than ``predict`` values
        """
        results = []
        features = cols + ["매출수량"]
        target_idx = features.index("매출수량")
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

        # Predict store/menu group by store/menu group.
        for store_menu, store_test in test_df.groupby('영업장명_메뉴명'):
            # Only continue for menus that have a trained model.
            bundle = self._lookup_bundle(trained_models, store_menu)
            if bundle is None:
                continue

            # Model, encoders and scalers for this menu.
            model = bundle['model']
            encoders = bundle['encoders']
            features_scaler = bundle['features_scaler']
            target_scaler = bundle['target_scaler']

            # Take the most recent `lookback` rows as the LSTM input.
            mv = Make_Variables()
            # NOTE(review): the original author flagged this call as "changed, not sure it is right".
            store_test = mv.make_variables_train(data = store_test)
            store_test_sorted = store_test.sort_values('영업일자')

            if len(store_test_sorted) < lookback:
                continue # not enough history for this menu: skip it

            recent_df = store_test_sorted[features].iloc[-lookback:].copy()

            # Append the future feature rows. They go through encoding/scaling
            # together with the history, but only the first `lookback` rows are
            # fed to the model below.
            last_date = store_test_sorted['영업일자'].iloc[-1]
            recent_df_for_mv = store_test_sorted[features + ['영업장명_메뉴명', '영업일자']].iloc[-lookback:].copy()
            future_df = mv.make_variables_test(date = last_date, test_df = recent_df_for_mv, predict = predict)
            future_df['매출수량'] = 0.0
            full_df = pd.concat([recent_df, future_df[features]], axis = 0)

            # Encoding and scaling.
            for col in enc_cols:
                if col not in full_df.columns:
                    continue
                if col in encoders:
                    le = encoders[col]
                    full_df[col] = le.transform(full_df[col])
                else:
                    full_df[col] = full_df[col].astype(int)
            full_df[num_cols] = features_scaler.transform(full_df[num_cols])

            # (1, lookback, n_features) float32 batch on the model's device.
            x_window = np.ascontiguousarray(full_df[cols].values[:lookback], dtype = np.float32)
            run_device = device if device is not None else self._model_device(model)
            x_input = torch.from_numpy(x_window).unsqueeze(0).to(run_device)

            # Run the model; flatten so a single-step horizon stays 1-D.
            with torch.no_grad():
                pred_scaled = model(x_input).detach().cpu().numpy().reshape(-1)

            if len(pred_scaled) < predict:
                raise ValueError(
                    f"model for {store_menu!r} returned {len(pred_scaled)} values, expected at least {predict}"
                )
            pred_scaled = pred_scaled[:predict]

            # Inverse scaling: put the predictions in the target column of a
            # zero matrix, invert once, then clip negatives to 0.
            dummy = np.zeros((predict, len(features)))
            dummy[:, target_idx] = pred_scaled
            restored = np.maximum(target_scaler.inverse_transform(dummy)[:, target_idx], 0)

            for d, val in zip(pred_dates, restored):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출수량(lstm)': val
                })

        # Explicit columns keep the (possibly empty) result concat/merge friendly.
        return pd.DataFrame(results, columns = ['영업일자', '영업장명_메뉴명', '매출수량(lstm)'])

#### 예측값 생성

In [44]:
# Run the classifier, the regressor and the LSTM on every test file and
# collect the per-file prediction frames.
import re
all_preds_class = []
all_preds_reg = []
all_preds_lstm = []


# Iterate over every TEST_*.csv file (sorted by path).
test_files = sorted(glob.glob('DATA/test/TEST_*.csv'))
predictions = PredictionFunctions()

for path in test_files:
    test_df = pd.read_csv(path)

    # Extract the prefix from the file name (e.g. TEST_00).
    # NOTE(review): re.search returns None for a file name without digits after
    # "TEST_", in which case .group(1) raises AttributeError.
    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    # First run the classification model.
    pred_class = predictions.predict_class(test_df, models_class, test_prefix, cols, enc_cols)
    all_preds_class.append(pred_class)

    # Then the regression model. It is run for every file here; its output is
    # not gated on the classifier result inside this loop.
    pred_reg = predictions.predict_reg(test_df, models_reg, test_prefix, cols, enc_cols)
    all_preds_reg.append(pred_reg)

    # Then the LSTM model.
    pred_lstm = predictions.predict_lstm(test_df, trained_lstm, test_prefix, cols, enc_cols, num_cols)
    all_preds_lstm.append(pred_lstm)

    # Combine (weights: roughly 1.5 / 8.5 for now) - no combining is done in this loop.

# One frame per model, covering all test files.
df_class = pd.concat(all_preds_class, ignore_index=True)
df_reg   = pd.concat(all_preds_reg, ignore_index=True)
df_lstm  = pd.concat(all_preds_lstm, ignore_index=True)

  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _VF.lstm(
  result = _V

In [45]:
# Join classifier and regressor predictions on (date label, store/menu) and
# mark the regressor's quantity column as such.
full_pred_df = df_class.merge(
    df_reg, on=['영업일자', '영업장명_메뉴명'], how='outer'
).rename(columns={'매출수량': '매출수량(reg)'})

# The LSTM frame may carry 1-tuple store/menu keys; unwrap them to plain values.
df_lstm_plz = df_lstm.assign(**{
    '영업장명_메뉴명': df_lstm['영업장명_메뉴명'].apply(lambda x: x[0] if isinstance(x, tuple) else x)
})

# Outer-join the LSTM predictions so rows missing on either side are kept.
full_pred_df = df_lstm_plz.merge(full_pred_df, on=['영업일자', '영업장명_메뉴명'], how='outer')
full_pred_df.head()

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량(lstm),매출여부,매출수량(reg)
0,TEST_00+1일,느티나무 셀프BBQ_1인 수저세트,5.305065,1,10.249413
1,TEST_00+1일,느티나무 셀프BBQ_BBQ55(단체),26.588142,0,69.521263
2,TEST_00+1일,"느티나무 셀프BBQ_대여료 30,000원",5.006311,1,5.589658
3,TEST_00+1일,"느티나무 셀프BBQ_대여료 60,000원",2.70886,1,0.774103
4,TEST_00+1일,"느티나무 셀프BBQ_대여료 90,000원",0.500561,1,1.257615


In [46]:
# Final quantity: keep the LSTM value unless the classifier predicted a sale
# (매출여부 == 1), in which case blend 10% regressor with 90% LSTM.
# NOTE(review): a NaN on either side of the blend (possible after the outer
# merges) propagates into the result.
full_pred_df['매출수량'] = full_pred_df['매출수량(lstm)'].where(
    full_pred_df['매출여부'] != 1,
    full_pred_df['매출수량(reg)'] * 0.1 + full_pred_df['매출수량(lstm)'] * 0.9,
)

# Drop the intermediate columns; only the blended quantity is kept.
full_pred_df = full_pred_df.drop(columns=['매출여부', '매출수량(reg)', '매출수량(lstm)'])
full_pred_df

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량
0,TEST_00+1일,느티나무 셀프BBQ_1인 수저세트,5.799500
1,TEST_00+1일,느티나무 셀프BBQ_BBQ55(단체),26.588142
2,TEST_00+1일,"느티나무 셀프BBQ_대여료 30,000원",5.064646
3,TEST_00+1일,"느티나무 셀프BBQ_대여료 60,000원",2.515384
4,TEST_00+1일,"느티나무 셀프BBQ_대여료 90,000원",0.576267
...,...,...,...
13505,TEST_09+7일,화담숲카페_메밀미숫가루,55.975263
13506,TEST_09+7일,화담숲카페_아메리카노 HOT,23.070299
13507,TEST_09+7일,화담숲카페_아메리카노 ICE,50.579588
13508,TEST_09+7일,화담숲카페_카페라떼 ICE,13.581158


In [47]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    """Pivot long-format predictions into the wide sample-submission layout.

    Args:
        pred_df: Long frame with the columns '영업일자' (date label),
            '영업장명_메뉴명' (store/menu name) and '매출수량' (predicted quantity).
        sample_submission: Template frame with a '영업일자' column; every column
            after the first one is treated as a store/menu column.

    Returns:
        A copy of ``sample_submission`` whose menu columns are float and hold
        the prediction for each (date label, menu) pair. Pairs without a
        prediction, or whose prediction is NaN, are filled with 0. When the
        same pair occurs several times, the last non-NaN value wins.
    """
    # NaN predictions (e.g. left behind by outer merges) count as missing, so
    # they fall back to 0 instead of being written into the submission.
    valid = pred_df[pred_df['매출수량'].notna()]

    # (date label, menu) -> quantity lookup
    pred_dict = dict(zip(
        zip(valid['영업일자'], valid['영업장명_메뉴명']),
        valid['매출수량'].astype(float)
    ))

    final_df = sample_submission.copy()
    dates = final_df['영업일자'].tolist()

    # Fill one whole column at a time rather than one cell at a time.
    for col in final_df.columns[1:]:  # menu names
        final_df[col] = [float(pred_dict.get((date, col), 0.0)) for date in dates]

    return final_df

In [35]:
# Write the blended (hybrid) predictions in the submission layout.
sample_submission = pd.read_csv(filepath_or_buffer='DATA/sample_submission.csv')
final_hybrid = convert_to_submission_format(
    pred_df=full_pred_df,
    sample_submission=sample_submission,
)
final_hybrid.to_csv(
    path_or_buf='baseline_submission_hybrid.csv',
    index=False,
    encoding='utf-8-sig',
)

In [None]:
# Write the LSTM-only predictions in the submission layout; the LSTM quantity
# column is renamed to the name the converter expects.
df_lstm_end = df_lstm_plz.rename(columns={'매출수량(lstm)': '매출수량'})
sample_submission = pd.read_csv(filepath_or_buffer='DATA/sample_submission.csv')
final_lstm = convert_to_submission_format(
    pred_df=df_lstm_end,
    sample_submission=sample_submission,
)
final_lstm.to_csv(
    path_or_buf='baseline_submission_lstm.csv',
    index=False,
    encoding='utf-8-sig',
)

In [48]:
# Variant of the hybrid submission in which (near-)zero predictions are set to 1.
full_pred_df_notzero = full_pred_df.assign(**{
    '매출수량': full_pred_df['매출수량'].mask(full_pred_df['매출수량'].abs() < 1e-9, 1)
})

sample_submission = pd.read_csv(filepath_or_buffer='DATA/sample_submission.csv')
final_hybrid = convert_to_submission_format(
    pred_df=full_pred_df_notzero,
    sample_submission=sample_submission,
)
final_hybrid.to_csv(
    path_or_buf='baseline_submission_hybrid_notzero.csv',
    index=False,
    encoding='utf-8-sig',
)