In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import datetime
from datetime import timedelta
# !pip install holidays
import holidays

In [2]:
# Switch the working directory to the project folder so the relative path below resolves.
os.chdir("/content/drive/MyDrive/3. Grad School/LG Aimers")

# Load the raw training data
data = pd.read_csv("DATA/train/train.csv")

#### 파생변수 생성

In [3]:
# Month number -> season name lookup, expanded from a per-season month listing
month_to_season = {
    month: season
    for season, months in (
        ("Winter", (1, 2, 12)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
    )
    for month in months
}

In [4]:
# Weight per calendar month (January first, December last)
monthly_weights = dict(zip(
    range(1, 13),
    [1.5, 1.5, 0.5,
     0.8, 0.8, 0.8,
     1.2, 1.2, 0.8,
     0.8, 0.8, 1.2],
))

In [5]:
# Weight per weekday, keyed by the English day name produced by `Series.dt.day_name()`
weekly_weights = dict(
    Monday = 0.8,
    Tuesday = 0.8,
    Wednesday = 0.8,
    Thursday = 0.8,
    Friday = 1.2,
    Saturday = 1.7,
    Sunday = 1.5,
)

In [6]:
class Make_Variables():
    """Builds calendar, holiday, weighting and lagged-sales features for the menu-sales frames.

    The frames handled here carry the columns '영업일자' (business date),
    '영업장명_메뉴명' (venue_menu key) and, where sales are read, '매출수량' (units sold).
    """

    def __init__(self, data = None, date = None, predict = 7, month_to_season = None, monthly_weights = None, weekly_weights = None):
        """Store optional defaults.

        Mappings left as None fall back to the notebook-level dictionaries of the same name
        when `make_variables_train` / `make_variables_test` run.
        """
        self.data = data
        self.date = date
        self.predict = predict
        self.month_to_season = month_to_season
        self.monthly_weights = monthly_weights
        self.weekly_weights = weekly_weights

    def _mapping(self, name, fallback):
        """Return the mapping given to the constructor under `name`, or `fallback` if none was given."""
        chosen = getattr(self, name, None)
        return fallback if chosen is None else chosen

    def update_kor_holidays(self):
        """Return the Korean holiday calendar for 2023-2025 extended with extra one-off days."""
        kor_holidays = holidays.KR(years = [2023, 2024, 2025])
        # Commercial days (Valentine's Day 2/14, White Day 3/14, Pepero Day 11/11) are
        # currently not registered as holidays.
        kor_holidays.update({
            datetime.date(2024,10,1) : "Temporary Holiday", # Armed Forces Day temporary holiday
            datetime.date(2025,1,27) : "Temporary Holiday", # Lunar New Year temporary holiday
            datetime.date(2025,3,3) : "Temporary Holiday", # substitute holiday for March 1st
            datetime.date(2025, 5, 29) : "Election Period",
            datetime.date(2025, 5, 30) : "Election Period",
            datetime.date(2025, 6, 3) : "Presidential Election Day"})
        return kor_holidays

    def check_holidays(self, date, kor_holidays):
        """Tell whether `date` is a public holiday or a weekend day.

        date : a datetime Series (element-wise check) or a single timestamp-like object
        kor_holidays : container of holiday dates, e.g. the result of `update_kor_holidays`
        Returns a boolean Series for Series input, otherwise a single bool.
        """
        if isinstance(date, pd.Series):
            check_holiday = date.dt.date.isin(kor_holidays)
            check_weekend = date.dt.weekday >= 5
        else:
            check_holiday = date.date() in kor_holidays
            check_weekend = date.weekday() >= 5
        is_holiday = (check_holiday | check_weekend)
        return is_holiday

    def get_sandwich_score(self, idx, is_holiday_col) -> int:
        """Sandwich score of position `idx` computed from a 0/1 holiday column.

        3 : both direct neighbours are holidays
        2 / 1 : three / two holidays among the four days within two positions
        0 : otherwise (also for positions too close to either end)
        """
        score = 0
        # Direct neighbours on both sides are holidays -> 3
        if idx > 0 and idx < len(is_holiday_col) - 1:
            if (is_holiday_col[idx - 1] == 1) and (is_holiday_col[idx + 1] == 1):
                score = 3
        # Otherwise count holidays within two positions, excluding the day itself
        if score == 0 and idx > 1 and idx < len(is_holiday_col) - 2:
            start_idx = idx - 2
            end_idx = idx + 2
            nearby_holidays = is_holiday_col[start_idx : end_idx + 1].sum() - is_holiday_col[idx]
            if nearby_holidays == 3:
                score = 2
            elif nearby_holidays == 2:
                score = 1
        return score

    def get_sandwich_score_for_dates(self, date, kor_holidays) -> int:
        """Sandwich score of a single date, looking up its neighbours in the calendar.

        Uses the same 3 / 2 / 1 / 0 scale as `get_sandwich_score`; whether `date` itself
        is a holiday is not considered.
        """
        # One day on each side
        prev_date, next_date = date - timedelta(days = 1), date + timedelta(days = 1)
        prev_hol, next_hol = self.check_holidays(prev_date, kor_holidays), self.check_holidays(next_date, kor_holidays)
        if prev_hol and next_hol:
            return 3
        days_offsets = [-2, -1, 1, 2]
        nearby_holidays = sum(self.check_holidays(date + timedelta(days = d), kor_holidays) for d in days_offsets)
        if nearby_holidays == 3:
            return 2
        elif nearby_holidays == 2:
            return 1
        else:
            return 0

    def get_month_weights(self, data = None, monthly_weights = monthly_weights):
        """Add 'month_weight' by mapping the 'month' column; returns None when no frame is given."""
        if data is not None:
            data['month_weight'] = data['month'].map(monthly_weights)
            return data

    def get_week_weights(self, data = None, weekly_weights = weekly_weights):
        """Add 'week_weight' by mapping the 'weekday' name column; returns None when no frame is given."""
        if data is not None:
            data['week_weight'] = data['weekday'].map(weekly_weights)
            return data

    def get_prev_days_mean(self, data, test_df = None, date = None, menu = None, howmany = 7):
        """
        Fill `prev_avg_{howmany}` for one menu and one week starting on Sunday `date`.

        The value is the mean '매출수량' of `menu` over the `howmany` days before `date`
        (0 when that window holds no rows). It is written to the rows of `data` for
        `menu` dated `date` .. `date` + 6 days.

        data : frame that receives the new column (and is returned)
        test_df : frame the past sales are read from; when None, `data` itself is used
        date : start of the week; anything but a Sunday leaves `data` untouched
        """
        # Weeks are anchored on Sunday. Previously a non-Sunday date returned np.nan,
        # which callers assigned back over their DataFrame.
        if date.weekday() != 6:
            return data

        source = data if test_df is None else test_df

        prev_start = date - timedelta(days = howmany)
        prev_end = date - timedelta(days = 1)
        prev_mask = (source['영업일자'] >= prev_start) & (source['영업일자'] <= prev_end) & (source['영업장명_메뉴명'] == menu)
        prev_avg = source.loc[prev_mask, '매출수량'].mean()
        # No history (e.g. the very first week) -> 0
        if pd.isna(prev_avg):
            prev_avg = 0

        week_end = date + timedelta(days = 6)
        curr_mask = (data['영업일자'] >= date) & (data['영업일자'] <= week_end) & (data['영업장명_메뉴명'] == menu)
        data.loc[curr_mask, f"prev_avg_{howmany}"] = prev_avg
        return data

    # Shared by train and test
    def make_fund_variables(self, data, month_to_season = month_to_season):
        """Add the basic calendar features to `data` (modified in place and returned)."""
        # Business date -> datetime
        data['영업일자'] = pd.to_datetime(data['영업일자'])

        # Year, month, day and weekday
        data['year'] = data['영업일자'].dt.year
        data['month'] = data['영업일자'].dt.month
        data['day'] = data['영업일자'].dt.day
        data['weekday'] = data['영업일자'].dt.day_name()
        data['weekday_enc'] = data['영업일자'].dt.weekday

        # Season
        data['season'] = data['month'].map(month_to_season)

        # Years elapsed since 2023
        data['year_enc'] = data['year'] - 2023

        # Cyclical encodings of month, day and weekday
        data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
        data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

        data['day_sin'] = np.sin(2 * np.pi * data['day'] / 31)
        data['day_cos'] = np.cos(2 * np.pi * data['day'] / 31)

        data['weekday_sin'] = np.sin(2 * np.pi * data['weekday_enc'] / 7)
        data['weekday_cos'] = np.cos(2 * np.pi * data['weekday_enc'] / 7)

        # Holiday flag (public holiday or weekend) and holiday name
        kor_holidays = self.update_kor_holidays()
        check_holiday = data['영업일자'].dt.date.isin(kor_holidays)
        check_weekend = data['weekday'].isin(['Saturday', 'Sunday'])
        data['is_holiday'] = (check_holiday | check_weekend).astype(int)
        data['holiday_name'] = data['영업일자'].dt.date.map(kor_holidays)

        return data

    # Input data for training
    def make_variables_train(self, data):
        """Build every training feature on `data` and clean negative sales; returns the frame."""
        data = self.make_fund_variables(data, self._mapping('month_to_season', month_to_season))
        kor_holidays = self.update_kor_holidays()

        ### Sandwich days
        # Compares each row with its neighbouring rows, so this assumes rows are ordered
        # by menu and then by consecutive date -- TODO confirm against the input file.
        holiday_flags = data['is_holiday'].astype(int).values
        prev_flags, next_flags = np.roll(holiday_flags, 1), np.roll(holiday_flags, -1)
        sandwich = (prev_flags == 1) & (holiday_flags == 0) & (next_flags == 1)
        data['is_sandwich'] = sandwich.astype(int)

        # np.roll wraps around, so the first and last dates are scored from the calendar instead
        first = data['영업일자'].min()
        data.loc[data['영업일자'] == first, 'is_sandwich'] = self.get_sandwich_score_for_dates(first, kor_holidays)

        last = data['영업일자'].max()
        data.loc[data['영업일자'] == last, 'is_sandwich'] = self.get_sandwich_score_for_dates(last, kor_holidays)

        # Holiday flag that also counts sandwich days
        data['is_holiday_sandwich'] = data['is_holiday'] | (data['is_sandwich'] > 0).astype(int)

        ### Month / weekday weights (constructor-supplied mappings take precedence)
        data = self.get_month_weights(data, self._mapping('monthly_weights', monthly_weights))
        data = self.get_week_weights(data, self._mapping('weekly_weights', weekly_weights))

        ### Mean sales of the preceding 7 and 14 days, per menu and week
        sundays = data[data['weekday'] == "Sunday"][["영업일자", "영업장명_메뉴명"]].copy()
        for _, row in sundays.iterrows():
            sunday = row['영업일자']
            menu = row['영업장명_메뉴명']
            data = self.get_prev_days_mean(data = data, date = sunday, menu = menu, howmany = 7)
            data = self.get_prev_days_mean(data = data, date = sunday, menu = menu, howmany = 14)

        ### Split venue and menu name
        if '영업장명_메뉴명' in data.columns:
            # n = 1: split on the first underscore only, so a menu name that itself
            # contains '_' still yields exactly two columns
            data[['영업장명', '메뉴명']] = data['영업장명_메뉴명'].str.split('_', n = 1, expand = True)

        ### Negative sales
        negative = data[data['매출수량'] < 0]

        for _, row in negative.iterrows():
            num = row['매출수량']
            if num >= -10:
                continue
            # Large negatives are netted against the previous day's sales of the same menu
            prev_date = pd.to_datetime(row['영업일자']) - pd.Timedelta(days = 1)
            prev_row = data[(data['영업일자'] == prev_date) & (data['영업장명_메뉴명'] == row['영업장명_메뉴명'])]

            # No previous-day row (e.g. negative value on the first date): nothing to net against
            if prev_row.empty:
                continue

            if prev_row.iloc[0]["매출수량"] >= abs(num):
                data.loc[prev_row.index[0], '매출수량'] += num

        # Whatever is still negative becomes 0
        data.loc[data['매출수량'] < 0, '매출수량'] = 0

        return data

    # Days to be predicted
    def make_variables_test(self, date, test_df, predict):
        """
        Build the feature frame for the `predict` days following `date`, for every menu in `test_df`.

        date : last observed date (end of the input window)
        test_df : observed data; supplies the menu list and the sales used for the
                  previous-days averages (its '영업일자' must be comparable with timestamps)
        predict : number of future days to generate
        """
        date = pd.to_datetime(date)
        future_dates = [date + timedelta(days = i + 1) for i in range(predict)]
        future_df = pd.DataFrame({'영업일자' : future_dates})

        menus_df = (test_df[['영업장명_메뉴명']].drop_duplicates().reset_index(drop = True))
        future_df = future_df.merge(menus_df, how='cross')

        kor_holidays = self.update_kor_holidays()

        # Basic calendar features
        future_df = self.make_fund_variables(future_df, self._mapping('month_to_season', month_to_season))

        future_df['영업일자'] = pd.to_datetime(future_df['영업일자']).dt.normalize()

        # Sandwich score: depends on the date only, so compute it once per date
        score_by_day = {
            day: self.get_sandwich_score_for_dates(day, kor_holidays)
            for day in future_df['영업일자'].drop_duplicates()}
        future_df['is_sandwich'] = future_df['영업일자'].map(score_by_day)

        # Holiday flag that also counts sandwich days
        future_df['is_holiday_sandwich'] = future_df['is_holiday'].astype(int) | (future_df['is_sandwich'] > 0).astype(int)

        # Month / weekday weights (constructor-supplied mappings take precedence)
        future_df = self.get_month_weights(future_df, self._mapping('monthly_weights', monthly_weights))
        future_df = self.get_week_weights(future_df, self._mapping('weekly_weights', weekly_weights))

        # Previous-days averages are read from test_df and written to future_df
        sundays = future_df.loc[future_df['weekday'] == "Sunday", ['영업일자', '영업장명_메뉴명']].copy()
        for _, row in sundays.iterrows():
            sunday = row['영업일자']
            menu = row['영업장명_메뉴명']
            future_df = self.get_prev_days_mean(data = future_df, test_df = test_df, date = sunday, menu = menu, howmany = 7)
            future_df = self.get_prev_days_mean(data = future_df, test_df = test_df, date = sunday, menu = menu, howmany = 14)

        return future_df

In [7]:
# Build every training feature in one go.
# NOTE: `data` is reassigned here, so re-running this cell feeds the already-processed frame back in.
mv = Make_Variables()
data = mv.make_variables_train(data)

In [None]:
# Column sums of the holiday / sandwich / combined flag columns
print("기존 공휴일 : ", data['is_holiday'].sum().item())
print("샌드위치 : ", data['is_sandwich'].sum().item())
print("샌드위치 포함 공휴일 : ", data['is_holiday_sandwich'].sum().item())

기존 공휴일 :  33389
샌드위치 :  579
샌드위치 포함 공휴일 :  33968


In [8]:
# Persist the feature frame so later sessions can skip the feature build
import pickle
data.to_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data.pickle")

#### 저장된 데이터 불러오기

In [8]:
# Load the saved feature frame
data = pd.read_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data.pickle")

In [9]:
# Quick look at the first two rows of the loaded frame
data.head(2)

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,year,month,day,weekday,weekday_enc,season,year_enc,...,is_holiday,holiday_name,is_sandwich,is_holiday_sandwich,month_weight,week_weight,prev_avg_7,prev_avg_14,영업장명,메뉴명
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,2023,1,1,Sunday,6,Winter,0,...,1,New Year's Day,0,1,1.5,1.5,0.0,0.0,느티나무 셀프BBQ,1인 수저세트
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,2023,1,2,Monday,0,Winter,0,...,0,,0,0,1.5,0.8,0.0,0.0,느티나무 셀프BBQ,1인 수저세트


In [10]:
# Every model input column
cols = [
    "year_enc",
    "month_sin", "month_cos",
    "day_sin", "day_cos",
    "weekday_sin", "weekday_cos",
    "season",
    "is_holiday", "is_holiday_sandwich",
    "month_weight", "week_weight",
    "prev_avg_7", "prev_avg_14",
]
# Columns handed to the categorical encoders
enc_cols = [
    "season",
    "is_holiday",
    "is_holiday_sandwich",
]
# Numeric columns; min-max scaling these is the safe choice for the LSTM
num_cols = [
    "month_weight",
    "week_weight",
    "prev_avg_7",
    "prev_avg_14",
]

#### 매출 여부 (분류)

In [2]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
! pip install category_encoders
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score
from typing import Any, Dict, List, Optional
from collections import defaultdict
import pickle
import joblib

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [1]:
class ClassificationModel():
    """Per-menu binary classifiers for the '매출_여부' (any sales that day) target."""

    def __init__(self, data = None, cols = None, enc_cols = None, model_path = None):
        """Store optional defaults; every method also takes its inputs explicitly."""
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        self.model_path = model_path

    def fit_model_cv(self, data : pd.DataFrame, cols : List[str], enc_cols : List[str]) -> pd.DataFrame:
        """
        Time-series cross-validation per menu, returning out-of-fold probabilities.

        data - full dataset (needs '영업장명_메뉴명', '영업일자', '매출_여부' and `cols`)
        cols - all feature columns
        enc_cols - feature columns passed through the target encoder (categorical)

        Returns a frame indexed like `data` with columns '영업장명_메뉴명', 'y_true' and
        'y_proba'. 'y_proba' is NaN for rows that never fell into a validation fold
        or whose fold could not be trained.
        """
        oof_parts = []
        passthrough_cols = [c for c in cols if c not in enc_cols]
        n_splits = 3

        for menu, group_df in data.groupby("영업장명_메뉴명"):

            group_df = group_df.sort_values('영업일자')

            # Kept so the out-of-fold values can be aligned with the original rows
            org_idx = group_df.index
            oof_proba = np.full(len(group_df), np.nan)

            x = group_df[cols]
            y = group_df["매출_여부"]

            if len(group_df) <= n_splits:
                # TimeSeriesSplit needs more samples than splits; leave this menu without predictions
                print(f"{menu} 학습 불가")
                folds = []
            else:
                folds = TimeSeriesSplit(n_splits = n_splits).split(x)

            for train_idx, val_idx in folds:

                x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
                x_val = x.iloc[val_idx]

                # A single-class training fold cannot be learned from
                if y_train.nunique() < 2:
                    print(f"{menu} 학습 불가")
                    continue

                # Encoder is fitted on the training fold only
                target_encoder = TargetEncoder()
                target_encoder.fit(x_train[enc_cols], y_train)

                x_train = pd.concat([
                    target_encoder.transform(x_train[enc_cols]),
                    x_train[passthrough_cols]
                ], axis = 1)

                x_val = pd.concat([
                    target_encoder.transform(x_val[enc_cols]),
                    x_val[passthrough_cols]
                ], axis = 1)

                xgb_model = XGBClassifier(random_state = 1471)
                xgb_model.fit(x_train, y_train)
                oof_proba[val_idx] = xgb_model.predict_proba(x_val)[:, 1]

            part = pd.DataFrame({
                "index": org_idx.values,
                "영업장명_메뉴명": menu,
                "y_true": group_df["매출_여부"].values,
                "y_proba": oof_proba
            }).set_index("index")

            oof_parts.append(part)

        oof_df = pd.concat(oof_parts).sort_index()
        return oof_df


    def tuning_cv(self, data, cols, enc_cols, param_grid):
        """
        Placeholder for per-menu hyper-parameter tuning; not implemented, always returns None.
        """
        return None


    def tune_threshold(self, oof_df : pd.DataFrame, metric = 'f1') -> Dict[str, float]:
        """
        Pick, per menu, the probability threshold that maximises F1 on the out-of-fold rows.

        oof_df - output of `fit_model_cv`
        metric - currently unused; F1 is always the criterion

        Rows whose 'y_proba' is NaN (no out-of-fold prediction) are left out of the
        score; a menu without any prediction keeps the default threshold 0.5.
        """
        best_thresholds = {}

        for menu, group_df in oof_df.groupby("영업장명_메뉴명"):
            y_true = group_df["y_true"].values
            y_proba = group_df["y_proba"].values

            # NaN >= thr is False, so unscored rows would otherwise count as predicted negatives
            scored = ~np.isnan(y_proba.astype(float))
            if not scored.any():
                best_thresholds[menu] = 0.5
                continue
            y_true, y_proba = y_true[scored], y_proba[scored]

            best_score, best_thr = -1, 0.5
            for thr in np.linspace(0.05, 0.95, 51):
                y_pred = (y_proba >= thr).astype(int)
                score = f1_score(y_true, y_pred, zero_division = 0)
                if score > best_score:
                    best_score = score
                    best_thr = thr

            best_thresholds[menu] = best_thr

        return best_thresholds


    def get_final_model(self, data, cols, enc_cols, thresholds, hyperparameters_dict = None) -> Dict[str, Dict[str, Any]]:
        """
        Fit one final classifier per menu on all of its rows.

        thresholds - per-menu decision thresholds (0.5 when a menu is missing)
        hyperparameters_dict - optional per-menu XGBClassifier keyword arguments

        Returns {menu: {"model", "encoder", "threshold"}}.
        """
        models = {}

        for menu, group_df in data.groupby("영업장명_메뉴명"):

            group_df = group_df.sort_values('영업일자')

            # Categorical columns are encoded using all rows of the menu
            target_encoder_full = TargetEncoder()
            group_df[enc_cols] = target_encoder_full.fit_transform(group_df[enc_cols], group_df['매출_여부'])

            x_full = group_df[cols]
            y_full = group_df["매출_여부"]

            hyperparameters = hyperparameters_dict.get(menu, {}) if hyperparameters_dict else {}

            xgb_model_full = XGBClassifier(random_state = 1471, **hyperparameters)
            xgb_model_full.fit(x_full, y_full)

            models[menu] = {
                "model" : xgb_model_full,
                "encoder" : target_encoder_full,
                "threshold" : thresholds.get(menu, 0.5)}

        return models

    def fit_whole_model(self, data, cols, enc_cols) -> Dict[str, Dict[str, Any]]:
        """Run CV, threshold search and the final fit in sequence; returns the per-menu models."""
        oof_df = self.fit_model_cv(data, cols, enc_cols)
        print("CV 완료!")
        thresholds = self.tune_threshold(oof_df)
        print("threshold 탐색 완료!")
        models = self.get_final_model(data, cols, enc_cols, thresholds)
        return models

    def save_cls_model(self, models, model_path):
        """Dump the per-menu model dictionary to `model_path` with joblib."""
        joblib.dump(models, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        """Load a model dictionary written by `save_cls_model`."""
        models = joblib.load(model_path)
        return models

NameError: name 'pd' is not defined

In [None]:
# Train and store the per-menu "any sales?" classifiers
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/cls_models.pkl'
classification = ClassificationModel()

# Binary target: 1 when the day has positive sales, otherwise 0
data_zero = data.copy()
data_zero['매출_여부'] = (data_zero['매출수량'] > 0).astype(int)

models = classification.fit_whole_model(data = data_zero, cols = cols, enc_cols = enc_cols)
classification.save_cls_model(models, model_path)

In [None]:
# Reload the saved classification models
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/cls_models.pkl'
classification = ClassificationModel()
models_class = classification.load_saved_model(model_path)

#### 매출 예측 (회귀)

In [None]:
from xgboost import XGBRegressor
# ! pip install category_encoders
from category_encoders import TargetEncoder
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from itertools import product
import pickle
import joblib

In [None]:
class RegressionModel():
    """Per-menu XGBoost regressors for the '매출수량' (units sold) target."""

    def __init__(self, data = None, cols = None, enc_cols = None, model_path = None):
        """Store optional defaults; every method also takes its inputs explicitly."""
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        self.model_path = model_path

    def smape_score(self, y_true, y_pred, eps = 1e-8):
        """SMAPE in percent (0-200); denominators below `eps` are replaced by `eps`."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        denom = np.abs(y_true) + np.abs(y_pred)
        denom = np.where(denom < eps, eps, denom)
        return 200.0 * np.mean(np.abs(y_true - y_pred) / denom)

    def _prepare_folds(self, x, y, cols, enc_cols, tscv, menu, min_train = 20):
        """Split one menu into time-series folds and target-encode each fold once.

        Returns a list of (x_train, y_train, x_val, y_val); folds whose training part
        has fewer than `min_train` rows are reported and dropped.
        """
        passthrough_cols = [c for c in cols if c not in enc_cols]
        folds = []

        for train_idx, val_idx in tscv.split(x):
            x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
            x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

            # Too little data to train on
            if len(x_train) < min_train:
                print(f"{menu} 튜닝 불가)")
                continue

            # Encoder is fitted on the training fold only
            target_encoder = TargetEncoder()
            target_encoder.fit(x_train[enc_cols], y_train)

            x_train = pd.concat([
                target_encoder.transform(x_train[enc_cols]),
                x_train[passthrough_cols]
            ], axis = 1)

            x_val = pd.concat([
                target_encoder.transform(x_val[enc_cols]),
                x_val[passthrough_cols]
            ], axis = 1)

            folds.append((x_train, y_train, x_val, y_val))

        return folds

    def tuning_cv(self, data : pd.DataFrame, cols : List[str], enc_cols : List[str], param_grid = None) -> Dict[str, Dict]:
        """
        Grid-search XGBRegressor hyper-parameters per menu with time-series CV.

        param_grid - {parameter name: candidate values}; a small default grid is used when None

        Returns {menu: best parameter dict (including random_state)} by lowest mean SMAPE.
        Menus that cannot be split or have no usable fold are absent from the result.
        """
        best_params_by_menu = {}

        if param_grid is None:
            param_grid = {
                "max_depth" : [5, 7, 9],
                "n_estimators" : [100, 400],
                "learning_rate" : [0.05, 0.1]}

        keys = list(param_grid.keys())
        combos = list(product(*[param_grid[k] for k in keys]))
        n_splits = 3

        for menu, group_df in data.groupby("영업장명_메뉴명"):

            group_df = group_df.sort_values("영업일자")

            # TimeSeriesSplit raises when there are not more samples than splits
            if len(group_df) <= n_splits:
                print(f"{menu} 튜닝 불가)")
                continue

            x = group_df[cols]
            y = group_df["매출수량"]

            # Folds and their encodings do not depend on the hyper-parameters: build them once
            tscv = TimeSeriesSplit(n_splits = n_splits)
            folds = self._prepare_folds(x, y, cols, enc_cols, tscv, menu)
            if not folds:
                continue

            best_smape = np.inf
            best_params = None

            for values in combos:
                base_params = {"random_state" : 1478}
                base_params.update(dict(zip(keys, values)))

                fold_smapes : List[float] = []

                for x_train, y_train, x_val, y_val in folds:
                    xgb_model = XGBRegressor(**base_params)
                    xgb_model.fit(x_train, y_train)

                    pred = xgb_model.predict(x_val)
                    fold_smapes.append(self.smape_score(y_val, pred))

                mean_smape = float(np.mean(fold_smapes))

                if mean_smape < best_smape:
                    best_smape = mean_smape
                    best_params = base_params

            if best_params is not None:
                best_params_by_menu[menu] = best_params

        return best_params_by_menu


    def get_final_model(self, data, cols, enc_cols, hyperparameters_dict : dict[str, dict[str, Any]] = None)  -> Dict[str, Dict[str, Any]]:
        """
        Fit one final regressor per menu on all of its rows.

        data - dataset to train on
        cols - all feature columns
        enc_cols - feature columns passed through the target encoder (categorical)
        hyperparameters_dict - optional per-menu XGBRegressor keyword arguments

        Returns {menu: {"model", "hyperparameters", "encoder"}}; menus with fewer than
        10 rows are skipped.
        """
        models = {}

        for menu, group_df in data.groupby("영업장명_메뉴명"):

            group_df = group_df.sort_values("영업일자")

            # Too little data to train on
            if len(group_df) < 10:
                print(f"{menu} 학습 불가")
                continue

            # Categorical columns are encoded using all rows of the menu
            target_encoder_full = TargetEncoder()
            group_df[enc_cols] = target_encoder_full.fit_transform(group_df[enc_cols], group_df['매출수량'])

            x_full = group_df[cols]
            y_full = group_df["매출수량"]

            params = hyperparameters_dict.get(menu, {}) if hyperparameters_dict else {}

            base_params = {"random_state" : 1478}
            base_params.update(params)

            xgb_model_full = XGBRegressor(**base_params)
            xgb_model_full.fit(x_full, y_full)

            models[menu] = {
                "model" : xgb_model_full,
                "hyperparameters" : base_params,
                "encoder" : target_encoder_full
            }

        return models

    def fit_whole_model(self, data, cols, enc_cols) -> Dict[str, Dict[str, Any]]:
        """Tune per menu, then refit each menu with its best parameters; returns the models."""
        best_params_by_menu = self.tuning_cv(data, cols, enc_cols)
        print("튜닝 완료!")
        models = self.get_final_model(data, cols, enc_cols, best_params_by_menu)
        return models

    def save_reg_model(self, models, model_path):
        """Dump the per-menu model dictionary to `model_path` with joblib."""
        joblib.dump(models, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        """Load a model dictionary written by `save_reg_model`."""
        models = joblib.load(model_path)
        return models

In [None]:
regression = RegressionModel()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/reg_models.pkl'

# The regressors are trained on days with positive sales only
data_notzero = data[data['매출수량'] > 0]
# NOTE(review): make_validation is not defined in the visible part of this notebook -- confirm it exists
train_reg, validation_reg = make_validation(data_notzero)

# RegressionModel has no `fit_model_by_menu`; `fit_whole_model` tunes each menu with
# time-series CV and refits it. validation_reg stays out of training as a hold-out.
models = regression.fit_whole_model(train_reg, cols, enc_cols)
regression.save_reg_model(models, model_path)

모델 저장 완료!


In [None]:
# Reload the saved regression models
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/reg_models.pkl'
regression = RegressionModel()
models_reg = regression.load_saved_model(model_path)

#### 매출 예측 (시계열)

In [18]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
import random
import glob
import joblib

import torch
import torch.nn as nn
from tqdm import tqdm

In [12]:
### Random seed
def set_seed(seed = 1471):
    """Seed Python, NumPy and PyTorch RNGs and, when CUDA is present, force deterministic cuDNN."""
    # Python's random, NumPy, then the PyTorch CPU generator
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(1478)

In [13]:
class MultiOutputLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that emits `output_dim` values per sequence."""

    def __init__(self, input_dim = 1, hidden_dim = 256, num_layers = 4, output_dim = 7):
        """Create the recurrent stack (batch-first) and the linear read-out layer."""
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first = True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the LSTM and project its output at the last time step -> (batch, output_dim)."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [14]:
# SMAPE loss that accepts explicit sample weights
class WeightedSMAPELoss(nn.Module):
    """Weighted mean of 2|p - t| / (|p| + |t|) over the elements whose target is non-zero."""

    def __init__(self, eps=1e-8):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true, w=None):
        """Return the scalar loss; `w` may be None, 1-D (one weight per row) or expandable to `y_true`."""
        abs_diff = torch.abs(y_pred - y_true)
        scale = torch.clamp(torch.abs(y_pred) + torch.abs(y_true), min=self.eps)
        per_element = 2.0 * abs_diff / scale

        # Elements with a zero target do not contribute
        nonzero = (y_true != 0).float()

        if w is not None:
            per_row = w.view(-1, 1) if w.dim() == 1 else w
            weights = per_row.expand_as(y_true)
        else:
            weights = torch.ones_like(y_true)

        effective = weights * nonzero
        normaliser = effective.sum().clamp(min=self.eps)
        return (per_element * effective).sum() / normaliser

# Huber loss with sample weights
class WeightedHuberLoss(nn.Module):
    """Weighted mean Huber loss, optionally skipping elements whose target is zero."""

    def __init__(self, delta = 1.0, zeros : bool = False, eps = 1e-8):
        """
        delta - absolute error at which the loss switches from quadratic to linear
        zeros - when True, elements with y_true == 0 are excluded
        eps - lower bound of the normaliser, guards against division by zero
        """
        super().__init__()
        self.delta = float(delta)
        self.zeros = bool(zeros)
        self.eps = float(eps)

    def forward(self, y_pred, y_true, w = None):
        """
        w - optional weights; a 1-D tensor is treated as one weight per row
        """
        residual = y_pred - y_true
        magnitude = residual.abs()
        quadratic = 0.5 * residual**2
        linear = self.delta * (magnitude - 0.5 * self.delta)
        per_element = torch.where(magnitude <= self.delta, quadratic, linear)

        keep = (y_true != 0).float() if self.zeros else torch.ones_like(y_true)

        if w is not None:
            per_row = w.view(-1, 1) if w.dim() == 1 else w  # 1-D -> (N, 1)
            weights = per_row.expand_as(y_true).float()
        else:
            weights = torch.ones_like(y_true)

        effective = weights * keep
        normaliser = effective.sum().clamp(min=self.eps)
        return (per_element * effective).sum() / normaliser

# Weighted blend of several losses (e.g. SMAPE + Huber)
class CombinationLoss(nn.Module):
    """Convex combination of loss modules that share the (y_pred, y_true, w) call signature."""

    def __init__(self, losses, weights):
        """
        losses - list of loss modules to blend
        weights - one mixing weight per loss; normalised to sum to 1
        """
        super().__init__()
        lengths_match = bool(losses) and len(losses) == len(weights)
        if not lengths_match:
            raise ValueError("loss 배열과 weight 배열의 길이가 다릅니다.")
        self.losses = nn.ModuleList(losses)

        raw = torch.tensor(weights, dtype=torch.float32)
        self.weights = raw / raw.sum()

    def forward(self, y_pred, y_true, w = None):
        """
        w - sample weights handed unchanged to every component loss
        """
        # The mixing weights follow the predictions' device
        mix = self.weights.to(y_pred.device)
        total = 0.0
        for coef, loss_fn in zip(mix, self.losses):
            total = total + coef * loss_fn(y_pred, y_true, w)
        return total

In [16]:
class LSTMModel():
    """
    Trains one LSTM per store/menu series and bundles each model with the
    encoders and scalers that were fitted on that series' training split.
    """

    def __init__(self, data = None, cols = None, enc_cols = None, num_cols = None, scaler = None,
                 lookback = 28, predict = 7, hidden_dim = 256, num_layers = 4, device = "cuda", epochs = 200, batch_size = 16,
                 delta = 0.1, val_ratio = 0.2, horizon_weight_mode = "linear",
                 lr = 0.04, losses = None, loss_weights = None, menu_weights = None):
        """
        lookback     - number of past time steps in each input window
        predict      - number of future steps predicted at once
        loss_weights - per-loss coefficients; defaults to [0.5, 0.5]
        All other arguments are stored unchanged as attributes of the same name.
        """
        self.data = data
        self.cols = cols
        self.enc_cols = enc_cols
        self.num_cols = num_cols  # previously accepted but never stored
        self.scaler = scaler
        self.lookback = lookback
        self.predict = predict
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.device = torch.device(device)
        self.epochs = epochs
        self.batch_size = batch_size
        self.delta = delta
        self.val_ratio = val_ratio
        self.horizon_weight_mode = horizon_weight_mode
        self.lr = float(lr)
        self.losses = losses
        # A fresh list per instance instead of one shared mutable default.
        self.loss_weights = [0.5, 0.5] if loss_weights is None else loss_weights
        self.menu_weights = menu_weights

    def label_encoding_lstm(self, data, enc_cols):
        """
        Label-encode the object / bool / category columns listed in enc_cols.
        `data` is modified in place; returns (data, {column: fitted LabelEncoder}).
        """
        encoders = {}
        for col in enc_cols:
            if data[col].dtype == 'object' or data[col].dtype.name == 'bool' or data[col].dtype.name == 'category':
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
                encoders[col] = le
        return data, encoders

    def minmax_scaling_features(self, data, num_cols):
        """Min-max scale num_cols of `data` in place; returns (data, fitted scaler)."""
        scaler = MinMaxScaler()
        data[num_cols] = scaler.fit_transform(data[num_cols])
        return data, scaler

    def minmax_scaling_target(self, data):
        """Min-max scale the '매출수량' target column in place; returns (data, fitted scaler)."""
        scaler = MinMaxScaler()
        data['매출수량'] = scaler.fit_transform(data[['매출수량']])
        return data, scaler

    def build_horizon_weights(self, predict = 7):
        """
        Build the weight vector over the forecast horizon, normalised to sum to 1.
        - 'uniform' mode: equal weight for every step
        - any other mode (default 'linear'): weights 1..predict, so later steps weigh more
        """
        if self.horizon_weight_mode == "uniform":
            w = torch.ones(predict)
        else:
            w = torch.arange(1, predict + 1).float()
        w = w / w.sum()
        return w

    def compute_smape(self, y_pred, y_true, eps=1e-8):
        """SMAPE on the scaled values, averaged over elements with y_true != 0; returns a float."""
        num = (y_pred - y_true).abs()
        den = (y_pred.abs() + y_true.abs()).clamp(min=eps)
        smape = 2.0 * num / den
        mask = (y_true != 0).float()
        denom = mask.sum().clamp(min=eps)
        return (smape * mask).sum().item() / denom.item()

    def _tss_last_split_indices(self, n_samples: int, n_splits: int = 3):
        """
        Return (train_idx, val_idx) of the last TimeSeriesSplit fold.
        The split uses gap = lookback + predict - 1 between train and validation.
        Returns (None, None) when the series is too short for the requested split.
        """
        # Imported here on purpose: an import in the class body only creates a
        # class attribute, which method bodies cannot resolve as a bare name.
        from sklearn.model_selection import TimeSeriesSplit

        tscv = TimeSeriesSplit(n_splits=n_splits, gap=self.lookback + self.predict - 1)
        tr_idx, va_idx = None, None
        try:
            for tr, va in tscv.split(np.arange(n_samples)):
                tr_idx, va_idx = tr, va
        except ValueError:
            # sklearn raises when n_samples cannot hold n_splits folds plus the gap.
            return None, None
        return tr_idx, va_idx

    def _make_windows(self, x: np.ndarray, y: np.ndarray):
        """
        Slide a (lookback -> predict) window over one contiguous segment.
        x: (T, F), y: (T,)
        return: x windows (N, lookback, F) and y windows (N, predict) as float32,
                or (None, None) when the segment is too short for a single window
        """
        T = len(x)
        xs, ys = [], []
        for i in range(T - self.lookback - self.predict + 1):
            xs.append(x[i:i+self.lookback])
            ys.append(y[i+self.lookback:i+self.lookback+self.predict])
        if not xs:
            return None, None
        return np.stack(xs).astype(np.float32), np.stack(ys).astype(np.float32)

    def train_lstm(self, train_df, cols, enc_cols, num_cols,
                   device, epochs, batch_size, lr, losses, loss_weights,
                   n_splits : int = 3, print_every = 50):
        """
        Train one LSTM per '영업장명_메뉴명' group and return them in a dict.

        train_df     - frame holding `cols`, '매출수량', '영업일자' and '영업장명_메뉴명'
        cols         - model input columns
        enc_cols     - columns to label-encode (fitted on the training split only)
        num_cols     - columns to min-max scale (fitted on the training split only)
        losses / loss_weights - passed to CombinationLoss
        n_splits     - TimeSeriesSplit folds; the last fold is the validation set
        print_every  - log every N epochs (0 / None disables logging)

        Returns {group key: {'model', 'encoders', 'features_scaler', 'target_scaler'}}.
        Groups that are too short to split or to form a window are skipped.
        """
        trained_models = {}
        horizon_w = self.build_horizon_weights(self.predict).to(device)

        comb_loss = CombinationLoss(losses = losses, weights = loss_weights).to(device)

        # Keys are stored exactly as groupby yields them. With a list grouper,
        # recent pandas versions yield 1-tuples.
        # NOTE(review): confirm the prediction code looks bundles up in the same form.
        for store_menu, group in tqdm(train_df.groupby(["영업장명_메뉴명"]), desc = "Training LSTM"):

            # Sort by date; skip series that cannot form even one window.
            store_train = group.sort_values("영업일자").copy()
            if len(store_train) < self.lookback + self.predict:
                continue

            # Last TimeSeriesSplit fold is the validation segment.
            tr_idx, va_idx = self._tss_last_split_indices(len(store_train), n_splits=n_splits)
            if tr_idx is None or va_idx is None:
                continue
            df_tr = store_train.iloc[tr_idx].copy()
            df_va = store_train.iloc[va_idx].copy()

            # Fit encoders / scalers on the training split only, then apply to both.
            encoders = {}
            if enc_cols:
                for c in enc_cols:
                    le = LabelEncoder()
                    df_tr[c] = le.fit_transform(df_tr[c].astype(str))
                    # Categories unseen during training map to -1 in validation.
                    cls2idx = {cls: i for i, cls in enumerate(le.classes_)}
                    df_va[c] = df_va[c].astype(str).map(lambda v: cls2idx.get(v, -1))
                    encoders[c] = le

            features_scaler = None
            if num_cols:
                features_scaler = MinMaxScaler()
                df_tr[num_cols] = features_scaler.fit_transform(df_tr[num_cols])
                df_va[num_cols] = features_scaler.transform(df_va[num_cols])

            target_scaler = MinMaxScaler()
            df_tr[['매출수량']] = target_scaler.fit_transform(df_tr[['매출수량']])
            df_va[['매출수량']] = target_scaler.transform(df_va[['매출수량']])

            X_tr = df_tr[cols].to_numpy(dtype=np.float32)
            y_tr = df_tr['매출수량'].to_numpy(dtype=np.float32)
            X_va = df_va[cols].to_numpy(dtype=np.float32)
            y_va = df_va['매출수량'].to_numpy(dtype=np.float32)

            # Windows are built inside each segment, never across the split boundary.
            x_tr_np, y_tr_np = self._make_windows(X_tr, y_tr)
            x_va_np, y_va_np = self._make_windows(X_va, y_va)
            if x_tr_np is None or x_va_np is None:
                continue

            x_train = torch.tensor(x_tr_np).to(device)         # (N_tr, lookback, F)
            y_train = torch.tensor(y_tr_np).to(device)         # (N_tr, predict)
            x_val   = torch.tensor(x_va_np).to(device)
            y_val   = torch.tensor(y_va_np).to(device)

            # A separate model per store/menu series.
            model = MultiOutputLSTM(input_dim = len(cols),
                                    hidden_dim = self.hidden_dim,
                                    num_layers = self.num_layers,
                                    output_dim = self.predict).to(device)

            optimizer = torch.optim.Adam(model.parameters(), lr =lr)
            best_val = float("inf")
            best_sd = None

            model.train()

            for epoch in range(1, epochs + 1):
                perm = torch.randperm(len(x_train), device = device)
                epoch_train_loss = 0.0
                total_train_count = 0

                for i in range(0, len(x_train), batch_size):
                    batch_idx = perm[i : i+batch_size]
                    x_batch, y_batch = x_train[batch_idx], y_train[batch_idx]
                    # Same horizon weights for every sample of the batch.
                    weight_mat = horizon_w.unsqueeze(0).repeat(y_batch.size(0), 1)
                    output = model(x_batch)
                    loss = comb_loss(output, y_batch, w = weight_mat)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    epoch_train_loss += loss.item() * y_batch.size(0)
                    total_train_count += y_batch.size(0)

                epoch_train_loss /= max(1, total_train_count)

                # Validation on the held-out segment.
                with torch.no_grad():
                    model.eval()
                    pred_val = model(x_val)
                    weight_val = horizon_w.unsqueeze(0).repeat(y_val.size(0), 1)
                    val_loss = comb_loss(pred_val, y_val, w = weight_val).item()
                    val_smape = self.compute_smape(pred_val, y_val)

                    # Honour print_every (it used to be ignored in favour of a fixed 50).
                    if print_every and epoch % print_every == 0:
                        print(f"\n[{store_menu}] Epoch {epoch} ==============================\n "
                            f"Train Loss {epoch_train_loss:.5f} | Val Loss {val_loss:.5f} | SMAPE {val_smape:.5f}")

                    # Keep a CPU copy of the best weights seen so far.
                    if val_loss < best_val:
                        best_val = val_loss
                        best_sd = {k : v.detach().cpu().clone() for k, v in model.state_dict().items()}
                    model.train()

            if best_sd is not None:
                model.load_state_dict(best_sd)

            trained_models[store_menu] = {
                'model': model.eval(),
                'encoders' : encoders,
                'features_scaler' : features_scaler,
                'target_scaler': target_scaler,
                }

        return trained_models

    def save_lstm_model_gpu(self, models, model_path):
        """Dump the model bundles as they are (on whatever device they live) with joblib."""
        joblib.dump(models, model_path)
        print("GPU 버전 모델 저장 완료!")

    def save_lstm_model_cpu(self, models, model_path):
        """
        Dump the model bundles with every model moved to the CPU.
        Note: `.to('cpu')` moves the passed-in models themselves, not copies.
        """
        cpu_models = {}
        for k, bundle in models.items():
            cpu_models[k] = {
                'model': bundle['model'].to('cpu').eval(),
                'encoders': bundle['encoders'],
                'features_scaler': bundle['features_scaler'],
                'target_scaler': bundle['target_scaler'],
                # train_lstm does not store this key, so read it optionally
                # instead of raising KeyError.
                'last_sequence': bundle.get('last_sequence'),
            }
        joblib.dump(cpu_models, model_path)
        print("CPU 버전 모델 저장 완료!")


    def load_saved_model(self, model_path):
        """Load model bundles previously written by one of the save_* methods."""
        # SECURITY: joblib.load unpickles arbitrary objects; only load trusted files.
        models = joblib.load(model_path)
        return models

In [19]:
# Hyper-parameters and device for this training run
lookback, predict, batch_size, epochs = 28, 7, 16, 200
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training frame: model inputs plus target, date and group key
features = cols + ["매출수량", "영업일자", "영업장명_메뉴명"]
dataset_lstm = data[features]

# Loss: SMAPE and Huber combined with equal weights
loss_weights = [0.5, 0.5]
losses = [WeightedSMAPELoss(eps = 1e-8), WeightedHuberLoss(delta = 1.0, zeros = False, eps = 1e-8)]

lstm = LSTMModel(lookback = lookback, predict = predict, hidden_dim = 256, num_layers = 4)
trained_lstm = lstm.train_lstm(
    train_df = dataset_lstm, cols = cols, enc_cols = enc_cols, num_cols = num_cols,
    device = device, epochs = epochs, batch_size = batch_size, lr = 0.04,
    losses = losses, loss_weights = loss_weights, n_splits = 3, print_every = 50,
)

# Save the bundles as trained first, then the CPU copy
# (model_path ends up pointing at the CPU file)
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/lstm_models_gpu.pkl'
lstm.save_lstm_model_gpu(trained_lstm, model_path)

model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/lstm_models_cpu.pkl'
lstm.save_lstm_model_cpu(trained_lstm, model_path)

Training LSTM:   0%|          | 0/193 [00:00<?, ?it/s]

 Train Loss 2.13194 | Val Loss 3.18855 | SMAPE 1.80762
 Train Loss 5.86421 | Val Loss 6.20008 | SMAPE 1.91770


Training LSTM:   0%|          | 0/193 [13:58<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Load the saved models - GPU version
lookback = 28
predict = 7
batch_size = 16
epochs = 200
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

lstm = LSTMModel(lookback = lookback, predict = predict)
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/lstm_models_gpu.pkl'
trained_lstm = lstm.load_saved_model(model_path)

CPU 버전 모델 저장 완료!


In [None]:
# Load the saved models - CPU version
lookback = 28
predict = 7
batch_size = 16
epochs = 200
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

lstm = LSTMModel(lookback = lookback, predict = predict)
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 3/lstm_models_cpu.pkl'
trained_lstm = lstm.load_saved_model(model_path)

#### 예측하기

In [None]:
class PredictionFunctions():
    """
    Produces per-menu forecasts for the days following a test window, using the
    trained classifier, regressor or LSTM bundles.
    """

    def __init__(self, test_df = None, trained_models = None, test_prefix = None, cols = None, enc_cols = None, lookback = 28, predict = 7):
        """Store the arguments unchanged as attributes of the same name."""
        self.test_df = test_df
        self.trained_models = trained_models
        self.test_prefix = test_prefix
        self.cols = cols
        self.enc_cols = enc_cols
        self.lookback = lookback
        self.predict = predict

    @staticmethod
    def _menu_name(group_key):
        """Unwrap the 1-tuple that groupby([col]) may yield into the plain group value."""
        if isinstance(group_key, tuple):
            return group_key[0]
        return group_key

    @staticmethod
    def _find_bundle(trained_models, group_key):
        """Return the bundle stored under the plain name or its 1-tuple form, else None."""
        name = PredictionFunctions._menu_name(group_key)
        for candidate in (name, (name,)):
            if candidate in trained_models:
                return trained_models[candidate]
        return None

    @staticmethod
    def _encode_like_training(encoder, values):
        """Map values to the encoder's class indices via their string form; unseen values become -1."""
        cls2idx = {str(cls): i for i, cls in enumerate(encoder.classes_)}
        return values.astype(str).map(lambda v: cls2idx.get(v, -1))

    def _future_features(self, store_test, encoder, enc_cols, predict):
        """Build the feature frame for the `predict` days after the last date of one menu's test rows."""
        # Work on a copy: groupby hands out a slice of the caller's frame.
        store_test = store_test.copy()
        store_test['영업일자'] = pd.to_datetime(store_test['영업일자'])
        last_date = store_test['영업일자'].sort_values().iloc[-1]

        mv = Make_Variables()
        future_df = mv.make_variables_test(date = last_date, test_df = store_test, predict = predict)
        encoded = encoder.transform(future_df[enc_cols])
        future_df[enc_cols] = pd.DataFrame(encoded, columns = enc_cols, index = future_df.index)
        return future_df

    def predict_class(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, lookback = 28, predict = 7):
        """
        Input : test_df - test data, trained_models - {menu : {model, encoder[, threshold]}}, cols - input columns
        Output : DataFrame with columns [영업일자, 영업장명_메뉴명, 매출여부]

        The label is 1 when the positive-class probability reaches the bundle's
        'threshold' (0.5 when the bundle has none).
        """
        results = []
        # Labels: TEST_00+1일 ... TEST_00+<predict>일
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

        for group_key, store_test in test_df.groupby(['영업장명_메뉴명']):
            # Only menus that have a trained model are predicted.
            bundle = self._find_bundle(trained_models, group_key)
            if bundle is None:
                continue
            store_menu = self._menu_name(group_key)

            model = bundle["model"]
            encoder = bundle["encoder"]
            threshold = bundle.get("threshold", 0.5)

            future_df = self._future_features(store_test, encoder, enc_cols, predict)
            x = future_df[cols]

            # Column of predict_proba that corresponds to class 1.
            if hasattr(model, "classes_"):
                pos_idx = int(np.where(model.classes_ == 1)[0][0])
            else:
                pos_idx = 1

            proba = model.predict_proba(x)[:, pos_idx]
            # Use the thresholded probabilities (previously computed but discarded
            # in favour of model.predict on the un-narrowed frame).
            y_hat = (proba >= threshold).astype(int)

            for d, val in zip(pred_dates, y_hat):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출여부': int(val)
                })

        return pd.DataFrame(results)

    def predict_reg(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, lookback = 28, predict = 7):
        """
        Input : test_df - test data, trained_models - {menu : {model, encoder}}, cols - input columns
        Output : DataFrame with columns [영업일자, 영업장명_메뉴명, 매출수량]
        """
        results = []
        # Labels: TEST_00+1일 ... TEST_00+<predict>일
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

        for group_key, store_test in test_df.groupby(['영업장명_메뉴명']):
            # Only menus that have a trained model are predicted.
            bundle = self._find_bundle(trained_models, group_key)
            if bundle is None:
                continue
            store_menu = self._menu_name(group_key)

            model = bundle["model"]
            encoder = bundle["encoder"]

            future_df = self._future_features(store_test, encoder, enc_cols, predict)

            for d, val in zip(pred_dates, model.predict(future_df[cols])):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출수량': val
                })

        return pd.DataFrame(results)

    def predict_lstm(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, num_cols : list, lookback = 28, predict = 7):
        """
        Input : test_df - test data, trained_models - {menu : {model, encoders, features_scaler, target_scaler}},
                cols - input columns
        Output : DataFrame with columns [영업일자, 영업장명_메뉴명, 매출수량(lstm)]

        The model sees only the last `lookback` rows of each menu; negative
        predictions are clipped to 0. The menu column holds the plain group
        value even when groupby yields 1-tuples.
        """
        results = []
        features = cols + ["매출수량"]
        # Labels: TEST_00+1일 ... TEST_00+<predict>일
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

        for group_key, store_test in test_df.groupby(['영업장명_메뉴명']):
            # Only menus that have a trained model are predicted.
            bundle = self._find_bundle(trained_models, group_key)
            if bundle is None:
                continue
            store_menu = self._menu_name(group_key)

            model = bundle['model']
            encoders = bundle['encoders']
            features_scaler = bundle['features_scaler']
            target_scaler = bundle['target_scaler']

            # Add the derived variables, then keep the most recent `lookback` rows.
            mv = Make_Variables()
            store_test = mv.make_variables_train(data = store_test)
            store_test_sorted = store_test.sort_values('영업일자')
            if len(store_test_sorted) < lookback:
                continue  # not enough history for one input window

            # Only these rows reach the model, so no future-date frame is built.
            recent_df = store_test_sorted[features].iloc[-lookback:].copy()

            # Encoding / scaling with the objects fitted at training time.
            for col in enc_cols:
                if col not in recent_df.columns:
                    continue
                if col in encoders:
                    recent_df[col] = self._encode_like_training(encoders[col], recent_df[col])
                else:
                    recent_df[col] = recent_df[col].astype(int)
            if num_cols and features_scaler is not None:
                recent_df[num_cols] = features_scaler.transform(recent_df[num_cols])

            x_window = recent_df[cols].to_numpy(dtype=np.float32)

            # Run on the device the model lives on instead of relying on a global.
            try:
                model_device = next(model.parameters()).device
            except (StopIteration, AttributeError):
                model_device = torch.device("cpu")
            x_input = torch.from_numpy(x_window).unsqueeze(0).to(model_device)

            with torch.no_grad():
                pred_scaled = model(x_input).reshape(-1).cpu().numpy()

            # Undo the target scaling in one call on a single-column array, then clip at 0.
            scaled_col = pred_scaled[:predict].astype(np.float64).reshape(-1, 1)
            restored = np.clip(target_scaler.inverse_transform(scaled_col).ravel(), 0, None)

            for d, val in zip(pred_dates, restored):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출수량(lstm)': float(val)
                })

        return pd.DataFrame(results)

#### 예측값 생성

In [None]:
import re

# Accumulators per model family (only the LSTM list is filled in this cell)
all_preds_class, all_preds_reg, all_preds_lstm = [], [], []

# Walk over every TEST_*.csv in sorted order
test_files = glob.glob('DATA/test/TEST_*.csv')
test_files.sort()
predictions = PredictionFunctions()

for path in test_files:
    test_df = pd.read_csv(path)

    # Prefix taken from the file name (e.g. TEST_00)
    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    # LSTM predictions for this test file
    pred_lstm = predictions.predict_lstm(
        test_df = test_df, trained_models = trained_lstm, test_prefix = test_prefix,
        cols = cols, enc_cols = enc_cols, num_cols = num_cols,
    )
    all_preds_lstm.append(pred_lstm)

df_lstm  = pd.concat(all_preds_lstm, ignore_index=True)

In [None]:
# Number of LSTM prediction rows
df_lstm.shape[0]

13510

In [None]:
import re

# Accumulators per model family
all_preds_class, all_preds_reg, all_preds_lstm = [], [], []


# Walk over every TEST_*.csv in sorted order
test_files = glob.glob('DATA/test/TEST_*.csv')
test_files.sort()
predictions = PredictionFunctions()

for path in test_files:
    test_df = pd.read_csv(path)

    # Prefix taken from the file name (e.g. TEST_00)
    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    # Classifier first
    pred_class = predictions.predict_class(
        test_df = test_df, trained_models = models_class, test_prefix = test_prefix,
        cols = cols, enc_cols = enc_cols,
    )
    all_preds_class.append(pred_class)

    # Regressor (its value is blended in later where the classifier says 1)
    pred_reg = predictions.predict_reg(
        test_df = test_df, trained_models = models_reg, test_prefix = test_prefix,
        cols = cols, enc_cols = enc_cols,
    )
    all_preds_reg.append(pred_reg)

    # LSTM
    pred_lstm = predictions.predict_lstm(
        test_df = test_df, trained_models = trained_lstm, test_prefix = test_prefix,
        cols = cols, enc_cols = enc_cols, num_cols = num_cols,
    )
    all_preds_lstm.append(pred_lstm)

    # Blending (weights: roughly 1.5 / 8.5 for now)

df_class = pd.concat(all_preds_class, ignore_index=True)
df_reg   = pd.concat(all_preds_reg, ignore_index=True)
df_lstm  = pd.concat(all_preds_lstm, ignore_index=True)

In [None]:
# Classifier and regressor predictions side by side; the regressor column gets a "(reg)" suffix
full_pred_df = (
    df_class
    .merge(df_reg, on=['영업일자', '영업장명_메뉴명'], how='outer')
    .rename(columns={'매출수량': '매출수량(reg)'})
)

# Menu keys that are tuples are unwrapped to their first element
df_lstm_plz = df_lstm.assign(**{
    '영업장명_메뉴명': df_lstm['영업장명_메뉴명'].map(lambda key: key[0] if isinstance(key, tuple) else key)
})

full_pred_df = df_lstm_plz.merge(full_pred_df, on=['영업일자', '영업장명_메뉴명'], how='outer')
full_pred_df.head()

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량(lstm),매출여부,매출수량(reg)
0,TEST_00+1일,느티나무 셀프BBQ_1인 수저세트,9.489268,1,6.983611
1,TEST_00+1일,느티나무 셀프BBQ_BBQ55(단체),0.0,0,47.027332
2,TEST_00+1일,"느티나무 셀프BBQ_대여료 30,000원",5.194102,1,8.765734
3,TEST_00+1일,"느티나무 셀프BBQ_대여료 60,000원",3.209607,1,3.018527
4,TEST_00+1일,"느티나무 셀프BBQ_대여료 90,000원",0.0,1,0.918773


In [None]:
import re

# Count negative sales quantities in every TEST_*.csv
test_files = glob.glob('DATA/test/TEST_*.csv')
test_files.sort()

for path in test_files:
    test_df = pd.read_csv(path)

    negative = int((test_df["매출수량"] < 0).sum())
    print(f"{negative}개의 음수 데이터가 있어요..")

0개의 음수 데이터가 있어요..
0개의 음수 데이터가 있어요..
4개의 음수 데이터가 있어요..
1개의 음수 데이터가 있어요..
0개의 음수 데이터가 있어요..
2개의 음수 데이터가 있어요..
0개의 음수 데이터가 있어요..
1개의 음수 데이터가 있어요..
0개의 음수 데이터가 있어요..
0개의 음수 데이터가 있어요..


In [None]:
# Where the classifier predicts a sale (매출여부 == 1), blend 10% regressor + 90% LSTM;
# everywhere else keep the LSTM value alone
full_pred_df['매출수량'] = full_pred_df['매출수량(lstm)'].where(
    full_pred_df['매출여부'] != 1,
    full_pred_df['매출수량(reg)'] * 0.1 + full_pred_df['매출수량(lstm)'] * 0.9,
)

# Only the blended column is needed from here on
full_pred_df = full_pred_df.drop(columns=['매출여부', '매출수량(reg)', '매출수량(lstm)'])
full_pred_df

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량
0,TEST_00+1일,느티나무 셀프BBQ_1인 수저세트,9.238703
1,TEST_00+1일,느티나무 셀프BBQ_BBQ55(단체),0.000000
2,TEST_00+1일,"느티나무 셀프BBQ_대여료 30,000원",5.551265
3,TEST_00+1일,"느티나무 셀프BBQ_대여료 60,000원",3.190499
4,TEST_00+1일,"느티나무 셀프BBQ_대여료 90,000원",0.091877
...,...,...,...
13505,TEST_09+7일,화담숲카페_메밀미숫가루,26.205309
13506,TEST_09+7일,화담숲카페_아메리카노 HOT,27.769089
13507,TEST_09+7일,화담숲카페_아메리카노 ICE,164.840401
13508,TEST_09+7일,화담숲카페_카페라떼 ICE,24.054512


In [None]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    """
    Pivot long-format predictions into the wide layout of the sample submission.

    pred_df           - columns '영업일자', '영업장명_메뉴명', '매출수량'
    sample_submission - first column '영업일자', remaining columns are menu names
    Returns a copy of sample_submission whose menu columns are float and hold
    the predicted quantity, or 0 where no prediction exists for that cell.
    """
    # (date, menu) -> quantity; a later duplicate overrides an earlier one
    quantity_by_key = {
        (day, menu): qty
        for day, menu, qty in zip(
            pred_df['영업일자'], pred_df['영업장명_메뉴명'], pred_df['매출수량'].astype(float)
        )
    }

    final_df = sample_submission.copy()

    menu_cols = final_df.columns[1:]
    final_df[menu_cols] = final_df[menu_cols].astype(float)

    # Fill one menu column at a time
    days = list(final_df['영업일자'])
    for menu in menu_cols:
        final_df[menu] = np.asarray(
            [quantity_by_key.get((day, menu), 0) for day in days], dtype=float
        )

    return final_df

In [None]:
# Hybrid (classifier / regressor / LSTM blend) submission file
sample_submission = pd.read_csv('DATA/sample_submission.csv')
final_hybrid = convert_to_submission_format(
    pred_df = full_pred_df, sample_submission = sample_submission
)
final_hybrid.to_csv(
    'baseline_submission_hybrid.csv', encoding='utf-8-sig', index=False
)

In [None]:
# LSTM-only submission file: rename the LSTM column to the name the converter expects
sample_submission = pd.read_csv('DATA/sample_submission.csv')
df_lstm_end = df_lstm_plz.rename(columns = {'매출수량(lstm)' : '매출수량'})
final_lstm = convert_to_submission_format(
    pred_df = df_lstm_end, sample_submission = sample_submission
)
final_lstm.to_csv(
    'baseline_submission_lstm.csv', encoding='utf-8-sig', index=False
)

In [None]:
# Variant of the blended predictions in which (near-)zero quantities are replaced by 1
full_pred_df_notzero = full_pred_df.copy()
full_pred_df_notzero['매출수량'] = full_pred_df_notzero['매출수량'].mask(
    full_pred_df_notzero['매출수량'].abs() < 1e-9, 1
)

In [None]:
# Hybrid submission file built from the zero-free predictions
sample_submission = pd.read_csv('DATA/sample_submission.csv')
final_hybrid = convert_to_submission_format(
    pred_df = full_pred_df_notzero, sample_submission = sample_submission
)
final_hybrid.to_csv(
    'baseline_submission_hybrid_notzero.csv', encoding='utf-8-sig', index=False
)