In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import datetime
from datetime import timedelta
# !pip install holidays
import holidays

os.chdir("/content/drive/MyDrive/3. Grad School/LG Aimers")

In [None]:
# 데이터 불러오기
data = pd.read_csv("DATA/train/train.csv")

#### 파생변수 생성

In [2]:
# 월(month) -> 계절 매핑 딕셔너리
month_to_season = {
    1: "Winter", 2: "Winter", 12: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Autumn", 10: "Autumn", 11: "Autumn"}

season_weights = {
    "Winter" : 11.4,
    "Spring": 6.5,
    "Summer" : 6.3,
    "Autumn" : 18.4}

# 월별 가중치 매핑
monthly_weights = {
    1: 2.2, 2: 1.8, 3: 0.3,
    4: 1.01, 5: 0.7, 6: 0.8,
    7: 0.5, 8: 0.5, 9: 0.8,
    10: 1.55, 11: 1.03, 12: 1.4}

# 요일별 가중치 매핑
weekly_weights = {
    "Monday": 0.78, "Tuesday": 0.85, "Wednesday": 0.81,
    "Thursday": 9.9, "Friday": 1.2, "Saturday": 1.53,
    "Sunday": 1.3}

In [98]:
class Make_Variables():
        def __init__(self, data = None, date = None, predict = 7, month_to_season = None, monthly_weights = None, weekly_weights = None):
            self.data = data
            self.date = date
            self.predict = predict
            self.month_to_season = month_to_season
            self.monthly_weights = monthly_weights
            self.weekly_weights = weekly_weights

        def update_kor_holidays(self):
            """국경일 추가"""
            kor_holidays = holidays.KR(years = [2023, 2024, 2025])
            kor_holidays.update({
                datetime.date(2024,10,1) : "Temporary Holiday", # 국군의 날 임시공휴일
                datetime.date(2025,1,27) : "Temporary Holiday", # 설날 임시공휴일
                datetime.date(2025,3,3) : "Temporary Holiday", # 삼일절 대체공휴일
                datetime.date(2025, 5, 29) : "Election Period",
                datetime.date(2025, 5, 30) : "Election Period",
                datetime.date(2025, 6, 3) : "Presidential Election Day"})
            return kor_holidays

        def check_holidays(self, date, kor_holidays) -> int:
            """날짜 받아서 공휴일/주말 여부 출력"""
            # date = pd.Timestamp(date)
            if isinstance(date, pd.Series):
                check_holiday = date.dt.date.isin(kor_holidays)
                check_weekend = date.dt.weekday >= 5
            else:
                check_holiday = date.date() in kor_holidays
                check_weekend = date.weekday() >= 5
            is_holiday = (check_holiday | check_weekend)
            return is_holiday

        def get_sandwich_score(self, data, is_holiday_col) -> pd.DataFrame:
            """데이터프레임 기준으로 샌드위치 점수 계산"""
            data = data.reset_index(drop = True)
            data['is_sandwich'] = 0
            is_holiday = data[is_holiday_col].astype(int)
            for idx in range(len(data)):
                if idx == 0 or idx == len(data) - 1: # 첫날, 마지막 날
                    continue

                # 앞/뒤 하루씩 봤을 때 모두 휴일 -> 5점
                if (is_holiday.iloc[idx - 1] == 1) and (is_holiday.iloc[idx + 1] == 1): # 하루 전이랑 다음 날이 공휴일이면
                    data.iloc[idx, data.columns.get_loc('is_sandwich')] = 5

                # 앞/뒤 이틀씩 봤을 때 휴일 3일 -> 3점, 2일 -> 2점
                elif idx > 1 and idx < len(is_holiday) - 2: # 셋째날, 마지막에서 세 번째 날
                    start_idx = idx - 2
                    end_idx = idx + 2
                    nearby_holidays = (is_holiday.iloc[start_idx : end_idx + 1].sum() - is_holiday.iloc[idx])
                    if nearby_holidays == 3:
                        data.iloc[idx, data.columns.get_loc('is_sandwich')] = 3
                    elif nearby_holidays == 2:
                        data.iloc[idx, data.columns.get_loc('is_sandwich')] = 2
                    else:
                        data.iloc[idx, data.columns.get_loc('is_sandwich')] = 0
            return data

        def get_sandwich_score_for_dates(self, date, kor_holidays) -> int:
            """특정 날짜를 받아와서 앞뒤 날짜를 구하고, 샌드위치 점수 계산"""
            # 하루씩
            prev_date, next_date = date - timedelta(days = 1), date + timedelta(days = 1)
            prev_hol, next_hol = self.check_holidays(prev_date, kor_holidays), self.check_holidays(next_date, kor_holidays) # T/F Bool
            if prev_hol and next_hol: # 바로 다음 날들이 휴일이라면
                return 3
            days_offsets = [-2, -1, 1, 2] # 앞뒤로 이틀 살펴보기
            nearby_holidays = sum(self.check_holidays(date + timedelta(days = d), kor_holidays) for d in days_offsets)
            if nearby_holidays == 3: # 앞뒤 4일 중에 3일이 휴일이면
                return 2
            elif nearby_holidays == 2: # 앞뒤 4일 중에 2일이 휴일이면
                return 2
            else:
                return 0

        def get_season_weights(self, data = None, season_weights = season_weights):
            """계절별 가중치 부여"""
            # 데이터프레임 들어오면
            if data is not None:
                data['season_weight'] = data['season'].map(season_weights)
                return data

        def get_month_weights(self, data = None, monthly_weights = monthly_weights):
            """월별 가중치 부여"""
            # 데이터프레임 들어오면
            if data is not None:
                data['month_weight'] = data['month'].map(monthly_weights)
                return data

        def get_week_weights(self, data = None, weekly_weights = weekly_weights):
            """요일별 가중치 부여"""
            # 데이터프레임 들어오면
            if data is not None:
                data['week_weight'] = data['weekday'].map(weekly_weights)
                return data

        def get_prev_days(self, data, test_df = None, date = None, menu = None, howmany = 7):
            """
            일요일 날짜 받아와서 직전 주차의 일-토 매출수량 평균 계산
            주의 - test data에서 생성할 때는 참고할 데이터와 붙여넣을 데이터가 다름
            data : 참고할 데이터
            test_df : 참고할 데이터
            """
            if test_df is None:
                # 혹시 모르니까 검증
                if date.weekday() == 6:
                    # 이전 날짜들
                    prev_start = date - timedelta(days = howmany)
                    prev_end = date - timedelta(days = 1)
                    prev_data = data[(data['영업일자'] >= prev_start) & (data['영업일자'] <= prev_end) & (data['영업장명_메뉴명'] == menu)]
                    prev_avg = prev_data['매출수량'].mean()
                    prev_sd = prev_data['매출수량'].std()
                    # 첫 주 0으로 처리
                    if pd.isna(prev_avg):
                        prev_avg = 0
                    if pd.isna(prev_sd):
                        prev_sd = 0
                    week_end = date + timedelta(days = 6)
                    curr_mask = (data['영업일자'] >= date) & (data['영업일자'] <= week_end) & (data['영업장명_메뉴명'] == menu)
                    colname_mean = f"prev_avg_{howmany}"
                    colname_sd = f"prev_sd_{howmany}"
                    data.loc[curr_mask, colname_mean] = prev_avg
                    data.loc[curr_mask, colname_sd] = prev_sd
                    return data
                else:
                    return np.nan

            # test data라면
            else:
                # 혹시 모르니까 검증
                if date.weekday() == 6:
                    # 이전 날짜들
                    prev_start = date - timedelta(days = howmany)
                    prev_end = date - timedelta(days = 1)
                    prev_data = test_df[(test_df['영업일자'] >= prev_start) & (test_df['영업일자'] <= prev_end) & (test_df['영업장명_메뉴명'] == menu)]
                    prev_avg = prev_data['매출수량'].mean()
                    prev_sd = prev_data['매출수량'].std()
                    # 첫 주 0으로 처리
                    if pd.isna(prev_avg):
                        prev_avg = 0
                    if pd.isna(prev_sd):
                        prev_sd = 0
                    week_end = date + timedelta(days = 6)
                    curr_mask = (data['영업일자'] >= date) & (data['영업일자'] <= week_end) & (data['영업장명_메뉴명'] == menu)
                    colname_mean = f"prev_avg_{howmany}"
                    colname_sd = f"prev_sd_{howmany}"
                    data.loc[curr_mask, colname_mean] = prev_avg
                    data.loc[curr_mask, colname_sd] = prev_sd
                    return data
                else:
                    return np.nan

        def get_means(self, data, original_data = None):
            # train 단계일 때
            if original_data is None:
                store_avg = data.groupby("영업장명_메뉴명")['매출수량'].mean()
                menu_avg = data.groupby("영업장명")['매출수량'].mean()
                return store_avg, menu_avg
            else:
                merged = pd.concat([data, original_data], axis = 0, ignore_index = True)

                store_avg = merged.groupby("영업장명_메뉴명")['매출수량'].mean()
                menu_avg = merged.groupby("영업장명")['매출수량'].mean()
                return store_avg, menu_avg

        def get_seasonal(self, data):
            # 영업장별
            store_season = (data.groupby(['영업장명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'store_season'}))
            data = data.merge(store_season, on = ['영업장명', 'season'], how = 'left')
            data['store_season_ratio'] = data['store_season'] / data['store_avg']

            # 메뉴별
            menu_season = (data.groupby(['영업장명_메뉴명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'menu_season'}))
            data = data.merge(menu_season, on = ['영업장명_메뉴명', 'season'], how = 'left')
            data['menu_season_ratio'] = data['menu_season'] / data['menu_avg']

            return data

        #########################################################################
        def add_features(self, data, one = False, two = False, three = False):
            """
            one - 전체 평균 vs 업장별 평균
            two - 전체 평균 vs 메뉴별 평균
            three - 업장별 평균 vs 메뉴별 평균
            """
            total_avg = data['매출수량'].mean()
            store_avg = data.groupby('영업장명')['매출수량'].mean()
            menu_avg = data.groupby('영업장명_메뉴명')['매출수량'].mean()

            if one:
                # 0/1로 표현
                data['store_feat'] = data['store_avg'] > total_avg
                # 비율로 표현
                data['store_ratio'] = data['store_avg'] / total_avg

            if two:
                # 0/1로 표현
                data['menu_feat'] = data['menu_avg'] > total_avg
                # 비율로 표현
                data['menu_ratio'] = data['menu_avg'] / total_avg

            if three:
                # 0/1로 표현
                data['store_menu_feat'] = data['menu_avg'] > data['store_avg']
                # 비율로 표현
                data['store_menu_ratio'] = data['menu_avg'] /  data['store_avg']

            return data
        #########################################################################

        # train, test 공통
        def make_fund_variables(self, data, month_to_season = month_to_season):
            # 영업일자 -> datetime
            data['영업일자'] = pd.to_datetime(data['영업일자'])

            # 연, 월, 일, 요일 분리
            data['year'] = data['영업일자'].dt.year
            data['month'] = data['영업일자'].dt.month
            data['day'] = data['영업일자'].dt.day
            data['weekday'] = data['영업일자'].dt.day_name()
            data['weekday_enc'] = data['영업일자'].dt.weekday

            # 계절 변수 생성
            data['season'] = data['month'].map(month_to_season)

            # 연도 차이 변수 생성
            data['year_enc'] = data['year'] - 2023

            # 월, 일, 요일 사이클릭 변환
            data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
            data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

            data['day_sin'] = np.sin(2 * np.pi * data['day'] / 31)
            data['day_cos'] = np.cos(2 * np.pi * data['day'] / 31)

            data['weekday_sin'] = np.sin(2 * np.pi * data['weekday_enc'] / 7)
            data['weekday_cos'] = np.cos(2 * np.pi * data['weekday_enc'] / 7)

            # 공휴일 확인
            kor_holidays = self.update_kor_holidays()
            check_holiday = data['영업일자'].dt.date.isin(kor_holidays)
            check_weekend = data['weekday'].isin(['Saturday', 'Sunday'])
            data['is_holiday'] = (check_holiday | check_weekend).astype(int) # 공휴일 + 주말
            data['holiday_name'] = data['영업일자'].dt.date.map(kor_holidays)

            ### 영업장명, 메뉴명 분리
            if '영업장명_메뉴명' in data.columns:
                data[['영업장명', '메뉴명']] = data['영업장명_메뉴명'].str.split('_', expand = True)

            ### 단체 변수
            group_words = ["단체", "6인석", "12인석", "2인", "4인", "3인"]
            mask = data['메뉴명'].astype(str).apply(lambda x : any(k in x for k in group_words))
            data['group'] = mask.astype(int)

            ### 계절 영향 추가
            data = self.get_seasonal(data)

            return data

        # train의 입력 데이터 + lookback 28일 데이터
        def make_variables_train(self, data, one = True, two = True, three = True):
            data = self.make_fund_variables(data)
            kor_holidays = self.update_kor_holidays()

            ### 샌드위치 데이
            data = self.get_sandwich_score(data, 'is_holiday')

            # 샌드위치 - 첫날
            first = data['영업일자'].min()
            data.loc[data['영업일자'] == first, 'is_sandwich'] = self.get_sandwich_score_for_dates(first, kor_holidays)
            second = data['영업일자'].min() + timedelta(days = 1)
            data.loc[data['영업일자'] == second, 'is_sandwich'] = self.get_sandwich_score_for_dates(second, kor_holidays)

            # 샌드위치 - 마지막 날
            last = data['영업일자'].max()
            data.loc[data['영업일자'] == last, 'is_sandwich'] = self.get_sandwich_score_for_dates(last, kor_holidays)
            before = data['영업일자'].max() - timedelta(days = 1)
            data.loc[data['영업일자'] == before, 'is_sandwich'] = self.get_sandwich_score_for_dates(before, kor_holidays)

            # 서브웨이 샌드위치 포함한 공휴일
            data['is_holiday_sandwich'] = data['is_holiday'].astype(int) | (data['is_sandwich'] > 0).astype(int)

            ### 계절별 가중치
            data = self.get_season_weights(data, season_weights)

            ### 월별 가중치
            data = self.get_month_weights(data, monthly_weights)

            ### 요일별 가중치
            data = self.get_week_weights(data, weekly_weights)

            ### 직전 주차 평균
            sundays = data[data['weekday'] == "Sunday"][["영업일자", "영업장명_메뉴명"]].copy()
            for _, row in sundays.iterrows():
                date = row['영업일자']
                menu = row['영업장명_메뉴명']
                data = self.get_prev_days(data = data, date = date, menu = menu, howmany = 7)
                data = self.get_prev_days(data = data, date = date, menu = menu, howmany = 14)
                data = self.get_prev_days(data = data, date = date, menu = menu, howmany = 21)

            ### 음수 처리
            negative = data[data['매출수량'] < 0]

            for idx, row in negative.iterrows():
                num = row['매출수량']
                if num < -10:
                    date = row['영업일자']
                    menu = row['영업장명_메뉴명']
                    prev_date = pd.to_datetime(date) - pd.Timedelta(days = 1)
                    prev_row = data[(data['영업일자'] == prev_date) & (data['영업장명_메뉴명'] == menu)]

                    if prev_row.iloc[0]["매출수량"] >= abs(num):
                        data.loc[prev_row.index[0], '매출수량'] += num

            # 남은 건 전부 0으로
            data.loc[data['매출수량'] < 0, '매출수량'] = 0

            # 평균 매핑 - data에 test_df 들어가고, original_data에 기존 train에 사용한 데이터 넣기
            store_avg, menu_avg = self.get_means(data, original_data = None)
            data['store_avg'] = data['영업장명_메뉴명'].map(store_avg)
            data['menu_avg'] = data['영업장명'].map(menu_avg)

            # 시도해보기
            data = add_features(data, one, two, three)

            return data

        # 예측하고자 하는 날들
        def make_variables_test(self, date, test_df, original_data = None, predict = 7, one = True, two = True, three = True):
            """
            date : 최종 날짜 (입력 7일 중 가장 마지막) - TimeStamp
            test_df : 예측할 때 참고해올 데이터 -> 이거로 직전 주차 평균 생성
            original_data -> train에 사용한 데이터 (data로 저장)
            """
            date = pd.to_datetime(date)
            future_dates = [date + timedelta(days = i + 1) for i in range(predict)]
            future_df = pd.DataFrame({'영업일자' : future_dates})

            menus_df = (test_df[['영업장명_메뉴명']].drop_duplicates().reset_index(drop = True))
            future_df = future_df.merge(menus_df, how='cross')

            kor_holidays = self.update_kor_holidays()

            # 기본적인 변수들
            future_df = self.make_fund_variables(future_df)

            future_df['영업일자'] = pd.to_datetime(future_df['영업일자']).dt.normalize()

            # 샌드위치
            future_df['is_sandwich'] = future_df['영업일자'].apply(lambda d: self.get_sandwich_score_for_dates(d, kor_holidays))

            # 샌드위치 포함한 공휴일
            future_df['is_holiday_sandwich'] = future_df['is_holiday'].astype(int) | (future_df['is_sandwich'] > 0).astype(int)

            ### 계절별 가중치
            future_df = self.get_season_weights(future_df, season_weights)

            # 월별 가중치
            future_df = self.get_month_weights(future_df, monthly_weights)

            # 요일별 가중치
            future_df = self.get_week_weights(future_df, weekly_weights)

            # 직전 주차 평균 -> 이거는 test 까지 받아오고 생각해야 함..
            sundays =  future_df.loc[future_df['weekday'] == "Sunday", ['영업일자', '영업장명_메뉴명']].copy()
            for _, row in sundays.iterrows():
                date = row['영업일자']
                menu = row['영업장명_메뉴명']
                future_df = self.get_prev_days(data = future_df, test_df = test_df, date = date, menu = menu, howmany = 7)
                future_df = self.get_prev_days(data = future_df, test_df = test_df, date = date, menu = menu, howmany = 14)
                future_df = self.get_prev_days(data = future_df, test_df = test_df, date = date, menu = menu, howmany = 21)

            # 앞서 계산한 값들로 매핑
            store_avg, menu_avg = self.get_means(future_df, original_data = original_data)
            future_df['store_avg'] = future_df['영업장명_메뉴명'].map(store_avg)
            future_df['menu_avg'] = future_df['영업장명'].map(menu_avg)

            # 전체 평균 계산해서 변수 추가 ############################################################
            total_df = pd.concat([test_df, original_data], axis = 0, ignore_index = True)
            total_avg = total_df['매출수량'].mean()

            if one:
                # 0/1로 표현
                future_df['store_feat'] = (future_df['store_avg'] > total_avg).astype(int)
                # 비율로 표현
                future_df['store_ratio'] = future_df['store_avg'] / total_avg

            if two:
                # 0/1로 표현
                future_df['menu_feat'] = (future_df['menu_avg'] > total_avg).astype(int)
                # 비율로 표현
                future_df['menu_ratio'] = future_df['menu_avg'] / total_avg

            if three:
                # 0/1로 표현
                future_df['store_menu_feat'] = (future_df['menu_avg'] > future_df['store_avg']).astype(int)
                # 비율로 표현
                future_df['store_menu_ratio'] = future_df['menu_avg'] /  future_df['store_avg']

            return future_df

In [None]:
# 그냥 전부 만들면 돼
mv = Make_Variables()
data = mv.make_variables_train(data = data)

In [11]:
import pickle
data.to_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data_all.pickle")

#### 영업장별, 메뉴별 특징

In [81]:
data['메뉴명'].unique()

array(['1인 수저세트', 'BBQ55(단체)', '대여료 30,000원', '대여료 60,000원',
       '대여료 90,000원', '본삼겹 (단품,실내)', '스프라이트 (단체)', '신라면', '쌈야채세트', '쌈장',
       '육개장 사발면', '일회용 소주컵', '일회용 종이컵', '잔디그늘집 대여료 (12인석)',
       '잔디그늘집 대여료 (6인석)', '잔디그늘집 의자 추가', '참이슬 (단체)', '친환경 접시 14cm',
       '친환경 접시 23cm', '카스 병(단체)', '콜라 (단체)', '햇반', '허브솔트', '(단체) 공깃밥',
       '(단체) 생목살 김치전골 2.0', '(단체) 은이버섯 갈비탕', '(단체) 한우 우거지 국밥',
       '(단체) 황태해장국 3/27까지', '(정식) 된장찌개', '(정식) 물냉면 ', '(정식) 비빔냉면',
       '(후식) 된장찌개', '(후식) 물냉면', '(후식) 비빔냉면', '갑오징어 비빔밥', '갱시기', '공깃밥',
       '꼬막 비빔밥', '느린마을 막걸리', '담하 한우 불고기', '담하 한우 불고기 정식', '더덕 한우 지짐',
       '들깨 양지탕', '라면사리', '룸 이용료', '메밀면 사리', '명인안동소주', '명태회 비빔냉면',
       '문막 복분자 칵테일', '봉평메밀 물냉면', '생목살 김치찌개', '스프라이트', '은이버섯 갈비탕', '제로콜라',
       '참이슬', '처음처럼', '카스', '콜라', '테라', '하동 매실 칵테일', '한우 떡갈비 정식',
       '한우 미역국 정식', '한우 우거지 국밥', '한우 차돌박이 된장찌개', '황태해장국', 'AUS (200g)',
       'G-Charge(3)', 'Gls.Sileni', 'Gls.미션 서드', 'Open Food',
       '그릴드 비프 샐러드', '까르보나라', '모둠 해산물 플래터', '미션 서드 카베르네 

In [83]:
data.groupby(['영업장명', 'season'])['매출수량'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,매출수량
영업장명,season,Unnamed: 2_level_1
느티나무 셀프BBQ,Autumn,7.957955
느티나무 셀프BBQ,Spring,5.705104
느티나무 셀프BBQ,Summer,8.494921
느티나무 셀프BBQ,Winter,2.347536
담하,Autumn,7.44113
담하,Spring,4.533644
담하,Summer,5.128838
담하,Winter,6.093968
라그로타,Autumn,1.318681
라그로타,Spring,1.17087


In [91]:
store_season = (data.groupby(['영업장명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'store_season'}))
data = data.merge(store_season, on = ['영업장명', 'season'], how = 'left')

In [92]:
data.head(2)

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,year,month,day,weekday,weekday_enc,season,year_enc,...,store_avg,menu_avg,group,store_feat,store_ratio,menu_feat,menu_ratio,store_menu_feat,store_menu_ratio,store_season
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,2023,1,1,Sunday,6,Winter,0,...,5.088346,5.704887,0,False,0.477623,False,0.535496,True,1.121167,2.347536
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,2023,1,2,Monday,0,Winter,0,...,5.088346,5.704887,0,False,0.477623,False,0.535496,True,1.121167,2.347536


In [90]:
data.groupby(['영업장명_메뉴명', 'season'])['매출수량'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,매출수량
영업장명_메뉴명,season,Unnamed: 2_level_1
느티나무 셀프BBQ_1인 수저세트,Autumn,7.219780
느티나무 셀프BBQ_1인 수저세트,Spring,3.815217
느티나무 셀프BBQ_1인 수저세트,Summer,8.915888
느티나무 셀프BBQ_1인 수저세트,Winter,2.626667
느티나무 셀프BBQ_BBQ55(단체),Autumn,35.296703
...,...,...
화담숲카페_카페라떼 ICE,Winter,0.000000
화담숲카페_현미뻥스크림,Autumn,47.769231
화담숲카페_현미뻥스크림,Spring,26.630435
화담숲카페_현미뻥스크림,Summer,14.364486


In [94]:
menu_season = (data.groupby(['영업장명_메뉴명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'menu_season'}))
data = data.merge(menu_season, on = ['영업장명_메뉴명', 'season'], how = 'left')

In [95]:
data.head(2)

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,year,month,day,weekday,weekday_enc,season,year_enc,...,menu_avg,group,store_feat,store_ratio,menu_feat,menu_ratio,store_menu_feat,store_menu_ratio,store_season,menu_season
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,2023,1,1,Sunday,6,Winter,0,...,5.704887,0,False,0.477623,False,0.535496,True,1.121167,2.347536,2.626667
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,2023,1,2,Monday,0,Winter,0,...,5.704887,0,False,0.477623,False,0.535496,True,1.121167,2.347536,2.626667


In [96]:
data['store_season_ratio'] = data['store_season'] / data['store_avg']
data['menu_season_ratio'] = data['menu_season'] / data['menu_avg']

In [None]:
def get_seasonal(self, data):
    # 영업장별
    store_season = (data.groupby(['영업장명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'store_season'}))
    data = data.merge(store_season, on = ['영업장명', 'season'], how = 'left')
    data['store_season_ratio'] = data['store_season'] / data['store_avg']

    # 메뉴별
    menu_season = (data.groupby(['영업장명_메뉴명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'menu_season'}))
    data = data.merge(menu_season, on = ['영업장명_메뉴명', 'season'], how = 'left')
    data['menu_season_ratio'] = data['menu_season'] / data['menu_avg']

    return data

#### 저장된 데이터 불러오기

In [100]:
import pickle
data = pd.read_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data_all.pickle")

In [101]:
group_words = ["단체", "6인석", "12인석", "2인", "4인", "3인"]
mask = data['메뉴명'].astype(str).apply(lambda x : any(k in x for k in group_words))
data['group'] = mask.astype(int)

In [102]:
### 피쳐 추가해보기
def add_features(data, one = False, two = False, three = False):
    """
    one - 전체 평균 vs 업장별 평균
    two - 전체 평균 vs 메뉴별 평균
    three - 업장별 평균 vs 메뉴별 평균
    """
    total_avg = data['매출수량'].mean()
    store_avg = data.groupby('영업장명')['매출수량'].mean()
    menu_avg = data.groupby('영업장명_메뉴명')['매출수량'].mean()

    if one:
        # 0/1로 표현
        data['store_feat'] = data['store_avg'] > total_avg
        # 비율로 표현
        data['store_ratio'] = data['store_avg'] / total_avg

    if two:
        # 0/1로 표현
        data['menu_feat'] = data['menu_avg'] > total_avg
        # 비율로 표현
        data['menu_ratio'] = data['menu_avg'] / total_avg

    if three:
        # 0/1로 표현
        data['store_menu_feat'] = data['menu_avg'] > data['store_avg']
        # 비율로 표현
        data['store_menu_ratio'] = data['menu_avg'] /  data['store_avg']

    return data

In [103]:
data = add_features(data, one = True, two = True, three = True)

In [104]:
def get_seasonal(data):
    # 영업장별
    store_season = (data.groupby(['영업장명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'store_season'}))
    data = data.merge(store_season, on = ['영업장명', 'season'], how = 'left')
    data['store_season_ratio'] = data['store_season'] / data['store_avg']

    # 메뉴별
    menu_season = (data.groupby(['영업장명_메뉴명', 'season'])['매출수량'].mean().reset_index().rename(columns = {'매출수량' : 'menu_season'}))
    data = data.merge(menu_season, on = ['영업장명_메뉴명', 'season'], how = 'left')
    data['menu_season_ratio'] = data['menu_season'] / data['menu_avg']

    return data

In [105]:
data = get_seasonal(data)

In [106]:
data.to_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data_all_123_seasonal.pickle")

In [110]:
data = pd.read_pickle("/content/drive/MyDrive/3. Grad School/LG Aimers/DATA/train_data_all_123_seasonal.pickle")

In [108]:
cols =  ["year_enc", "month_sin", "month_cos", "day_sin", "day_cos", "weekday_sin", "weekday_cos",
         "season", "is_holiday", "is_sandwich", "is_holiday_sandwich", "season_weight", "month_weight", "week_weight",
         "prev_avg_7", "prev_avg_14", "prev_avg_21", "prev_sd_7", "prev_sd_14", "prev_sd_21",
         "영업장명", "메뉴명", "store_avg", "menu_avg", "group"]

cols = cols + ["store_feat", "store_ratio"]
cols = cols + ["menu_feat", "menu_ratio"]
cols = cols + ["store_menu_feat", "store_menu_ratio"]

cols = cols + ['store_season', 'store_season_ratio', 'menu_season', 'menu_season_ratio']

enc_cols = ["season", "영업장명", "메뉴명"]

#### [Regression] XGBoost

In [8]:
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Callable, Optional
from itertools import product
from xgboost import XGBRegressor,  callback
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
! pip install category_encoders
from category_encoders import TargetEncoder
import joblib

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [112]:
class Reg_XGBoost():
    def __init__(self, horizon = 7, lookback = 28, early_stopping = 50, embargo = 7, verbose = True):
        self.horizon = horizon
        self.lookback = lookback
        self.early_stopping = early_stopping
        self.embargo = embargo
        self.verbose = verbose

    def smape_score(self, y_true, y_pred, eps = 1e-8):
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        denom = np.abs(y_true) + np.abs(y_pred)
        diff = np.abs(y_true - y_pred)
        term = np.zeros_like(denom)
        mask = denom > eps
        term[mask] = diff[mask] / denom[mask]

        return 2 * np.mean(term)

    def tuning_cv(self, data : pd.DataFrame, cols : List[str], enc_cols : List[str] = None, param_grid = None, eval_metric = "rmse") -> Dict:
        """
        전체 데이터 기반 TimeSeries CV 진행
        """
        best_params = {}
        data = data.sort_values("영업일자").reset_index(drop = True)

        x = data[cols]
        y = data["매출수량"]

        if param_grid is None:
            param_grid = {
                "min_child_weight" : [1, 3, 6], # 상진님 튜닝값 추가
                "max_depth" : [4, 5, 6, 10],
                "subsample" : [0.653, 0.7, 0.9], # 상진님 튜닝값 추가
                "learning_rate" : [0.0074, 0.03, 0.05]} # 상진님 튜닝값 추가

        keys = list(param_grid.keys())
        combos = list(product(*[param_grid[k] for k in keys]))
        total_idx = len(combos)


        # 초기값 정의
        best_smape = np.inf
        best_params = {}

        # bast parameters 정의
        base_params = {
            "random_state" : 1488,
            "n_estimators" : 3000,
            "colsample_bytree" : 0.62, # 상진님 튜닝값
            "gamma" : 2.1242208097284327e-07, # 상진님 튜닝값
            "n_jobs" : -1,
            "tree_method": "hist"
        }

        for trial_idx, values in enumerate(combos, start = 1):
            trial_params = dict(zip(keys, values))
            params_trying = {**base_params, **trial_params}

            fold_smapes = []

            for fold, (train_idx, val_idx) in enumerate(tscv.split(x)):

                if self.embargo > 0:
                        val_start = val_idx[0]
                        train_idx = train_idx[train_idx < (val_start - self.embargo)]

                x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
                x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]


                # 범주형 변수 인코딩
                if enc_cols:
                    target_encoder = TargetEncoder()
                    target_encoder.fit(x_train[enc_cols], y_train)

                    x_train = pd.concat([
                        target_encoder.transform(x_train[enc_cols]),
                        x_train[[c for c in cols if c not in enc_cols]]
                    ], axis = 1)

                    x_val = pd.concat([
                        target_encoder.transform(x_val[enc_cols]),
                        x_val[[c for c in cols if c not in enc_cols]]
                    ], axis = 1)

                # 모델 설정
                xgb_model = XGBRegressor(**params_trying, eval_metric = "rmse", early_stopping_rounds=self.early_stopping)

                # 모델 학습
                xgb_model.fit(x_train, y_train,
                                eval_set = [(x_val, y_val)],
                                verbose = False
                )

                pred = xgb_model.predict(x_val)
                smape = self.smape_score(y_val, pred)
                fold_smapes.append(smape)

            if not fold_smapes:
                continue

            mean_smape = float(np.mean(fold_smapes))

            if self.verbose:
                print("===========================")
                print(f"[Trial {trial_idx} / {total_idx}] \n Parameters : {trial_params} \n mean SMAPE : {mean_smape:.3f}")

            if mean_smape < best_smape:
                best_smape = mean_smape
                best_params = params_trying

        if self.verbose:
            print("======================================================")
            print("Best params:", best_params)
            print("Best SMAPE:", best_smape)

        return best_params

    def get_final_model(self, data, cols, enc_cols, hyperparameters : Dict[str, Any])  -> Dict[str, Dict[str, Any]]:
        """
        data - 전체 데이터셋
        cols - 전체 변수들
        enc_cols - 인코딩 진행할 변수들 (범주형)
        """
        data = data.sort_values("영업일자").reset_index(drop=True)

        # 범주형 변수 처리
        if enc_cols:
            target_encoder_full = TargetEncoder()
            data[enc_cols] = target_encoder_full.fit_transform(data[enc_cols], data['매출수량'])
        else:
            target_encoder_full = []

        # x, y 분리
        x_full = data[cols]
        y_full = data["매출수량"].astype(float)

        # 하이퍼파라미터 불러오기
        base_params = {
            "random_state" : 1488,
            "n_estimators" : 3000,
            "colsample_bytree" : 0.62, # 상진님 튜닝값
            "gamma" : 2.1242208097284327e-07, # 상진님 튜닝값
            "n_jobs" : -1,
            "tree_method": "hist"
        }
        base_params.update(hyperparameters)

        n = len(data)
        val_size = max(int(n * 0.15), 28)
        train_end = n - val_size

        x_train, y_train = x_full.iloc[:train_end], y_full.iloc[:train_end]
        x_val, y_val = x_full.iloc[train_end:], y_full.iloc[train_end:]

        # 모델 설정
        xgb_model_full = XGBRegressor(**base_params, eval_metric = "rmse", early_stopping_rounds = self.early_stopping)

        # 모델 학습
        xgb_model_full.fit(x_train, y_train,
                            eval_set = [(x_val, y_val)],
                            verbose = False
        )

        best_iter = getattr(xgb_model_full, "best_iteration", None)
        if best_iter is None:
            best_iter = getattr(xgb_model_full, "best_ntree_limit", None)

        if best_iter is not None:
            best_n = int(best_iter) + 1
            params_refit = base_params.copy()
            params_refit["n_estimators"] = best_n
            final_model = XGBRegressor(**params_refit)
            used_params = params_refit

        else:
            final_model = xgb_model_full
            used_params = base_params

        final_model.fit(x_full, y_full)

        final_bundle = {
            "model" : final_model,
            "hyperparameters" : used_params,
            "encoder" : target_encoder_full,
            "cols" : cols,
            "enc_cols" : enc_cols
        }

        return final_bundle

    def fit_whole_model(self, data, cols, enc_cols) -> Dict[str, Dict[str, Any]]:
        print("튜닝 시작..")
        best_params = self.tuning_cv(data = data, cols = cols, enc_cols = enc_cols)
        print("튜닝 완료!")
        models = self.get_final_model(data = data, cols = cols, enc_cols = enc_cols, hyperparameters = best_params)
        return models

    def save_reg_model(self, models, model_path):
        joblib.dump(models, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        models = joblib.load(model_path)
        return models

    def predict_reg_model(self, test_df, trained_model, test_prefix : str, cols : list, enc_cols : list, data, lookback = 28, predict = 7):
        """
        Input : test_df - test data, trained_model - {~~}, original_data : train할 때 사용한 데이터 (data)
        Output : [영업일자, 영업장명_메뉴명, 매출수량] DataFrame
        """
        results = []

        # 모델 불러오기
        model = trained_model["model"]
        encoder = trained_model["encoder"]

        # 변수 추가하기
        mv = Make_Variables()

        for store_menu, store_df in test_df.groupby(['영업장명_메뉴명']):

            store_df['영업일자'] = pd.to_datetime(store_df['영업일자'])
            store_df_sorted = store_df.sort_values('영업일자')
            last_date = store_df_sorted['영업일자'].iloc[-1]

            future_df = mv.make_variables_test(date = last_date, test_df = store_df, original_data = data, predict = 7,
                                               one = True, two = True, three = True)

            if enc_cols:
                future_df[enc_cols] = encoder.transform(future_df[enc_cols])

            # 사용할 변수만
            future_df = future_df[cols]

            # 로그 변환 처리
            predicted = model.predict(future_df)

            # 예측일자: TEST_00+1일 ~ TEST_00+7일
            pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

            for d, val in zip(pred_dates, predicted):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출수량': val
                })

        return pd.DataFrame(results)

In [113]:
regression = Reg_XGBoost()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 11/xgboost_reg_global.pkl'

models = regression.fit_whole_model(data = data, cols = cols, enc_cols = enc_cols)
regression.save_reg_model(models, model_path)

튜닝 시작..
[Trial 1 / 108] 
 Parameters : {'min_child_weight': 1, 'max_depth': 4, 'subsample': 0.653, 'learning_rate': 0.0074} 
 mean SMAPE : 1.295


KeyboardInterrupt: 

In [16]:
# 다시 불러오기
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 10/xgboost_reg_global.pkl'
regression = Reg_XGBoost()
models_reg = regression.load_saved_model(model_path)

#### [Classification] XGBoost

In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
! pip install category_encoders
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score
from typing import Dict, List, Optional, Any
from collections import defaultdict
import pickle
import joblib



마지막 30일 잘라서 validation 돌리기

In [48]:
class Cls_XGBoost():
    def __init__(self, horizon = 7, lookback = 28, early_stopping = 50, embargo = 7, verbose = False):
        self.horizon = horizon
        self.lookback = lookback
        self.early_stopping = early_stopping
        self.embargo = embargo
        self.verbose = verbose

    def fit_model_cv(self, data: pd.DataFrame, cols: List[str], enc_cols: List[str]) -> pd.DataFrame:
        """
        전체 데이터를 기반으로 TimeSeries CV 수행 → OOF 예측 반환
        """
        data = data.sort_values("영업일자").reset_index(drop=True)
        x = data[cols]
        y = data["매출_여부"]

        oof_proba = np.full(len(data), np.nan)

        tscv = TimeSeriesSplit(n_splits=3)
        for fold, (train_idx, val_idx) in enumerate(tscv.split(x)):

            if self.embargo > 0:
                val_start = val_idx[0]
                train_idx = train_idx[train_idx < (val_start - self.embargo)]

            x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
            x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

            if y_train.nunique() < 2:
                print(f"Fold {fold} 학습 불가")
                continue

            # 범주형 인코딩
            te = TargetEncoder()
            te.fit(x_train[enc_cols], y_train)

            x_train_enc = pd.concat([
                te.transform(x_train[enc_cols]),
                x_train[[c for c in cols if c not in enc_cols]]
            ], axis=1)

            x_val_enc = pd.concat([
                te.transform(x_val[enc_cols]),
                x_val[[c for c in cols if c not in enc_cols]]
            ], axis=1)

            # 모델 학습
            model = XGBClassifier(random_state = 1488, early_stopping_rounds = self.early_stopping)
            model.fit(x_train_enc, y_train,
                      eval_set = [(x_val_enc, y_val)],
                      verbose = 0
                      )

            pos_idx = int(np.where(model.classes_ == 1)[0][0]) if hasattr(model, "classes_") else 1
            oof_proba[val_idx] = model.predict_proba(x_val_enc)[:, pos_idx]

        oof_df = pd.DataFrame({
            "y_true": y.values,
            "y_proba": oof_proba
        }, index=data.index)

        return oof_df

    def tune_threshold(self, oof_df: pd.DataFrame) -> float:
        """
        전체 데이터 기준 하나의 threshold 최적화
        """
        y_true = oof_df["y_true"].values
        y_proba = oof_df["y_proba"].values
        mask = ~np.isnan(y_proba)

        best_score, best_thr = -1, 0.5
        for thr in np.linspace(0.3, 0.7, 51):
            y_pred = (y_proba[mask] >= thr).astype(int)
            score = f1_score(y_true[mask], y_pred, zero_division=0)
            if score > best_score:
                best_score = score
                best_thr = thr

        return best_thr

    def tuning_cv(self, data, cols, enc_cols, param_grid = None):
        data = data.sort_values("영업일자").reset_index(drop=True)
        x, y = data[cols], data["매출_여부"]

        if param_grid is None:
            param_grid = {
                "max_depth": [3, 5, 7],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.7, 0.9],
                "colsample_bytree": [0.7, 0.9],
                "min_child_weight": [1, 3]
            }

        keys = list(param_grid.keys())
        combos = list(product(*[param_grid[k] for k in keys]))
        total = len(combos)

        tscv = TimeSeriesSplit(n_splits=3)
        best_score, best_params = -1, None

        for i, vals in enumerate(combos, 1):
            trial_params = dict(zip(keys, vals))
            fold_scores = []

            for train_idx, val_idx in tscv.split(x):
                if self.embargo > 0:
                    val_start = val_idx[0]
                    train_idx = train_idx[train_idx < (val_start - self.embargo)]
                if len(train_idx) == 0:
                    continue

                x_tr, y_tr = x.iloc[train_idx], y.iloc[train_idx]
                x_va, y_va = x.iloc[val_idx], y.iloc[val_idx]

                te = TargetEncoder()
                te.fit(x_tr[enc_cols], y_tr)

                x_tr_enc = pd.concat([
                    te.transform(x_tr[enc_cols]),
                    x_tr[[c for c in cols if c not in enc_cols]]
                ], axis=1)
                x_va_enc = pd.concat([
                    te.transform(x_va[enc_cols]),
                    x_va[[c for c in cols if c not in enc_cols]]
                ], axis=1)

                model = XGBClassifier(
                    random_state=1488,
                    eval_metric="logloss",
                    n_estimators=1000,
                    early_stopping_rounds=self.early_stopping,
                    **trial_params
                )
                model.fit(x_tr_enc, y_tr,
                          eval_set=[(x_va_enc, y_va)],
                          verbose=0)

                pos_idx = int(np.where(model.classes_ == 1)[0][0])
                y_pred = (model.predict_proba(x_va_enc)[:, pos_idx] >= 0.5).astype(int)
                f1 = f1_score(y_va, y_pred, zero_division=0)
                fold_scores.append(f1)

            if fold_scores:
                mean_f1 = np.mean(fold_scores)
                if self.verbose:
                    print(f"[{i}/{total}] params={trial_params}, f1={mean_f1:.3f}")
                if mean_f1 > best_score:
                    best_score, best_params = mean_f1, trial_params

        if self.verbose:
            print("Best params:", best_params, "Best F1:", best_score)
        return best_params

    def get_final_model(self, data, cols, enc_cols, threshold, hyperparameters: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        최종 글로벌 모델 학습
        """
        data = data.sort_values("영업일자").reset_index(drop=True)

        te = TargetEncoder()
        data[enc_cols] = te.fit_transform(data[enc_cols], data["매출_여부"])

        x_full = data[cols]
        y_full = data["매출_여부"]

        if hyperparameters is None:
            hyperparameters = {}

        model = XGBClassifier(random_state = 1488, **hyperparameters)
        model.fit(x_full, y_full)

        bundle = {
            "model": model,
            "encoder": te,
            "threshold": threshold,
            "cols": cols,
            "enc_cols": enc_cols
        }
        return bundle

    def fit_whole_model(self, data, cols, enc_cols) -> Dict[str, Any]:
        oof_df = self.fit_model_cv(data, cols, enc_cols)
        print("CV 완료!")

        threshold = self.tune_threshold(oof_df)
        print(f"Threshold 탐색 완료! 최적 threshold = {threshold:.2f}")

        parameters = self.tuning_cv(data, cols, enc_cols)
        print(f"Hyperparameter 튜닝 완료! 최적 Hyperparamter : {parameters}")

        model_bundle = self.get_final_model(data, cols, enc_cols, threshold)
        return model_bundle

    def save_cls_model(self, model_bundle, model_path):
        joblib.dump(model_bundle, model_path)
        print("모델 저장 완료!")

    def load_saved_model(self, model_path):
        model_bundle = joblib.load(model_path)
        return model_bundle

    def predict_class_model(self, test_df, trained_models, test_prefix : str, cols : list, enc_cols : list, data, lookback = 28, predict = 7):
        """
        Input : test_df - test data, trained_model - {~~}, original_data : train할 때 사용한 데이터 (data)
        Output : [영업일자, 영업장명_메뉴명, 매출수량] DataFrame
        """
        results = []

        model = trained_models["model"]
        encoder = trained_models.get("encoder", None)
        threshold = float(trained_models.get("threshold", 0.5))

        mv = Make_Variables()

        for store_menu, store_df in test_df.groupby(['영업장명_메뉴명']):

            # 변수 추가하기
            store_df['영업일자'] = pd.to_datetime(store_df['영업일자'])
            store_df_sorted = store_df.sort_values('영업일자')
            last_date = store_df_sorted['영업일자'].iloc[-1]

            future_df = mv.make_variables_test(date = last_date, test_df = store_df, original_data = data, predict = 7,
                                               one = True, two = True, three = True)

            if enc_cols:
                future_df[enc_cols] = encoder.transform(future_df[enc_cols])

            # 사용할 변수만
            x = future_df[cols]

            if hasattr(model, "classes_"):
                pos_idx = int(np.where(model.classes_ == 1)[0][0])
            else:
                pos_idx = 1

            proba = model.predict_proba(x)[:, pos_idx]
            y_hat = (proba >= threshold).astype(int)


            # 예측일자: TEST_00+1일 ~ TEST_00+7일
            pred_dates = [f"{test_prefix}+{i+1}일" for i in range(predict)]

            for d, val in zip(pred_dates, y_hat):
                results.append({
                    '영업일자': d,
                    '영업장명_메뉴명': store_menu,
                    '매출여부': val
                })

        return pd.DataFrame(results)

In [None]:
classification = Cls_XGBoost()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 10/xgboost_cls.pkl'

data_zero = data.copy()
data_zero['매출_여부'] = data_zero['매출수량'].apply(lambda x:1 if x > 0 else 0)

models = classification.fit_whole_model(data = data_zero, cols = cols, enc_cols = enc_cols)
classification.save_cls_model(models, model_path)

CV 완료!
Threshold 탐색 완료! 최적 threshold = 0.39
Hyperparameter 튜닝 완료! 최적 Hyperparamter : {'max_depth': 5, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 1}
모델 저장 완료!


In [19]:
# 다시 불러오기
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 10/xgboost_cls.pkl'
classification = Cls_XGBoost()
models_cls = classification.load_saved_model(model_path)

#### [Regression] XGBoost - 양수만

In [None]:
regression = Reg_XGBoost()
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 10/xgboost_reg_pos.pkl'

positive = data[data['매출수량'] > 0]
models = regression.fit_whole_model(data = positive, cols = cols, enc_cols = enc_cols)
regression.save_reg_model(models, model_path)

In [20]:
# 다시 불러오기
model_path = '/content/drive/MyDrive/3. Grad School/LG Aimers/Models/Trial 10/xgboost_reg_pos.pkl'
regression = Reg_XGBoost()
models_pos = regression.load_saved_model(model_path)

#### 예측값 생성

In [50]:
import re
import glob

all_preds_reg = []
all_preds_pos = []
all_preds_cls = []

# 모든 test_*.csv 순회
test_files = sorted(glob.glob('DATA/test/TEST_*.csv'))
regression = Reg_XGBoost()
classification = Cls_XGBoost()

for path in test_files:
    test_df = pd.read_csv(path)

    # 파일명에서 접두어 추출 (예: TEST_00)
    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    pred_reg = regression.predict_reg_model(test_df, models_reg, test_prefix, cols, enc_cols = enc_cols, data = data)
    all_preds_reg.append(pred_reg)
    print("일반 회귀 예측 완료 !")

    pred_pos = regression.predict_reg_model(test_df, models_pos, test_prefix, cols, enc_cols = enc_cols, data = data)
    all_preds_pos.append(pred_pos)
    print("양수 데이터 회귀 예측 완료 !")

    pred_cls = classification.predict_class_model(test_df, models_cls, test_prefix, cols, enc_cols = enc_cols, data = data)
    all_preds_cls.append(pred_cls)
    print("분류 예측 완료 !")

df_reg = pd.concat(all_preds_reg, ignore_index = True)
df_pos   = pd.concat(all_preds_pos, ignore_index = True)
df_cls  = pd.concat(all_preds_cls, ignore_index = True)

일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !
일반 회귀 예측 완료 !
양수 데이터 회귀 예측 완료 !
분류 예측 완료 !


In [51]:
full_pred_df = pd.merge(df_cls, df_pos, on=['영업일자', '영업장명_메뉴명'], how='outer')

In [52]:
full_pred_df

Unnamed: 0,영업일자,영업장명_메뉴명,매출여부,매출수량
0,TEST_00+1일,"(느티나무 셀프BBQ_1인 수저세트,)",1,10.211239
1,TEST_00+1일,"(느티나무 셀프BBQ_BBQ55(단체),)",0,102.189827
2,TEST_00+1일,"(느티나무 셀프BBQ_대여료 30,000원,)",1,7.862879
3,TEST_00+1일,"(느티나무 셀프BBQ_대여료 60,000원,)",1,4.409324
4,TEST_00+1일,"(느티나무 셀프BBQ_대여료 90,000원,)",1,1.519170
...,...,...,...,...
13505,TEST_09+7일,"(화담숲카페_메밀미숫가루,)",1,45.699654
13506,TEST_09+7일,"(화담숲카페_아메리카노 HOT,)",1,61.803707
13507,TEST_09+7일,"(화담숲카페_아메리카노 ICE,)",1,108.466278
13508,TEST_09+7일,"(화담숲카페_카페라떼 ICE,)",1,31.803053


In [55]:
full_pred_df["매출수량(up)"] = full_pred_df["매출수량"] * full_pred_df["매출여부"]

In [71]:
full_df_fin = pd.merge(df_reg, full_pred_df[["영업일자", "영업장명_메뉴명", "매출수량(up)"]], on=['영업일자', '영업장명_메뉴명'], how='outer')
full_df_fin

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,매출수량(up)
0,TEST_00+1일,"(느티나무 셀프BBQ_1인 수저세트,)",8.443606,10.211239
1,TEST_00+1일,"(느티나무 셀프BBQ_BBQ55(단체),)",12.941634,0.000000
2,TEST_00+1일,"(느티나무 셀프BBQ_대여료 30,000원,)",6.135452,7.862879
3,TEST_00+1일,"(느티나무 셀프BBQ_대여료 60,000원,)",2.924091,4.409324
4,TEST_00+1일,"(느티나무 셀프BBQ_대여료 90,000원,)",0.518820,1.519170
...,...,...,...,...
13505,TEST_09+7일,"(화담숲카페_메밀미숫가루,)",45.622295,45.699654
13506,TEST_09+7일,"(화담숲카페_아메리카노 HOT,)",60.671352,61.803707
13507,TEST_09+7일,"(화담숲카페_아메리카노 ICE,)",115.263031,108.466278
13508,TEST_09+7일,"(화담숲카페_카페라떼 ICE,)",29.513021,31.803053


In [59]:
full_df_fin["매출수량"] = full_df_fin["매출수량"] * 0.7 + full_df_fin["매출수량(up)"] * 0.3
full_df_fin.drop(columns=["매출수량(up)"], inplace=True)

In [72]:
full_df_fin["매출수량"] = full_df_fin["매출수량"] * 0.6 + full_df_fin["매출수량(up)"] * 0.4
full_df_fin.drop(columns=["매출수량(up)"], inplace=True)

In [73]:
full_df_fin
full_df_fin['영업장명_메뉴명'] = full_df_fin['영업장명_메뉴명'].apply(lambda x: x[0] if isinstance(x, tuple) else x)

In [62]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    # (영업일자, 메뉴) → 매출수량 딕셔너리로 변환
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량'].astype(float)
    ))

    final_df = sample_submission.copy()

    menu_cols = final_df.columns[1:]
    final_df[menu_cols] = final_df[menu_cols].astype(float)

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:  # 메뉴명들
            final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)

    return final_df

In [74]:
full_df_fin_notzero = full_df_fin.copy()
full_df_fin_notzero.loc[full_df_fin_notzero['매출수량'].abs() < 1e-9, '매출수량'] = 1
full_df_fin_notzero.loc[full_df_fin_notzero['매출수량'] < 0, '매출수량'] = 1

sample_submission = pd.read_csv('DATA/sample_submission.csv')
final_hybrid = convert_to_submission_format(full_df_fin_notzero, sample_submission)
final_hybrid.to_csv('xgboost_123_hybrid.csv', index=False, encoding='utf-8-sig')

7/3이 더 성능이 좋았음,,,