In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce

import holidays
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor


import warnings
warnings.filterwarnings('ignore')

In [36]:
train = pd.read_csv(r'./data/train.csv')
test = pd.read_csv(r'./data/test.csv')
international = pd.read_csv(r'./data/international_trade.csv')

In [46]:
test['year'] = test['timestamp'].apply(lambda x: int(x[0:4]))
test['month'] = test['timestamp'].apply(lambda x: int(x[5:7]))
test['day'] = test['timestamp'].apply(lambda x: int(x[8:10]))
test['Weekday'] = pd.to_datetime(test['timestamp']).dt.weekday

In [37]:
def group_season(df):
    df.loc[(df['month'] == 3) | (df['month'] == 4) | (df['month'] == 5), 'season'] = '봄'
    df.loc[(df['month'] == 6) | (df['month'] == 7) | (df['month'] == 8), 'season'] = '여름'
    df.loc[(df['month'] == 9) | (df['month'] == 10) | (df['month'] == 11), 'season'] = '가을'
    df.loc[(df['month'] == 12) | (df['month'] == 1) | (df['month'] == 2), 'season'] = '겨울'
    return df['season']


def holiday(df):
    kr_holidays = holidays.KR()
    df['holiday'] = df.timestamp.apply(lambda x: 'holiday' if x in kr_holidays else 'non-holiday')
    return df['holiday']


def cyclical_feature(df, time=12):
    df['sin_time'] = np.sin(2*np.pi*df.month/time)
    df['cos_time'] = np.cos(2*np.pi*df.month/time)


def post_preprocessing(test, submission):
    idx_list = test[(test['Weekday'] == 6)].index
    submission.loc[idx_list, 'answer'] = 0 # Weekday == 6 (일요일)이면 가격 0원
    submission['answer'] = submission['answer'].apply(lambda x: max(0, x)) # 가격에 음수가 있다면 가격 0원으로 변경
    return submission


# 날짜를 기반으로 주 수확 시기인지를 판단하는 함수를 정의합니다.
def determine_harvest_weight(item, month):
    harvest_times = {
    'TG': {'main': [(10, 1)]},  # 감귤: 10월부터 이듬해 1월까지
    'BC': {'main': [(4, 6), (9, 11)]},  # 브로콜리: 4월-6월, 9월-11월
    'RD': {'main': [(5, 6), (11, 12)]},  # 무: 5월, 11월
    'CR': {'main': [(7, 8), (10, 11)]},  # 당근: 7월-8월, 10월-12월
    'CB': {'main': [(6, 6), (11, 11)]}  # 양배추: 6월, 11월
}
    main_harvest = harvest_times[item]['main']
    for start, end in main_harvest:
        if start <= month <= end:
            return 1
    return 0

In [38]:
class DataPreprocessing:
    def __init__(self, train, test):
        self.train = train
        self.test = test

    @staticmethod
    def label_encode(train):
        categorical_col = train.select_dtypes(include=['object']).columns.tolist()
        print(f"Category columns: {categorical_col}")
        for i in categorical_col:
            le = LabelEncoder()
            train[i] = le.fit_transform(train[i])

        return train

    @staticmethod
    def remove_outliers(train):
        print('Remove outliers')        
        train.loc[(train['Weekday'] == 6) & (train['price(원/kg)'] >= 0), 'price(원/kg)'] = 0
        
        # # item_id에서 평균가격이 낮은 데이터를 0으로 치환
        train.loc[(train['item_id']=="RD_C_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0

        train.loc[(train['item_id']=="BC_C_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0
        train.loc[(train['item_id']=="BC_B_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0
        
        train.loc[(train['item_id']=="CR_E_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0
        train.loc[(train['item_id']=="CR_D_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0

        train.loc[(train['item_id']=="CB_A_S") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0
        
        # item_location에서 평균 가격이 낮은 데이터를 0으로 치환        
        train.loc[(train['item_location']=="CRS") & (train['price(원/kg)'] > 0), ['price(원/kg)', 'supply(kg)']] = 0
                
        return train

    
    @staticmethod
    def preprocessing(data):
        print('Preprocessing Start')
        # time feature
        data['year'] = data['timestamp'].apply(lambda x: int(x[0:4]))
        data['month'] = data['timestamp'].apply(lambda x: int(x[5:7]))
        data['day'] = data['timestamp'].apply(lambda x: int(x[8:10]))
        
        data['Weekday'] = pd.to_datetime(data['timestamp']).dt.weekday
        data['is_weekend'] = data['Weekday'].apply(lambda x: 1 if x >= 6 else 0)
        data['year'] = data['year'] - 2019
        data['season'] = group_season(data)
        data['holiday'] = holiday(data)
        cyclical_feature(data)
        
        # item feature
        data['item_id'] = data.ID.str[0:6]
        
        data['total_value_month'] = data['item_id'] + data['month'].astype(str)
        
        data['item_location'] = data['item']+data['location']
        data['item_corporation'] = data['item']+data['corporation']
        data['item_month_Weekday'] = data['item'].astype(str) + "_" + data['month'].astype(str) + data['Weekday'].astype(str)
        
        data['item_month_corp'] = data['item']+data['month'].astype(str)+data['corporation']
        
        data['location_cooperation'] = data['location']+data['corporation']
        data['location_cooperation_month'] = data['location']+data['corporation']+data['month'].astype(str)
        
        data['item_month_day'] = data['item'].astype(str) + "_" + data['month'].astype(str) + data['day'].astype(str)

        data['month_day'] = data['month'].astype(str) + "_" + data['day'].astype(str)
        
        data['item_corp_Weekday'] = data['item'].astype(str) + "_" + data['corporation'].astype(str) + data['Weekday'].astype(str)
        data['item_location_Weekday'] = data['item'].astype(str) + "_" + data['location'].astype(str) + data['Weekday'].astype(str)

        data['timestamp'] = pd.to_datetime(data['timestamp'])
        data['harvest_weight'] = data.apply(lambda row: determine_harvest_weight(row['item'], row['timestamp'].month), axis=1)

        data['price_harvest_weight'] = data['harvest_weight'] * data['price(원/kg)']
        data['supply_harvest_weight'] = data['harvest_weight'] * data['supply(kg)']
        
        return data

    def fit(self):
        self.train = self.preprocessing(self.train)

        self.train = self.remove_outliers(self.train)

        x_train = self.train.drop(columns=['ID', 'price(원/kg)'])
        y_train = self.train['price(원/kg)']


        x_train = self.label_encode(x_train)

        return x_train, y_train

In [39]:
preprocessing = DataPreprocessing(train, test)
x, y = preprocessing.fit()
train_set = pd.concat([x, y], axis=1)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1103)
train_autogluon = pd.concat([x, y], axis=1)

Preprocessing Start
Remove outliers
Category columns: ['item', 'corporation', 'location', 'season', 'holiday', 'item_id', 'total_value_month', 'item_location', 'item_corporation', 'item_month_Weekday', 'item_month_corp', 'location_cooperation', 'location_cooperation_month', 'item_month_day', 'month_day', 'item_corp_Weekday', 'item_location_Weekday']


In [40]:
train_autogluon['price_supply'] = train_autogluon['supply(kg)'] / train_autogluon['price(원/kg)'] 
train_autogluon = train_autogluon.replace([np.inf, -np.inf, np.nan], 0)

In [41]:
data = TimeSeriesDataFrame(train_autogluon)

In [42]:
data['이동평균_7'] = data['price(원/kg)'].rolling(window=7, min_periods=1).mean()
data['이동평균_14'] = data['price(원/kg)'].rolling(window=14, min_periods=1).mean()
data['이동평균_21'] = data['price(원/kg)'].rolling(window=21, min_periods=1).mean()

In [54]:
from autogluon.common import space

predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE"
)

predictor.fit(data,
              excluded_model_types=['Naive'])

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': ['Naive'],
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': None,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
AutoGluon will save models to AutogluonModels/ag-20231115_051024/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['item', 'corporation', 'location', 'supply(kg)', 'year', 'month', 'day', 'Weekday', 'is_weekend', 'season', 'holiday', '

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7fe116c0fc40>

In [55]:
# predictor.refit_full()
predictions = predictor.predict(data)
predictions

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
29,2023-03-04,3056.359763,924.916304,1672.807437,2153.490951,2631.464461,3084.867475,3503.474057,3954.003842,4476.529101,5300.137481
29,2023-03-05,54.435801,-2116.708400,-1375.652277,-844.292198,-408.628781,-14.069933,424.802022,874.270940,1395.528610,2102.936814
29,2023-03-06,2919.639656,203.576323,1176.047167,1824.036324,2374.601109,2861.085531,3344.827637,3907.094980,4564.904992,5507.326981
29,2023-03-07,3406.427318,490.874191,1510.370007,2193.076220,2806.009871,3334.582979,3900.200753,4483.530580,5148.902544,6230.641627
29,2023-03-08,3143.173340,229.517413,1239.974611,1994.250690,2586.360926,3137.065732,3689.698994,4312.603671,5076.803002,6122.531213
...,...,...,...,...,...,...,...,...,...,...,...
28,2023-03-27,522.175480,62.268452,231.043153,337.593480,435.018247,522.879343,603.863611,695.437758,800.248514,958.995133
28,2023-03-28,533.380205,60.161890,243.104465,349.580462,447.235850,539.092275,631.292276,718.433336,824.061442,983.990677
28,2023-03-29,553.050920,46.410691,236.634616,366.280009,459.300879,552.161203,634.359027,731.769041,851.660999,1006.847839
28,2023-03-30,520.276850,10.156017,220.184859,337.971811,438.617799,522.702887,609.082942,702.054877,819.362568,973.467384


In [56]:
submission = pd.read_csv(r'./data/sample_submission.csv')
submission['answer'] = predictions.reset_index()['mean']
submission['answer'] = np.round(submission['answer'])
submission = post_preprocessing(test, submission)
submission.to_csv('./auto_time_submission24.csv', index=False)