In [None]:
!pip install optuna
!pip install -U -qqq hiplot

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.5.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m863.2/863.2 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/

In [None]:
import zipfile

# Unpack the dataset archive next to the notebook's working files.
# The context manager guarantees the archive handle is closed afterwards.
with zipfile.ZipFile("/content/DataSet.zip") as zip_file:
    zip_file.extractall("/content/")

In [None]:
# Training CSV loaded with pd.read_csv in the train/tuning cells.
TRAIN_PATH = '/content/train/train.csv'
# Submission template whose rows/columns define the output layout.
SAMPLE_SUB_PATH = '/content/sample_submission.csv'
TEST_DIR = '/content/test'          # Folder searched for TEST_*.csv files (10 expected)
# Where the final submission CSV is written.
OUT_PATH = '/content/submission.csv'
# Length, in days, of the most recent window held out for validation.
VALID_DAYS = 35

[I 2025-08-23 17:53:47,795] Trial 12 finished with value: 15.279699298235053 and parameters: {'max_depth': 9, 'min_samples_leaf': 11, 'n_estimators': 191, 'learning_rate': 0.0227700153996597, 'loss': 'square'}. Best is trial 12 with value: 15.279699298235053.


## 4) Train & Evaluate

In [None]:
import glob
import os
import re
from datetime import timedelta

import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

# Number of future days predicted per sample (length of each target vector).
HORIZON = 7
# Number of lag_<i> sales features built per sample.
LAGS = 28

# Non-lag feature columns; build_supervised appends the ones present in the
# frame after the lag columns, in this order.
NUMERIC_FEATS_DEFAULT = [
    'is_weekend', 'month', 'weekday','rolling_mean_3',
   'rolling_sum_28', 'season_cos', 'season_sin',
]

def ensure_calendar_feats(df):
    """Return a copy of ``df`` with calendar, rolling and seasonal features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` (or ``영업일자``) column, plus the
        ``영업장명_메뉴명`` and ``매출수량`` columns whenever the rolling
        features have to be computed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never modified) with these columns:

        * ``date``           - parsed to datetime; unparseable values become NaT
        * ``weekday``        - 0 (Monday) .. 6 (Sunday), always recomputed
        * ``is_weekend``     - 1 for Saturday/Sunday, kept if already present
        * ``month``          - 1 .. 12, kept if already present
        * ``rolling_mean_3`` - per-item trailing 3-day mean of sales
        * ``rolling_sum_28`` - per-item trailing 28-day sum of sales
        * ``season_cos`` / ``season_sin`` - month encoded on a 12-month circle

        The frame is sorted by item and date when the rolling features are
        (re)computed.
    """
    key_col, qty_col = '영업장명_메뉴명', '매출수량'

    if 'date' not in df.columns and '영업일자' in df.columns:
        df = df.rename(columns={'영업일자': 'date'})
    else:
        # Work on a copy so the caller's frame is never written to.
        df = df.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    df['weekday'] = df['date'].dt.weekday
    if 'is_weekend' not in df.columns:
        df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    if 'month' not in df.columns:
        df['month'] = df['date'].dt.month

    # Recompute both rolling features if either one is missing. The guard now
    # checks the column that is actually produced ('rolling_sum_28').
    if 'rolling_mean_3' not in df.columns or 'rolling_sum_28' not in df.columns:
        df = df.sort_values([key_col, 'date'])
        sales_by_item = df.groupby(key_col)[qty_col]
        df['rolling_mean_3'] = sales_by_item.transform(
            lambda s: s.rolling(window=3, min_periods=1).mean())
        # Window matches the column name (it used to be 14).
        df['rolling_sum_28'] = sales_by_item.transform(
            lambda s: s.rolling(window=28, min_periods=1).sum())

    # Cyclical month encoding: scale the month to an angle on a 12-month
    # circle so that December and January end up adjacent.
    month_angle = 2.0 * np.pi * df['date'].dt.month / 12.0
    df['season_cos'] = np.cos(month_angle)
    df['season_sin'] = np.sin(month_angle)

    return df

def add_lags(df, lags=LAGS):
    """Attach per-item lagged sales columns ``lag_1`` .. ``lag_<lags>``.

    The frame is first ordered by item and date; ``lag_k`` then holds the
    item's ``매출수량`` from ``k`` rows earlier (NaN where no such row exists).
    A new frame is returned.
    """
    ordered = df.sort_values(['영업장명_메뉴명', 'date'])
    sales_per_item = ordered.groupby('영업장명_메뉴명')['매출수량']
    lag_columns = {f'lag_{k}': sales_per_item.shift(k) for k in range(1, lags + 1)}
    return ordered.assign(**lag_columns)

def build_supervised(df):
    """Turn the long sales table into a supervised multi-step dataset.

    For every item and every reference day ``t`` that has ``LAGS`` days of
    history and ``HORIZON`` following days, one sample is produced:

    * features: ``lag_1 .. lag_LAGS`` where ``lag_i`` is the sales on day
      ``t - (i - 1)`` (so ``lag_1`` is the reference day itself), followed by
      the columns of ``NUMERIC_FEATS_DEFAULT`` taken from day ``t``;
    * target: sales on days ``t+1 .. t+HORIZON``.

    This lag convention mirrors ``make_test_features``, which fills ``lag_1``
    with the last observed day of a test file. Building the training lags with
    ``shift(1)`` instead would feed the model lags that are one day older
    during training than at inference time.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training table with ``date`` (or ``영업일자``), ``영업장명_메뉴명``
        and ``매출수량`` columns. It is not modified.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_features), float32
    y : numpy.ndarray, shape (n_samples, HORIZON), float32
    ref_dates : pandas.Series
        Reference date of each sample, named ``ref_date``, with a fresh
        0..n-1 index aligned with the rows of ``X``.
    feat_cols : list of str
        Column names of ``X`` in order.

    Notes
    -----
    Rows of an item are assumed to be consecutive days - TODO confirm for the
    training data; the positional target slicing relies on it.
    """
    key_col, qty_col = '영업장명_메뉴명', '매출수량'

    df = df.copy()
    if 'date' not in df.columns and '영업일자' in df.columns:
        df = df.rename(columns={'영업일자': 'date'})
    df['date'] = pd.to_datetime(df['date'])

    df = ensure_calendar_feats(df)
    df = df.sort_values([key_col, 'date'])

    # lag_i = sales (i - 1) days before the reference day; shift(0) is the
    # reference day itself, matching the inference-time feature layout.
    lag_cols = [f'lag_{i}' for i in range(1, LAGS + 1)]
    sales_by_item = df.groupby(key_col)[qty_col]
    df = df.assign(**{col: sales_by_item.shift(i) for i, col in enumerate(lag_cols)})
    df = df.dropna(subset=lag_cols)

    feat_cols = lag_cols + [c for c in NUMERIC_FEATS_DEFAULT if c in df.columns]

    feats, targets, ref_parts = [], [], []
    for _, g in df.groupby(key_col, sort=False):
        n_samples = len(g) - HORIZON
        if n_samples <= 0:
            # Not enough future days to form even one target vector.
            continue
        sales = g[qty_col].to_numpy(dtype=np.float32)
        # Slice whole arrays per item instead of materialising one row at a time.
        feats.append(g[feat_cols].to_numpy(dtype=np.float32)[:n_samples])
        targets.append(np.stack([sales[t + 1:t + 1 + HORIZON] for t in range(n_samples)]))
        ref_parts.append(g['date'].iloc[:n_samples])

    if feats:
        X = np.vstack(feats)
        y = np.vstack(targets)
        ref_dates = pd.concat(ref_parts, ignore_index=True).rename('ref_date')
    else:
        X = np.empty((0, len(feat_cols)), dtype=np.float32)
        y = np.empty((0, HORIZON), dtype=np.float32)
        ref_dates = pd.Series([], dtype='datetime64[ns]', name='ref_date')
    return X, y, ref_dates, feat_cols

def time_based_split(ref_dates, valid_days=35):
    """Split samples chronologically into training and validation masks.

    Samples whose reference date falls within the last ``valid_days`` days
    (strictly after ``max(ref_dates) - valid_days``) go to validation; all
    earlier ones go to training.

    Returns a pair of boolean numpy arrays ``(train_mask, valid_mask)``.
    """
    boundary = ref_dates.max() - pd.Timedelta(days=valid_days)
    in_validation = (ref_dates > boundary).values
    in_training = (ref_dates <= boundary).values
    return in_training, in_validation

def rmse(a, b):
    """Root-mean-squared error between ``a`` and ``b`` over all elements."""
    residual = a - b
    mean_squared = np.mean(np.square(residual))
    return float(np.sqrt(mean_squared))



def train_adaboost(Xtr, ytr, seed=42, max_depth=9, min_samples_leaf=11,
                   n_estimators=191, learning_rate=0.0227700153996597,
                   loss='square'):
    """Fit one AdaBoost regressor per forecast step and return the fitted model.

    AdaBoost needs a base estimator; a ``DecisionTreeRegressor`` is used.
    ``AdaBoostRegressor`` is wrapped in ``MultiOutputRegressor`` so that each
    column of ``ytr`` (one per forecast day) gets its own booster.

    Parameters
    ----------
    Xtr : array-like, shape (n_samples, n_features)
    ytr : array-like, shape (n_samples, n_outputs)
    seed : int
        Random state for the boosters and the base tree.
    max_depth, min_samples_leaf : int
        Passed to the base ``DecisionTreeRegressor``.
    n_estimators : int
        Number of boosting stages.
    learning_rate : float
        Shrinks the contribution of each regressor.
    loss : str
        Loss used by AdaBoost when updating sample weights.

    The defaults are the values that were previously hard-coded, so existing
    calls ``train_adaboost(Xtr, ytr)`` behave as before.
    """
    base_estimator = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=seed,
    )

    model = MultiOutputRegressor(
        AdaBoostRegressor(
            estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            loss=loss,
            random_state=seed,
        )
    )

    model.fit(Xtr, ytr)
    return model

def make_test_features(test_df, feat_cols):
    """Build one feature row per item from a test history table.

    For each item the last ``LAGS`` sales values (ordered by date) become
    ``lag_1 .. lag_LAGS``, with ``lag_1`` being the most recent day. Every
    other name in ``feat_cols`` is read from the item's last row after
    ``ensure_calendar_feats`` has been applied.

    Parameters
    ----------
    test_df : pandas.DataFrame
        History with ``date`` (or ``영업일자``), ``영업장명_메뉴명`` and
        ``매출수량`` columns. It is not modified.
    feat_cols : list of str
        Feature names, in the order the model expects.

    Returns
    -------
    X : numpy.ndarray of float32, one row per item, columns as ``feat_cols``
    order : list
        Item keys in the same order as the rows of ``X``.

    Raises
    ------
    KeyError
        If ``feat_cols`` names a non-lag column that is not available.
    ValueError
        If an item has fewer than ``LAGS`` rows.
    """
    df = test_df.copy()
    if 'date' not in df.columns and '영업일자' in df.columns:
        df = df.rename(columns={'영업일자':'date'})
    df['date'] = pd.to_datetime(df['date'])
    df = ensure_calendar_feats(df)

    lag_names = [f'lag_{i}' for i in range(1, LAGS + 1)]
    lag_name_set = set(lag_names)
    # Take the non-lag features from feat_cols instead of a hard-coded list,
    # so a change to the feature set cannot silently break inference.
    extra_cols = [c for c in feat_cols if c not in lag_name_set]
    missing = [c for c in extra_cols if c not in df.columns]
    if missing:
        raise KeyError(f'feature columns missing from test data: {missing}')

    rows, order = [], []
    for key, g in df.groupby('영업장명_메뉴명', sort=False):
        g = g.sort_values('date')
        vals = g['매출수량'].tail(LAGS).to_numpy()
        if len(vals) < LAGS:
            raise ValueError(f'{key}: need {LAGS} days, got {len(vals)}')
        last_row = g.iloc[-1]
        # vals[-1] is the latest day, so lag_i is the value i-1 days before it.
        named = {name: vals[-i] for i, name in enumerate(lag_names, start=1)}
        named.update({c: last_row[c] for c in extra_cols})

        rows.append([named[c] for c in feat_cols])
        order.append(key)
    return np.array(rows, dtype=np.float32), order

def predict_one_test(model, scaler, feat_cols, scaling_cols_indices, non_scaling_cols_indices, test_csv_path):
    """Predict the next ``HORIZON`` days for every item in one test CSV.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    scaler : fitted transformer applied to the columns in ``scaling_cols_indices``
    feat_cols : list of str
        Feature names in the order produced by ``make_test_features``.
    scaling_cols_indices, non_scaling_cols_indices : list of int
        Column positions to scale / to pass through. The model input is the
        scaled block followed by the pass-through block.
    test_csv_path : str
        Path to a file whose name contains ``TEST_<digits>``.

    Returns
    -------
    pandas.DataFrame
        Long-format predictions with columns ``영업일자`` (``<prefix>+<day>일``),
        ``영업장명_메뉴명`` and ``매출수량`` (rounded, clipped at 0).

    Raises
    ------
    ValueError
        If the file name does not contain a ``TEST_<digits>`` prefix.
    """
    filename = os.path.basename(test_csv_path)
    prefix_match = re.search(r'(TEST_\d+)', filename)
    if prefix_match is None:
        # Fail with a clear message rather than AttributeError on None.group().
        raise ValueError(f'Cannot derive a TEST_<digits> prefix from file name: {filename!r}')
    prefix = prefix_match.group(1)

    test_df = pd.read_csv(test_csv_path)
    Xtest, order = make_test_features(test_df, feat_cols)

    # Apply selective scaling
    Xtest_to_scale = Xtest[:, scaling_cols_indices]
    Xtest_no_scale = Xtest[:, non_scaling_cols_indices]
    Xtest_scaled = scaler.transform(Xtest_to_scale)
    Xtest_s = np.concatenate([Xtest_scaled, Xtest_no_scale], axis=1)

    yhat = model.predict(Xtest_s)

    out_rows = []
    for j, menu in enumerate(order):
        for day in range(1, HORIZON+1):
            out_rows.append({
                '영업일자': f'{prefix}+{day}일',
                '영업장명_메뉴명': menu,
                # Sales counts are non-negative integers.
                '매출수량': max(0, int(round(float(yhat[j, day-1]))))
            })
    return pd.DataFrame(out_rows)

def convert_to_submission_format(pred_df, sample_submission):
    """Fill a copy of the submission template with long-format predictions.

    ``pred_df`` holds one row per (``영업일자``, ``영업장명_메뉴명``) pair with
    its ``매출수량``. Every cell of the template (all columns after the
    first) is set to the matching prediction, or 0 when there is none.
    The template itself is left untouched.
    """
    lookup = {}
    for day_label, menu, qty in zip(pred_df['영업일자'],
                                    pred_df['영업장명_메뉴명'],
                                    pred_df['매출수량']):
        lookup[(day_label, menu)] = qty

    filled = sample_submission.copy()
    menu_columns = filled.columns[1:]
    for row_label in filled.index:
        day_label = filled.loc[row_label, '영업일자']
        for menu in menu_columns:
            filled.loc[row_label, menu] = lookup.get((day_label, menu), 0)
    return filled

In [None]:
# Read the training table and normalise the date column
train = pd.read_csv(TRAIN_PATH)
if 'date' not in train.columns and '영업일자' in train.columns:
    train = train.rename(columns={'영업일자':'date'})
train['date'] = pd.to_datetime(train['date'])

# Supervised samples plus a chronological train/validation split
X, y, ref_dates, feat_cols = build_supervised(train)
trn_idx, val_idx = time_based_split(ref_dates, valid_days=VALID_DAYS)
Xtr, Xva = X[trn_idx], X[val_idx]
ytr, yva = y[trn_idx], y[val_idx]

# --- Selective scaling: the seasonal sin/cos columns are passed through ---
non_scaling_cols = ['season_cos', 'season_sin']
scaling_cols = [name for name in feat_cols if name not in non_scaling_cols]

# Column positions of each group inside the feature matrix
scaling_cols_indices = [feat_cols.index(name) for name in scaling_cols]
non_scaling_cols_indices = [feat_cols.index(name) for name in non_scaling_cols]

# Split both matrices into the "scale" and "pass-through" parts
Xtr_to_scale, Xtr_no_scale = Xtr[:, scaling_cols_indices], Xtr[:, non_scaling_cols_indices]
Xva_to_scale, Xva_no_scale = Xva[:, scaling_cols_indices], Xva[:, non_scaling_cols_indices]

# The scaler only ever sees training rows
scaler = MinMaxScaler()
scaler.fit(Xtr_to_scale)

# Model input layout: scaled block first, pass-through block last
Xtr_scaled = scaler.transform(Xtr_to_scale)
Xtr_s = np.concatenate([Xtr_scaled, Xtr_no_scale], axis=1)

if len(Xva) > 0:
    Xva_scaled = scaler.transform(Xva_to_scale)
    Xva_s = np.concatenate([Xva_scaled, Xva_no_scale], axis=1)
else:
    Xva_scaled = None
    Xva_s = None

# Fit the AdaBoost model
print("Training AdaBoost Regressor...")
model = train_adaboost(Xtr_s, ytr)
print("Training complete.")

# Report the hold-out error when a validation window exists
if Xva_s is None or len(Xva_s) == 0:
    print('Validation set is empty (check VALID_DAYS).')
else:
    yhat = model.predict(Xva_s)
    val_rmse = rmse(yva, yhat)
    print(f'[Validation RMSE] {val_rmse:.4f}')

Training AdaBoost Regressor...
Training complete.
[Validation RMSE] 15.3142


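Validation RMSE from the feature experiments (o = feature included, x = feature excluded):

- rolling mean (o): 15.2867

- rolling mean (x): 15.3342

- rolling sum 28: 15.3198

- day of month (x): 15.2736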

## 5) Inference & Submission File Generation

In [None]:
# Collect TEST files
test_files = sorted(glob.glob(os.path.join(TEST_DIR, 'TEST_*.csv')))
print('Found test files:', len(test_files))
if not test_files:
    # Without this guard pd.concat fails later with an unhelpful message.
    raise FileNotFoundError(f'No TEST_*.csv files found in {TEST_DIR}')

# Predict each test file with the fitted model/scaler and stack the results
all_preds = []
for path in test_files:
    # The column index lists tell predict_one_test which features to scale
    pred_df = predict_one_test(model, scaler, feat_cols, scaling_cols_indices, non_scaling_cols_indices, path)
    all_preds.append(pred_df)
full_pred_df = pd.concat(all_preds, ignore_index=True)

# Convert long-format predictions to the wide submission layout
sample_submission = pd.read_csv(SAMPLE_SUB_PATH)
submission = convert_to_submission_format(full_pred_df, sample_submission)

# Save (utf-8-sig writes a BOM so the Korean headers open correctly in Excel)
submission.to_csv(OUT_PATH, index=False, encoding='utf-8-sig')
print('Saved:', OUT_PATH)

Found test files: 10
Saved: /content/submission.csv


## Optuna parameter tuning

In [None]:
# Load data
train = pd.read_csv(TRAIN_PATH)
if '영업일자' in train.columns and 'date' not in train.columns:
    train = train.rename(columns={'영업일자':'date'})
train['date'] = pd.to_datetime(train['date'])

# Build supervised learning dataset
X, y, ref_dates, feat_cols = build_supervised(train)
trn_idx, val_idx = time_based_split(ref_dates, valid_days=VALID_DAYS)
Xtr, ytr = X[trn_idx], y[trn_idx]
Xva, yva = X[val_idx], y[val_idx]

# --- Selective Scaling ---
non_scaling_cols = ['season_cos', 'season_sin']
scaling_cols = [c for c in feat_cols if c not in non_scaling_cols]

# Get indices for splitting the arrays
scaling_cols_indices = [feat_cols.index(c) for c in scaling_cols]
non_scaling_cols_indices = [feat_cols.index(c) for c in non_scaling_cols]

# Separate data for scaling
Xtr_to_scale = Xtr[:, scaling_cols_indices]
Xtr_no_scale = Xtr[:, non_scaling_cols_indices]
Xva_to_scale = Xva[:, scaling_cols_indices]
Xva_no_scale = Xva[:, non_scaling_cols_indices]

# Fit scaler ONLY on the columns to be scaled from the training set
scaler = MinMaxScaler().fit(Xtr_to_scale)

# Transform and concatenate
Xtr_scaled = scaler.transform(Xtr_to_scale)
Xtr_s = np.concatenate([Xtr_scaled, Xtr_no_scale], axis=1)

Xva_scaled = scaler.transform(Xva_to_scale) if len(Xva) > 0 else None
Xva_s = np.concatenate([Xva_scaled, Xva_no_scale], axis=1) if Xva_scaled is not None else None

# Tuning needs a hold-out set; stop early instead of failing inside a trial.
if Xva_s is None or len(Xva_s) == 0:
    raise ValueError('Validation set is empty (check VALID_DAYS); cannot tune hyper-parameters.')


def objective(trial):
    """Train one AdaBoost configuration and return its validation RMSE.

    The score is computed with ``rmse`` (pooled over all forecast days), the
    same metric the evaluation cell prints, so trial values are directly
    comparable with the reported validation RMSE.
    """
    # Base decision-tree parameters
    max_depth = trial.suggest_int("max_depth", 2, 9)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 30)
    # AdaBoost parameters
    n_estimators = trial.suggest_int("n_estimators", 70, 2000)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.5)
    loss = trial.suggest_categorical("loss", ['linear', 'square'])

    base_estimator = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    candidate = MultiOutputRegressor(
        AdaBoostRegressor(
            estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            loss=loss,
            random_state=42,
        )
    )
    candidate.fit(Xtr_s, ytr)

    y_pred = candidate.predict(Xva_s)
    return rmse(yva, y_pred)


# A seeded sampler makes the sequence of suggested parameters reproducible.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_trial.params)

[I 2025-08-23 15:35:42,615] A new study created in memory with name: no-name-66238b35-9d0d-4c1b-b1d8-f1ffac24f8fa
[I 2025-08-23 15:39:19,036] Trial 0 finished with value: 26.384389419505432 and parameters: {'max_depth': 4, 'min_samples_leaf': 21, 'n_estimators': 1185, 'learning_rate': 0.3906222061594458, 'loss': 'linear'}. Best is trial 0 with value: 26.384389419505432.
[I 2025-08-23 15:45:56,512] Trial 1 finished with value: 116.09273541238916 and parameters: {'max_depth': 4, 'min_samples_leaf': 5, 'n_estimators': 331, 'learning_rate': 0.3942612878325853, 'loss': 'square'}. Best is trial 0 with value: 26.384389419505432.
[I 2025-08-23 15:49:02,368] Trial 2 finished with value: 15.735558227043224 and parameters: {'max_depth': 8, 'min_samples_leaf': 6, 'n_estimators': 1818, 'learning_rate': 0.30634118040344516, 'loss': 'linear'}. Best is trial 2 with value: 15.735558227043224.
[I 2025-08-23 15:51:19,823] Trial 3 finished with value: 18.313978143453618 and parameters: {'max_depth': 5, 'm