In [1]:
from hossam import *
from pandas import concat
from pycaret.regression import *

In [2]:
# Load the preprocessed restaurant sales data, use the 'date' column as the
# index, and store the two binary flag columns as pandas categoricals.
# Built as a single chain so re-running the cell always starts from a fresh load.
origin = (
    load_data('restaurant_sales_preprocessed')
    .set_index('date')
    .astype({'holiday': 'category', 'weekend': 'category'})
)
origin.info()

[94m어느 식당의 1년간 일별 매출을 기록한 데이터의 전처리 완료 버전(명목형이 이진변수만 있으므로 더미변수는 처리하지 않음)[0m
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 353 entries, 2024-01-01 to 2024-12-30
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   sales           353 non-null    float64 
 1   visitors        353 non-null    int64   
 2   avg_price       353 non-null    int64   
 3   marketing_cost  353 non-null    float64 
 4   delivery_ratio  353 non-null    float64 
 5   rain_mm         353 non-null    float64 
 6   temperature     353 non-null    float64 
 7   holiday         353 non-null    category
 8   weekend         353 non-null    category
dtypes: category(2), float64(5), int64(2)
memory usage: 23.0 KB


In [3]:
# All experiment options are gathered in one mapping so they can be reviewed
# and adjusted in a single place before the experiment is initialised.
setup_options = dict(
    # ----------------------------------------
    # Core settings
    # ----------------------------------------
    data=origin,          # dataset to model
    target='sales',       # dependent variable to predict
    session_id=52,        # fixed random seed
    train_size=0.75,      # share of rows used for training (original note: default is 0.7)
    fold=5,               # number of cross-validation folds
    verbose=False,        # do not print the setup progress
    use_gpu=False,        # do not use the GPU
    # ----------------------------------------
    # Preprocessing (1)
    # ----------------------------------------
    categorical_features=['weekend', 'holiday'],  # columns to treat as categorical
    ignore_features=[],                           # columns to leave out (none here)
    normalize=True,                               # enable feature scaling
    normalize_method='zscore',                    # options: 'minmax', 'maxabs', 'robust', 'zscore'
    # ----------------------------------------
    # Preprocessing (2) - avoid unless really needed
    # ----------------------------------------
    remove_outliers=False,     # outlier removal stays off
    # Only relevant when remove_outliers is on.
    # NOTE(review): check the PyCaret docs for the exact meaning of this
    # threshold and for which outlier method is applied by default.
    outliers_threshold=0.05,
    transform_target=False,    # no transformation of the target variable
    feature_selection=False,   # no automatic feature selection
)

s = RegressionExperiment()
s.setup(**setup_options)

<pycaret.regression.oop.RegressionExperiment at 0x141dbd285d0>

In [4]:
# Fetch the most recent results table from the experiment; right after setup()
# this is the configuration summary (session id, target, data shapes, ...).
s.pull()

Unnamed: 0,Description,Value
0,Session id,52
1,Target,sales
2,Target type,Regression
3,Original data shape,"(353, 9)"
4,Transformed data shape,"(353, 9)"
5,Transformed train set shape,"(264, 9)"
6,Transformed test set shape,"(89, 9)"
7,Numeric features,6
8,Categorical features,2
9,Preprocess,True


In [5]:
# List the estimators available to this experiment; the ID column holds the
# short codes (e.g. 'lr', 'ridge') that can be passed to compare_models(include=...).
s.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.ElasticNet,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPursuit,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor,True


In [6]:
# Cross-validate every available regressor (5 folds), rank them by RMSE and
# keep the five best as a list.
# Note: the following cell reassigns `best5models` with a narrower comparison,
# so this result only serves as a broad first look.
best5models = s.compare_models(
    n_select=5,
    sort='RMSE',
    fold=5,
)
best5models

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.1654,0.0427,0.2056,0.6959,0.0119,0.0101,0.014
ridge,Ridge Regression,0.1657,0.0428,0.206,0.6948,0.0119,0.0101,0.454
lr,Linear Regression,0.1658,0.0429,0.2061,0.6943,0.0119,0.0101,0.58
lar,Least Angle Regression,0.1658,0.0429,0.2061,0.6943,0.0119,0.0101,0.016
huber,Huber Regressor,0.1653,0.0429,0.2062,0.6938,0.0119,0.0101,0.014
rf,Random Forest Regressor,0.1751,0.0494,0.2214,0.6438,0.0128,0.0107,0.036
ada,AdaBoost Regressor,0.1758,0.05,0.2227,0.643,0.0128,0.0108,0.024
et,Extra Trees Regressor,0.1783,0.0502,0.2231,0.6391,0.0129,0.0109,0.036
lightgbm,Light Gradient Boosting Machine,0.1799,0.0517,0.2256,0.6329,0.013,0.011,0.036
catboost,CatBoost Regressor,0.1851,0.0534,0.2299,0.6235,0.0132,0.0113,0.386


[BayesianRidge(),
 Ridge(random_state=52),
 LinearRegression(n_jobs=-1),
 Lars(random_state=52),
 HuberRegressor()]

In [7]:
# Repeat the comparison on a hand-picked shortlist of model IDs and keep the
# five with the lowest cross-validated RMSE; these feed the blend further down.
candidate_ids = [
    'lr', 'ridge', 'lasso', 'en', 'knn',
    'svm', 'dt', 'xgboost', 'lightgbm', 'catboost',
]
best5models = s.compare_models(
    include=candidate_ids,
    sort='RMSE',
    n_select=5,
    fold=5,
)
best5models

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,0.1657,0.0428,0.206,0.6948,0.0119,0.0101,0.012
lr,Linear Regression,0.1658,0.0429,0.2061,0.6943,0.0119,0.0101,0.016
lightgbm,Light Gradient Boosting Machine,0.1799,0.0517,0.2256,0.6329,0.013,0.011,0.03
svm,Support Vector Regression,0.1811,0.0526,0.2284,0.6323,0.0131,0.0111,0.014
catboost,CatBoost Regressor,0.1851,0.0534,0.2299,0.6235,0.0132,0.0113,0.338
xgboost,Extreme Gradient Boosting,0.1884,0.0552,0.2344,0.6114,0.0135,0.0115,0.024
knn,K Neighbors Regressor,0.1878,0.0558,0.2358,0.6041,0.0136,0.0115,0.014
dt,Decision Tree Regressor,0.2458,0.0994,0.3135,0.2846,0.0181,0.015,0.012
lasso,Lasso Regression,0.3208,0.147,0.3824,-0.0212,0.022,0.0196,0.012
en,Elastic Net,0.3208,0.147,0.3824,-0.0212,0.022,0.0196,0.012


[Ridge(random_state=52),
 LinearRegression(n_jobs=-1),
 LGBMRegressor(n_jobs=-1, random_state=52),
 SVR(),
 <catboost.core.CatBoostRegressor at 0x141dee6e690>]

In [8]:
# Combine the selected models into one blended estimator and score the blend
# with 5-fold cross-validation.
blended = s.blend_models(
    estimator_list=best5models,
    fold=5,
)
blended

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1922,0.0578,0.2404,0.6162,0.0138,0.0117
1,0.1795,0.0475,0.218,0.7314,0.0126,0.011
2,0.1467,0.033,0.1817,0.7925,0.0106,0.009
3,0.1535,0.0364,0.1908,0.684,0.011,0.0094
4,0.1757,0.0494,0.2223,0.5888,0.0128,0.0107
Mean,0.1695,0.0448,0.2106,0.6826,0.0121,0.0104
Std,0.0169,0.009,0.0215,0.0744,0.0012,0.001


In [None]:
%%time

tuned=s.tune_model(
    estimator=blended,
    optimize='RMSE',
    n_iter=30,
    fold=5,
    choose_better=True,
    verbose=False,
    early_stopping=True,
    search_algorithm='grid',
    custom_grid={
        'Ridge Regression__alpha':[0.01, 0.1, 1, 10, 100],
        'Light Gradient Boosting Machine__n_estimators':[200],
        'Light Gradient Boosting Machine__learning_rate':[0.05, 0.1],
        'Light Gradient Boosting Machine__num_leaves':[31, 63],
        'Light Gradient Boosting Machine__max_depth':[-1, 5],
        'Light Gradient Boosting Machine__min_child_samples':[20, 50],
        'Light Gradient Boosting Machine__subsample':[0.8],
        'Light Gradient Boosting Machine__reg_alpha':[0, 0.1, 1],
        'Light Gradient Boosting Machine__reg_lambda':[0, 1, 5],
        'Support Vector Regression__kernel':['rbf'],
        'Support Vector Regression__C':[0.1, 1, 10, 100],
        'Support Vector Regression__epsilon':[0.01, 0.05, 0.1, 0.2],
        'Support Vector Regression__gamma':['scale', 'auto', 0.01, 0.1, 1],
        'CatBoost Regressor__iterations':[300, 500],
        'CatBoost Regressor__learning_rate':[0.01, 0.03, 0.1],
        'CatBoost Regressor__depth':[4, 6, 8],
        'CatBoost Regressor__l2_leaf_reg':[1, 3, 5],
        'CatBoost Regressor__subsample':[0.8, 1.0]
    }
)
tuned

CPU times: total: 51min 48s
Wall time: 17h 8min 49s


KeyboardInterrupt: 

: 