In [2]:
pip install autogluon



# Imports / 라이브러리 호출

In [4]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

from autogluon.tabular import TabularPredictor, TabularDataset

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read later live under its MyDrive folder.
drive.mount('/content/drive')

Mounted at /content/drive


# Fix Random Seed / 랜덤시드 고정

In [6]:
def seed_everything(seed):
    """Seed the random number sources this notebook controls directly.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    generator.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(96)  # fix the seed

In [7]:
# Locations of the competition CSVs on the mounted Google Drive.
TRAIN_CSV_PATH = "/content/drive/MyDrive/Dacon/추석 맞이 추석 선물 수요량 예측 AI 경진대회/open/train.csv"
TEST_CSV_PATH = "/content/drive/MyDrive/Dacon/추석 맞이 추석 선물 수요량 예측 AI 경진대회/open/test.csv"

# Read both files into DataFrames.
train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

In [8]:
# Preview the first rows of the training data as loaded.
train.head()

Unnamed: 0,ID,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량
0,TRAIN_0000,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28
1,TRAIN_0001,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27
2,TRAIN_0002,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769
3,TRAIN_0003,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27
4,TRAIN_0004,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337


# Derived features / 파생변수

In [11]:
# Lookup tables of the target ('수요량') mean and standard deviation per
# category value; preprocess() maps them onto rows as derived features.
# NOTE(review): the statistics use the full training target and are mapped
# back onto the same training rows, so each training row's feature includes
# its own label. Consider out-of-fold encoding to limit leakage.

# Demand statistics per gift type
demand_by_gift = train.groupby('선물 유형')['수요량']
gift_mean_map = demand_by_gift.mean().to_dict()
gift_std_map = demand_by_gift.std().to_dict()

# Demand statistics per city type
demand_by_city = train.groupby('도시 유형')['수요량']
city_mean_map = demand_by_city.mean().to_dict()
city_std_map = demand_by_city.std().to_dict()

# Demand statistics per shopping mall
demand_by_mall = train.groupby('쇼핑몰 구분')['수요량']
mall_mean_map = demand_by_mall.mean().to_dict()
mall_std_map = demand_by_mall.std().to_dict()

In [12]:
def preprocess(df):
    """Return a copy of ``df`` with per-category target statistics attached.

    Six columns are added, in this order: ``gift_mean``, ``gift_std``,
    ``city_mean``, ``city_std``, ``mall_mean``, ``mall_std``. Each one is the
    corresponding category column mapped through a module-level lookup table.
    Category values missing from a table yield NaN in that column.

    Args:
        df: DataFrame containing the columns '선물 유형', '도시 유형' and
            '쇼핑몰 구분'. It is not modified.

    Returns:
        A new DataFrame with the derived columns appended.
    """
    # (new column, source column, lookup table), listed in output order.
    feature_specs = [
        ('gift_mean', '선물 유형', gift_mean_map),
        ('gift_std', '선물 유형', gift_std_map),
        ('city_mean', '도시 유형', city_mean_map),
        ('city_std', '도시 유형', city_std_map),
        ('mall_mean', '쇼핑몰 구분', mall_mean_map),
        ('mall_std', '쇼핑몰 구분', mall_std_map),
    ]

    enriched = df.copy()
    for feature_name, source_column, lookup in feature_specs:
        enriched[feature_name] = enriched[source_column].map(lookup)
    return enriched

# Attach the derived columns to both frames using the same lookup tables.
train = preprocess(train)
test  = preprocess(test)

In [14]:
# Preview the training data with the derived statistic columns attached.
train.head()

Unnamed: 0,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량,gift_mean,gift_std,city_mean,city_std,mall_mean,mall_std
0,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28,28.246575,17.155968,376.163237,500.805236,291.8,335.444806
1,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27,65.962963,54.296517,262.490798,393.222668,272.363636,360.951685
2,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769,614.645833,522.824022,376.163237,500.805236,291.8,335.444806
3,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27,83.656489,61.146579,117.254237,104.802539,117.254237,104.802539
4,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337,696.89781,392.970387,238.316847,292.827209,272.651163,311.518247


# AutoGluon

In [13]:
# Remove the row identifier so it is not passed to the model as a feature.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run; the in-place version raised KeyError once "ID" was gone.
train = train.drop(columns="ID", errors="ignore")
test = test.drop(columns="ID", errors="ignore")

In [15]:
# Hyperparameter-search settings passed to TabularPredictor.fit() as
# hyperparameter_tune_kwargs.
tune_kwargs = dict(
    searcher='random',
    num_trials=7,
    scheduler='local',
)

In [16]:
# Train the AutoGluon regressor on the prepared training frame.
# Constructor arguments: target column, metric and task type.
predictor_options = dict(
    label="수요량",
    eval_metric="rmse",
    problem_type="regression",
)

# Arguments for fit(); tune_kwargs is the search configuration defined earlier.
fit_options = dict(
    train_data=train,
    presets=["best_quality"],
    hyperparameter_tune_kwargs=tune_kwargs,
    num_bag_folds=20,
    refit_full=True,
    time_limit=43200,  # presumably seconds (the training log reports times in "s"), i.e. 12 hours
)

pred = TabularPredictor(**predictor_options).fit(**fit_options)

Fitted model: NeuralNetTorch_BAG_L2/58fdb_00000 ...
	-124.7208	 = Validation score   (-root_mean_squared_error)
	435.2s	 = Training   runtime
	0.0s	 = Validation runtime
Fitted model: NeuralNetTorch_BAG_L2/58fdb_00001 ...
	-120.5973	 = Validation score   (-root_mean_squared_error)
	512.05s	 = Training   runtime
	0.0s	 = Validation runtime
Fitted model: NeuralNetTorch_BAG_L2/58fdb_00002 ...
	-123.4523	 = Validation score   (-root_mean_squared_error)
	565.26s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 1730.02s of the 7733.87s of remaining time.
	Fitting 20 child models (S1F1 - S1F20) | Fitting with ParallelLocalFoldFittingStrategy
	-115.424	 = Validation score   (-root_mean_squared_error)
	319.4s	 = Training   runtime
	0.77s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 1730.02s of the 7409.48s of remaining time.
	-108.4257	 = Validatio

In [17]:
# Show AutoGluon's summary of the finished fit.
print(pred.fit_summary())

*** Summary of fit() ***
Estimated performance of each model:
                                       model   score_val  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0                        WeightedEnsemble_L3 -108.425658     202.478063  26979.405441                0.002039           1.962040            3       True         69
1         NeuralNetFastAI_BAG_L2/92905_00001 -109.284852     201.708389  25472.655535                0.000108         360.378225            2       True         54
2                        WeightedEnsemble_L2 -110.914767      25.349198   7784.057272                0.001075           1.499935            2       True         34
3         NeuralNetFastAI_BAG_L2/92905_00004 -111.472375     201.708436  25327.980789                0.000155         215.703479            2       True         57
4         NeuralNetFastAI_BAG_L2/92905_00003 -112.088007     201.708417  25588.163031                0.000135         

In [19]:
# Leaderboard
# NOTE(review): the leaderboard is scored on the training frame itself, so
# the score_test column reflects fit to data the models were trained on
# rather than held-out performance; compare score_val instead.
ld_board = pred.leaderboard(train, silent=True)
ld_board

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1_FULL,-0.000000,,0.080742,0.058355,0.018410,0.080742,0.058355,0.018410,1,True,71
1,KNeighborsDist_BAG_L1,-0.000000,-188.980936,0.091472,0.000282,0.081864,0.091472,0.000282,0.081864,1,True,2
2,LightGBMLarge_BAG_L1_FULL,-4.175385,,4.581804,,8.930989,4.581804,,8.930989,1,True,102
3,XGBoost_BAG_L1/T5_FULL,-9.655401,,1.301996,,2.659493,1.301996,,2.659493,1,True,97
4,LightGBM_BAG_L1/T2_FULL,-9.707971,,1.656280,,3.214337,1.656280,,3.214337,1,True,80
...,...,...,...,...,...,...,...,...,...,...,...,...
133,NeuralNetFastAI_BAG_L1/f22ad_00002,-170.653062,-178.506242,3.790379,0.999739,333.893344,3.790379,0.999739,333.893344,1,True,22
134,NeuralNetFastAI_BAG_L1/f22ad_00002_FULL,-174.532234,,0.065151,,1.397370,0.065151,,1.397370,1,True,91
135,NeuralNetFastAI_BAG_L2/92905_00002,-183.459429,-203.992841,4228.871614,201.708436,25289.640607,2.340575,0.000154,177.363297,2,True,55
136,NeuralNetFastAI_BAG_L2/92905_00002_FULL,-184.078993,,86.414081,,403.602880,0.084157,,2.116618,2,True,124


In [21]:
# Predict the target for the test rows with the fitted predictor.
predict = pred.predict(test)

In [22]:
# Load sample_submission.csv; this frame is filled with predictions and saved below.
sub = pd.read_csv("/content/drive/MyDrive/Dacon/추석 맞이 추석 선물 수요량 예측 AI 경진대회/open/sample_submission.csv")

In [23]:
# Write the predictions into the submission frame's target column.
# NOTE(review): Series assignment aligns on index — assumes `predict` and `sub` share the same default index; confirm.
sub["수요량"] = predict

In [24]:
# Save the submission to the working directory without the index column.
sub.to_csv("AutoGluon_ver2.csv", index = False)

In [25]:
from google.colab import files
# Send the saved submission file from the Colab runtime to the browser as a download.
files.download('AutoGluon_ver2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Public 점수: 104.6383707827
Private 점수: 116.9503805462