In [4]:
import pandas as pd

In [81]:
train = pd.read_csv('./data/rossman_data/lspoons_train.csv')
test = pd.read_csv('./data/rossman_data/lspoons_test.csv')
store = pd.read_csv('./data/rossman_data/store.csv')

In [82]:
train.head()

Unnamed: 0,id,Store,Date,Sales,Promo,StateHoliday,SchoolHoliday
0,14929,85,2015-05-01,11360,1,a,0
1,14930,512,2015-05-01,10534,1,a,0
2,14931,1097,2015-05-01,17039,1,a,0
3,14932,1,2015-04-30,6228,1,0,0
4,14933,9,2015-04-30,9717,1,0,0


# 1. StateHoliday 변수 변환
### 피처 엔지니어링 활용

In [83]:
train['StateHoliday'].value_counts()
# StateHoliday 컬럼의 값이 0 a b c 4종류의 문자열로 이루어져 있음

0    62208
a       18
b       13
c        5
Name: StateHoliday, dtype: int64

In [84]:
train.head()
#  train 에는 4종류의 StateHoliday 가 모두 나옴

Unnamed: 0,id,Store,Date,Sales,Promo,StateHoliday,SchoolHoliday
0,14929,85,2015-05-01,11360,1,a,0
1,14930,512,2015-05-01,10534,1,a,0
2,14931,1097,2015-05-01,17039,1,a,0
3,14932,1,2015-04-30,6228,1,0,0
4,14933,9,2015-04-30,9717,1,0,0


In [85]:
# 원핫 인코딩도 피처 엔지니어링의 한 종류라 할 수 있다.
# 원핫 인코딩을 통해 StateHoliday 4가지 값을 컬럼으로 바꿔주자
train = pd.get_dummies(columns=['StateHoliday'], data=train)
test = pd.get_dummies(columns=['StateHoliday'], data=test)

In [86]:
test.head()
# test에는 StateHoliday 가 0과 a 뿐. b와 c는 아예 값이 없다는 뜻. 
# train데이터와 test데이터의 컬럼이 서로 다르면 학습과 예측 과정에서 문제가 생긴다.
# 때문에 test에 b 와 c 를 임의로 만들어줘야 한다.

Unnamed: 0,id,Store,Date,Promo,SchoolHoliday,StateHoliday_0,StateHoliday_a
0,0,1,2015-07-01,1,0,1,0
1,1,9,2015-07-01,1,1,1,0
2,2,11,2015-07-01,1,0,1,0
3,3,12,2015-07-01,1,0,1,0
4,4,14,2015-07-01,1,1,1,0


In [87]:
# test 데이터에 임의로 b와 c 컬럼을 만들어서 모두 0값을 부여한다.
test['StateHoliday_b'] = 0
test['StateHoliday_c'] = 0

In [88]:
test.head()

Unnamed: 0,id,Store,Date,Promo,SchoolHoliday,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,0,1,2015-07-01,1,0,1,0,0,0
1,1,9,2015-07-01,1,1,1,0,0,0
2,2,11,2015-07-01,1,0,1,0,0,0
3,3,12,2015-07-01,1,0,1,0,0,0
4,4,14,2015-07-01,1,1,1,0,0,0


# 2. Date 변수를 활용한 피처 엔지니어링

In [89]:
# 판다스의 datetime을 사용하면 해당 데이터를 날짜형태로 바꿔서 활용할 수 있다
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [90]:
# datetime 을 이용하면 연(year)/월(month)/요일(weekday) 계산 등이 쉬워진다!
train['Date'].dt.year.head()

0    2015
1    2015
2    2015
3    2015
4    2015
Name: Date, dtype: int64

In [91]:
train['year'] = train['Date'].dt.year
test['year'] = test['Date'].dt.year

In [92]:
train['month'] = train['Date'].dt.month
test['month'] = test['Date'].dt.month

In [93]:
train['weekday'] = train['Date'].dt.weekday
test['weekday'] = test['Date'].dt.weekday

In [94]:
train.head()

Unnamed: 0,id,Store,Date,Sales,Promo,SchoolHoliday,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,year,month,weekday
0,14929,85,2015-05-01,11360,1,0,0,1,0,0,2015,5,4
1,14930,512,2015-05-01,10534,1,0,0,1,0,0,2015,5,4
2,14931,1097,2015-05-01,17039,1,0,0,1,0,0,2015,5,4
3,14932,1,2015-04-30,6228,1,0,1,0,0,0,2015,4,3
4,14933,9,2015-04-30,9717,1,0,1,0,0,0,2015,4,3


# 3. 베이스 라인 모델링

In [95]:
from xgboost import XGBRegressor

In [96]:
xgb = XGBRegressor(m_estimators=300, learning_rate=0.1, random_state=2020)

In [97]:
# 인풋변수로 활용할 데이터 지정
xgb.fit(train[['Promo', 'SchoolHoliday', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'year', 'month', 'weekday']],
       train['Sales'])  # 예측할 데이터

Parameters: { m_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, m_estimators=300, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1,
             random_state=2020, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [104]:
# test 예측
test['Sales'] = xgb.predict(test[['Promo', 'SchoolHoliday', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'year', 'month', 'weekday']])

### 위 데이터를 저장해서 kaggle - 해당 대회 - Submit Predictions로 제출

In [65]:
test[['id', 'Sales']].to_csv("submission.csv", index=False)