## 1. 라이브러리 가져오기
## Import Library

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(1234)

## 2. 데이터 전처리
## Data Cleansing & Pre-Processing  

In [3]:
def grab_year(data):
    """Return the first four characters of ``str(data)`` as an integer year.

    Args:
        data: A value whose string form starts with a four-digit year
            (the notebook passes REG_YYMM values such as 201901).

    Returns:
        The leading four characters converted with ``int``.

    Raises:
        ValueError: If those characters are not a valid integer literal.
    """
    return int(str(data)[:4])

def grab_month(data):
    """Return characters 5-6 of ``str(data)`` as an integer month.

    Args:
        data: A value whose string form is laid out as YYYYMM...
            (the notebook passes REG_YYMM values such as 201901).

    Returns:
        The two characters after the year converted with ``int``.

    Raises:
        ValueError: If those characters are not a valid integer literal.
    """
    return int(str(data)[4:6])

In [5]:
# Load the raw CSV (the file name suggests it covers 2019-01 through 2020-03 — TODO confirm).
data = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [6]:
# Replace every missing value, in every column, with the string '세종'.
# NOTE(review): presumably the NaNs only occur in district-name columns — confirm,
# because this call would also write the string into any numeric column containing NaN.
data = data.fillna('세종')

In [7]:
# Date handling: derive integer year/month columns from REG_YYMM, then drop the raw column.
reg_yymm = data['REG_YYMM']
data['year'] = reg_yymm.apply(grab_year)
data['month'] = reg_yymm.apply(grab_month)
data = data.drop(columns=['REG_YYMM'])

In [8]:
# Work on a copy of the raw frame without the four columns listed below.
dropped_columns = ['CARD_CCG_NM', 'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD']
df = data.copy().drop(columns=dropped_columns)

In [None]:
# 일부 데이터만 사용
# df = df[((df['year'] == 2019) & (df['month'].isin([4, 7]))) | (df['year'] == 2020)]
# df = df[df['year'] == 2020]

In [9]:
# Per-group counts: the number of non-null CNT rows in each
# (CARD_SIDO_NM, STD_CLSS_NM, FLC, year, month) group, exposed as `std_cnt`.
group_keys = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month']
tmp = df.groupby(group_keys).count()[['CNT']].rename(columns={'CNT': 'std_cnt'})

In [10]:
# Data cleansing: collapse the raw rows to one row per
# (CARD_SIDO_NM, STD_CLSS_NM, FLC, year, month) group by summing the numeric columns,
# then attach the per-group count (`std_cnt`) held in `tmp`.
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month']
# numeric_only=True restricts the aggregation to numeric columns on every pandas
# version; newer releases no longer drop non-numeric columns by default and would
# instead try to "sum" (concatenate) any remaining string column.
df = df.groupby(columns).sum(numeric_only=True).reset_index(drop=False)
# `tmp` is indexed by the same keys, so the merge matches on those index level names.
df = df.merge(tmp, how='left', on=columns)
df.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CSTMR_CNT,AMT,CNT,std_cnt
0,강원,건강보조식품 소매업,1,2019,1,4,311200,4,1
1,강원,건강보조식품 소매업,1,2019,2,7,1517000,8,2
2,강원,건강보조식품 소매업,1,2019,3,16,982750,17,3
3,강원,건강보조식품 소매업,1,2019,4,4,266000,4,1
4,강원,건강보조식품 소매업,1,2019,5,13,1057200,15,3


In [11]:
# Regular-customer index
def regular(x, y):
    """Return ``y / x``, using a divisor of 1 when ``x`` equals zero.

    The notebook calls this with CSTMR_CNT as ``x`` and CNT as ``y``.

    Args:
        x: Denominator; a value equal to 0 is bumped to 1 to avoid dividing by zero.
        y: Numerator.

    Returns:
        The true-division result ``y / x`` (or ``y / 1`` when ``x == 0``).
    """
    divisor = x + 1 if x == 0 else x
    return y / divisor

# Vectorised form of calling regular() on every row: CNT / CSTMR_CNT, with a zero
# CSTMR_CNT replaced by 1 so the division is always defined. This avoids a
# Python-level call per row.
df['regular_power'] = df['CNT'] / df['CSTMR_CNT'].replace(0, 1)

In [None]:
# df.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM'])[['AMT']].sum()

In [None]:
# df[df['AMT'] > 100000000].STD_CLSS_NM.unique()

In [None]:
# np.log(df['AMT']).hist(bins=40)

In [12]:
# Cast FLC and month to object dtype: the encoding loop later in this notebook
# label-encodes every object-dtype column, and the LightGBM cells list the
# object-dtype columns of `df` as categorical features.
df['FLC'] = df['FLC'].astype('object')
df['month'] = df['month'].astype('object')

In [54]:
# Display the aggregated frame.
df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CSTMR_CNT,AMT,CNT,std_cnt,regular_power
0,강원,건강보조식품 소매업,1,2019,1,4,311200,4,1,1.000000
1,강원,건강보조식품 소매업,1,2019,2,7,1517000,8,2,1.142857
2,강원,건강보조식품 소매업,1,2019,3,16,982750,17,3,1.062500
3,강원,건강보조식품 소매업,1,2019,4,4,266000,4,1,1.000000
4,강원,건강보조식품 소매업,1,2019,5,13,1057200,15,3,1.153846
...,...,...,...,...,...,...,...,...,...,...
44977,충북,휴양콘도 운영업,5,2019,11,167,18735900,246,28,1.473054
44978,충북,휴양콘도 운영업,5,2019,12,128,9446100,190,21,1.484375
44979,충북,휴양콘도 운영업,5,2020,1,149,12968610,233,21,1.563758
44980,충북,휴양콘도 운영업,5,2020,2,61,3895100,97,7,1.590164


In [48]:
# Encoding: label-encode every object-dtype column of `df` into `df_num`,
# keeping each fitted encoder so the integer codes can be mapped back later.
dtypes = df.dtypes
encoders = {}
df_num = df.copy()
for column in df.columns:
    if str(dtypes[column]) != 'object':
        continue
    encoder = LabelEncoder().fit(df[column])
    encoders[column] = encoder
    df_num[column] = encoder.transform(df[column])

In [51]:
# Print the dtype and non-null summary of the encoded frame.
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44982 entries, 0 to 44981
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CARD_SIDO_NM   44982 non-null  int32  
 1   STD_CLSS_NM    44982 non-null  int32  
 2   FLC            44982 non-null  int32  
 3   year           44982 non-null  int64  
 4   month          44982 non-null  int32  
 5   CSTMR_CNT      44982 non-null  int64  
 6   AMT            44982 non-null  int64  
 7   CNT            44982 non-null  int64  
 8   std_cnt        44982 non-null  int64  
 9   regular_power  44982 non-null  float64
dtypes: float64(1), int32(4), int64(5)
memory usage: 3.1 MB


In [52]:
# Display the encoded frame.
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CSTMR_CNT,AMT,CNT,std_cnt,regular_power
0,0,0,0,2019,0,4,311200,4,1,1.000000
1,0,0,0,2019,1,7,1517000,8,2,1.142857
2,0,0,0,2019,2,16,982750,17,3,1.062500
3,0,0,0,2019,3,4,266000,4,1,1.000000
4,0,0,0,2019,4,13,1057200,15,3,1.153846
...,...,...,...,...,...,...,...,...,...,...
44977,16,40,4,2019,10,167,18735900,246,28,1.473054
44978,16,40,4,2019,11,128,9446100,190,21,1.484375
44979,16,40,4,2020,0,149,12968610,233,21,1.563758
44980,16,40,4,2020,1,61,3895100,97,7,1.590164


## 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling  

In [16]:
# Features and target: the features are df_num without CSTMR_CNT, AMT and CNT;
# the target is AMT on a log1p scale.
excluded_from_features = ['CSTMR_CNT', 'AMT', 'CNT']
x = df_num.drop(columns=excluded_from_features)
y = np.log1p(df_num['AMT'])

In [4]:
# folds = KFold(n_splits=5, shuffle=True, random_state=1234)
# oof_preds = np.zeros(x.shape[0])
# sub_preds = np.zeros(temp.shape[0])
# categorical_features = list(df.dtypes[df.dtypes=='object'].index)

In [17]:
# Split into Seoul, Gyeonggi and all remaining regions.
# Rows with year 2020 and month code 2 form the validation set; all other rows are training data.
# NOTE(review): `month` is label-encoded in df_num, so code 2 is presumably calendar month 3 — confirm.
# NOTE(review): 8 and 1 are presumably the encoded CARD_SIDO_NM codes for Seoul and Gyeonggi —
# confirm against encoders['CARD_SIDO_NM'].classes_.
is_valid = (df_num['year'] == 2020) & (df_num['month'] == 2)
is_seoul = df_num['CARD_SIDO_NM'] == 8
is_gyeonggi = df_num['CARD_SIDO_NM'] == 1
is_other = ~df_num['CARD_SIDO_NM'].isin([1, 8])

s_train_ind = df_num.loc[~is_valid & is_seoul].index
s_valid_ind = df_num.loc[is_valid & is_seoul].index

k_train_ind = df_num.loc[~is_valid & is_gyeonggi].index
k_valid_ind = df_num.loc[is_valid & is_gyeonggi].index

other_train_ind = df_num.loc[~is_valid & is_other].index
other_valid_ind = df_num.loc[is_other & is_valid].index

In [34]:
# Select the train/validation rows by index label. The *_ind objects hold index
# labels taken from df_num (whose index x and y share), so label-based .loc is the
# matching accessor; positional .iloc only returns the same rows while the index
# happens to be exactly 0..n-1.
x_s_train = x.loc[s_train_ind]
x_s_val = x.loc[s_valid_ind]
y_s_train = y.loc[s_train_ind]
y_s_val = y.loc[s_valid_ind]

x_k_train = x.loc[k_train_ind]
x_k_val = x.loc[k_valid_ind]
y_k_train = y.loc[k_train_ind]
y_k_val = y.loc[k_valid_ind]

x_oth_train = x.loc[other_train_ind]
x_oth_val = x.loc[other_valid_ind]
y_oth_train = y.loc[other_train_ind]
y_oth_val = y.loc[other_valid_ind]

In [23]:
# Sanity check: print the lengths of the Seoul train/validation features and targets.
for part in (x_s_train, y_s_train, x_s_val, y_s_val):
    print(len(part))

2710
2710
192
192


## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [24]:
import lightgbm as lgb

In [25]:
def rmsle_lgbm(y_pred, data):
    """Custom LightGBM evaluation function: root mean squared log error.

    Args:
        y_pred: Array-like of predictions.
        data: Object exposing ``get_label()`` that returns the true values
            (LightGBM passes its Dataset here).

    Returns:
        A ``(name, score, is_higher_better)`` tuple: ``'rmsle'``, the square root
        of the mean squared difference between ``log1p(y_pred)`` and
        ``log1p(y_true)``, and ``False``.

    NOTE(review): in this notebook the target is already ``log1p(AMT)``, so the
    score is computed on a log-of-log scale — confirm that is intended.
    """
    y_true = np.asarray(data.get_label())
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    score = np.sqrt(np.mean(log_diff ** 2))
    return 'rmsle', score, False

In [26]:
# Seoul: wrap the train/validation splits in LightGBM datasets, declaring the
# object-dtype columns of `df` as categorical features for the training set.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
train_s = lgb.Dataset(data=x_s_train, label=y_s_train, categorical_feature=categorical_features)
val_s = lgb.Dataset(data=x_s_val, label=y_s_val)

In [27]:
# LightGBM training parameters shared by the three regional models.
# - objective 'tweedie' with variance power 1.1, learning rate 0.05, L2 penalty 0.1.
# - metric 'custom': presumably disables built-in metrics so that only the `feval`
#   function passed to lgb.train is reported — confirm against the LightGBM docs.
# - NOTE(review): 'sub_row' is presumably an alias of bagging_fraction, which LightGBM
#   only applies when bagging_freq is non-zero; bagging_freq is commented out here —
#   confirm whether row subsampling is actually active.
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'custom',
            'sub_row' : 0.75,
          # 'lambda_l1' : 0.1,
            'lambda_l2' : 0.1,
          # 'max_depth' : -1,
          # 'bagging_freq' : 5,
          # 'max_bin': 128
        }

In [28]:
# Train the Seoul model: at most 20000 boosting rounds, evaluated on val_s with the
# custom rmsle metric, stopping after 100 rounds without improvement and logging
# every 100 rounds.
model_s = lgb.train(
    params,
    train_s,
    num_boost_round=20000,
    valid_sets=val_s,
    feval=rmsle_lgbm,
    early_stopping_rounds=100,
    verbose_eval=100,
)

New categorical_feature is ['CARD_SIDO_NM', 'FLC', 'STD_CLSS_NM', 'month']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmsle: 0.0313482
[200]	valid_0's rmsle: 0.0296659
[300]	valid_0's rmsle: 0.0293572
[400]	valid_0's rmsle: 0.0290892
[500]	valid_0's rmsle: 0.0289958
[600]	valid_0's rmsle: 0.0289845
[700]	valid_0's rmsle: 0.0290163
Early stopping, best iteration is:
[642]	valid_0's rmsle: 0.0289689


In [29]:
# Gyeonggi: wrap the train/validation splits in LightGBM datasets, declaring the
# object-dtype columns of `df` as categorical features for the training set.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
train_k = lgb.Dataset(data=x_k_train, label=y_k_train, categorical_feature=categorical_features)
val_k = lgb.Dataset(data=x_k_val, label=y_k_val)

In [31]:
# Train the Gyeonggi model: at most 20000 boosting rounds, evaluated on val_k with
# the custom rmsle metric, stopping after 100 rounds without improvement and logging
# every 100 rounds.
model_k = lgb.train(
    params,
    train_k,
    num_boost_round=20000,
    valid_sets=val_k,
    feval=rmsle_lgbm,
    early_stopping_rounds=100,
    verbose_eval=100,
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmsle: 0.0213722
[200]	valid_0's rmsle: 0.0208884
Early stopping, best iteration is:
[133]	valid_0's rmsle: 0.0207108


In [35]:
# All other regions: wrap the train/validation splits in LightGBM datasets, declaring
# the object-dtype columns of `df` as categorical features for the training set.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
train_oth = lgb.Dataset(data=x_oth_train, label=y_oth_train, categorical_feature=categorical_features)
val_oth = lgb.Dataset(data=x_oth_val, label=y_oth_val)

In [36]:
# params = {
#             'learning_rate' : 0.05,
#             'boosting_type': 'gbdt',
#             'objective': 'tweedie',
#             'tweedie_variance_power': 1.1,
#             'metric': 'rmse',
#             'sub_row' : 0.75,
#             'lambda_l2' : 0.1,
#           # 'device' : 'gpu'
#         }

In [37]:
# Train the model for the remaining regions: at most 20000 boosting rounds, evaluated
# on val_oth with the custom rmsle metric, stopping after 100 rounds without
# improvement and logging every 100 rounds.
model_oth = lgb.train(
    params,
    train_oth,
    num_boost_round=20000,
    valid_sets=val_oth,
    feval=rmsle_lgbm,
    early_stopping_rounds=100,
    verbose_eval=100,
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmsle: 0.0264684
[200]	valid_0's rmsle: 0.0247928
[300]	valid_0's rmsle: 0.0241045
[400]	valid_0's rmsle: 0.0237347
[500]	valid_0's rmsle: 0.0233833
[600]	valid_0's rmsle: 0.0233642
[700]	valid_0's rmsle: 0.0233041
[800]	valid_0's rmsle: 0.0232741
Early stopping, best iteration is:
[739]	valid_0's rmsle: 0.023252


In [None]:
# Unexecuted XGBoost experiment (the cell has no execution count).
# NOTE(review): `xgb`, `dtrain` and `watchlist` are not defined anywhere in the visible
# part of this notebook, so this cell would raise NameError if run as-is.
# NOTE(review): the printed label says RMSLE while eval_metric is 'rmse' — presumably
# the target was expected to be log-transformed; confirm.
xgb_pars = {'min_child_weight': 1, 'eta': 0.5, 'colsample_bytree': 0.9, 
            'max_depth': 12,
'subsample': 0.9, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
'eval_metric': 'rmse', 'objective': 'reg:linear'}
# Train for at most 10 rounds, stopping after 2 rounds without improvement.
model = xgb.train(xgb_pars, dtrain, 10, watchlist, early_stopping_rounds=2,
      maximize=False, verbose_eval=1)
print('Modeling RMSLE %.5f' % model.best_score)

## 6. 결과 및 결언
## Conclusion & Discussion

In [58]:
# Show the feature column names and their order.
x.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month', 'std_cnt',
       'regular_power'],
      dtype='object')

In [60]:
# Build the prediction template: one row for every combination of the encoded
# CARD_SIDO_NM, STD_CLSS_NM and FLC codes present in df_num, for year 2020 and the
# two target month codes.
# NOTE(review): `month` is label-encoded, so codes 3 and 6 are presumably calendar
# months 4 and 7 — confirm.
from itertools import product

CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs = df_num['STD_CLSS_NM'].unique()
FLCs = df_num['FLC'].unique()
years = [2020]
months = [3, 6]

comb_list = [CARD_SIDO_NMs, STD_CLSS_NMs, FLCs, years, months]
template_rows = np.array(list(product(*comb_list)))
# The first five feature columns name the five template dimensions.
temp = pd.DataFrame(data=template_rows, columns=x.columns[:5])

In [56]:
# Display the prediction template.
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month
0,0,0,0,2020,3
1,0,0,0,2020,6
2,0,0,1,2020,3
3,0,0,1,2020,6
4,0,0,2,2020,3
...,...,...,...,...,...
6965,16,30,2,2020,6
6966,16,30,3,2020,3
6967,16,30,3,2020,6
6968,16,30,4,2020,3


In [40]:
# TODO (translated from the original note): July must not be halved with // 2 — needs fixing.
import math

# Estimate std_cnt / regular_power for the template rows from history:
# sum the counts per (CARD_SIDO_NM, STD_CLSS_NM, FLC, month) over all years in df_num.
tmp = df_num.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'month'])[['CSTMR_CNT', 'CNT', 'std_cnt']].sum().reset_index(drop=False)
# regular_power becomes ceil(regular(...) / 2), i.e. an integer.
# NOTE(review): the training rows use the unrounded CNT / CSTMR_CNT ratio for this
# feature, whereas here it is halved and rounded up — confirm this is intended.
tmp['regular_power'] = tmp.apply(lambda x : math.ceil(regular(x['CSTMR_CNT'], x['CNT'])/2), axis=1)
# Halve the summed count and add one.
# NOTE(review): presumably the halving assumes two years of data per month — confirm
# for every month code in `months`.
tmp['std_cnt'] = tmp['std_cnt'] // 2 + 1
# Keep only the month codes being predicted and drop the helper count columns.
tmp = tmp[tmp['month'].isin(months)].reset_index(drop=True)
tmp.drop(['CSTMR_CNT', 'CNT'], axis=1, inplace=True)
tmp.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,month,std_cnt,regular_power
0,0,0,0,3,1,1
1,0,0,0,6,3,1
2,0,0,1,3,9,1
3,0,0,1,6,8,1
4,0,0,2,3,8,1


In [41]:
# Attach the estimated std_cnt / regular_power features to the template; every
# value still missing after the left merge is filled with 1.0.
feature_keys = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'month']
temp = temp.merge(tmp, how='left', on=feature_keys)
temp = temp.fillna(1.0)

In [62]:
# Display the prediction template again.
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month
0,0,0,0,2020,3
1,0,0,0,2020,6
2,0,0,1,2020,3
3,0,0,1,2020,6
4,0,0,2,2020,3
...,...,...,...,...,...
6965,16,30,2,2020,6
6966,16,30,3,2020,3
6967,16,30,3,2020,6
6968,16,30,4,2020,3


In [1]:
# tmp = df_num.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'month'])[['FLC_1', 'FLC_2', 'FLC_3', 'FLC_4', 'FLC_5']].sum().reset_index(drop=False)
# temp = temp.merge(tmp, on=['CARD_SIDO_NM', 'STD_CLSS_NM', 'month'], how='left')
# temp.fillna(0, inplace=True)
# temp.loc[temp['month']==2, ['FLC_1', 'FLC_2', 'FLC_3', 'FLC_4', 'FLC_5']] = temp.loc[temp['month']==2, ['FLC_1', 'FLC_2', 'FLC_3', 'FLC_4', 'FLC_5']] * 0.4

In [43]:
# Prediction: score each regional slice of the template with its own model and
# undo the log1p transform that was applied to the training target.
seoul_rows = temp[temp['CARD_SIDO_NM'] == 8]
gyeonggi_rows = temp[temp['CARD_SIDO_NM'] == 1]
other_rows = temp[~temp['CARD_SIDO_NM'].isin([1, 8])]

pred_s = np.expm1(model_s.predict(seoul_rows))
pred_k = np.expm1(model_k.predict(gyeonggi_rows))
pred_oth = np.expm1(model_oth.predict(other_rows))

In [44]:
# Decoding: write the rounded predictions back per region slice, rebuild REG_YYMM,
# sum AMT per (REG_YYMM, CARD_SIDO_NM, STD_CLSS_NM) and map the encoded region and
# industry codes back to their original labels.
in_seoul = temp['CARD_SIDO_NM'] == 8
in_gyeonggi = temp['CARD_SIDO_NM'] == 1
in_other = ~temp['CARD_SIDO_NM'].isin([1, 8])
temp.loc[in_seoul, 'AMT'] = np.round(pred_s, 0)
temp.loc[in_gyeonggi, 'AMT'] = np.round(pred_k, 0)
temp.loc[in_other, 'AMT'] = np.round(pred_oth, 0)

# NOTE(review): the + 1 presumably converts the zero-based month code back to a
# calendar month — confirm.
temp['REG_YYMM'] = temp['year'] * 100 + temp['month'] + 1
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])['AMT'].sum().reset_index()
for encoded_column in ('CARD_SIDO_NM', 'STD_CLSS_NM'):
    temp[encoded_column] = encoders[encoded_column].inverse_transform(temp[encoded_column])

In [45]:
# Regions that have no record of a given industry as of March 2020.
sido = df['CARD_SIDO_NM'].unique()
std = df['STD_CLSS_NM'].unique()

# Distinct (region, industry) pairs that do appear in the 2020-03 rows of `df`.
tmp = df[(df['month'] == 3) & (df['year'] == 2020)]
tmp = tmp[['CARD_SIDO_NM', 'STD_CLSS_NM']]
tmp = tmp.drop_duplicates(['CARD_SIDO_NM', 'STD_CLSS_NM'], keep='first').reset_index(drop=True)

# d maps each region to the list of industries (in `std` order) with no 2020-03 row.
# A set lookup replaces the original fill-then-list.remove loops and yields the same lists.
present = set(zip(tmp['CARD_SIDO_NM'], tmp['STD_CLSS_NM']))
d = {s: [c for c in std if (s, c) not in present] for s in sido}

In [46]:
# Zero out the predicted AMT of every row whose industry is listed in `d` for its
# region. Each region is looked up in `d` directly (regions not in `d` are left
# untouched) and all matching rows are updated in one masked assignment instead of
# scanning d.items() and writing cell by cell.
absent = pd.Series(
    [
        std_name in d.get(sido_name, ())
        for sido_name, std_name in zip(temp['CARD_SIDO_NM'], temp['STD_CLSS_NM'])
    ],
    index=temp.index,
    dtype=bool,
)
temp.loc[absent, 'AMT'] = 0

In [None]:
# Build the submission file: read the template, discard its AMT column and pull AMT
# from the predictions by matching on the three key columns.
merge_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = pd.read_csv('./submission.csv', index_col=0).drop(columns=['AMT'])
submission = submission.merge(temp, on=merge_keys, how='left')
submission.index.name = 'id'

# for i in range(len(submission)):
#   for j in range(len(sido)):
#     if (submission.loc[i, 'CARD_SIDO_NM'] == sido[j]) and (submission.loc[i, 'STD_CLSS_NM'] == std[j]):
#       submission.loc[i, 'AMT'] = 0

In [None]:
# Write the result as CSV with a UTF-8 BOM ('utf-8-sig') and preview the first rows.
# NOTE(review): this writes to the same relative path the template was read from
# ('submission.csv'), so the template file is overwritten — confirm that is intended.
submission.to_csv('submission.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,113586600.0
1,202004,강원,골프장 운영업,2054297000.0
2,202004,강원,과실 및 채소 소매업,866735000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,17025740.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.0


In [None]:
# 1.920