In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family='NanumGothic')
rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR, SVR
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from xgboost import XGBRegressor

In [4]:
df_raw = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [5]:
df_raw.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


# Data Preprocessing

In [6]:
df = df_raw

In [7]:
df.fillna('세종시', inplace=True)

In [8]:
df.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

In [9]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [10]:
df['year'] = df['REG_YYMM'].apply(lambda x: grap_year(x))
df['month'] = df['REG_YYMM'].apply(lambda x: grap_month(x))
df = df.drop(['REG_YYMM'], axis=1)

In [11]:
df.drop(['CARD_CCG_NM', 'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD'], axis=1, inplace=True)

# 업종 카운트 데이터 생성

In [12]:
# 업종 카운트 데이터
tmp = df.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month']).count()
tmp = tmp[['CNT']].rename(columns={'CNT':'std_cnt'})

In [13]:
# 데이터 정제
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month']

df = df.groupby(columns).sum().reset_index(drop=False)
df = df.merge(tmp, how='left', on=columns)
# df = df.merge(flcs, how='left', on=columns)
df.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CSTMR_CNT,AMT,CNT,std_cnt
0,강원,건강보조식품 소매업,1,2019,1,4,311200,4,1
1,강원,건강보조식품 소매업,1,2019,2,7,1517000,8,2
2,강원,건강보조식품 소매업,1,2019,3,16,982750,17,3
3,강원,건강보조식품 소매업,1,2019,4,4,266000,4,1
4,강원,건강보조식품 소매업,1,2019,5,13,1057200,15,3


In [14]:
# 단골 지수
def regular(x, y):
    if x == 0:
        x += 1
    return y / x

df['regular_power'] = df.apply(lambda x: regular(x['CSTMR_CNT'], x['CNT']), axis=1)

In [15]:
def make_season(data):
    if data in [12, 1, 2]:
        return 'Winter'
    elif data in [3, 4, 5]:
        return 'Spring'
    elif data in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

In [16]:
df['Season'] = df['month'].apply(lambda x: make_season(x))

In [17]:
df['CV'] = 0
df.loc[(df['year']==2020) & (df['month']==1), 'CV'] = 1
df.loc[(df['year']==2020) & (df['month']==2), 'CV'] = 2
df.loc[(df['year']==2020) & (df['month']==3), 'CV'] = 3

In [18]:
df['FLC'] = df['FLC'].astype('object')
df['month'] = df['month'].astype('object')

In [19]:
df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CSTMR_CNT,AMT,CNT,std_cnt,regular_power,Season,CV
0,강원,건강보조식품 소매업,1,2019,1,4,311200,4,1,1.000000,Winter,0
1,강원,건강보조식품 소매업,1,2019,2,7,1517000,8,2,1.142857,Winter,0
2,강원,건강보조식품 소매업,1,2019,3,16,982750,17,3,1.062500,Spring,0
3,강원,건강보조식품 소매업,1,2019,4,4,266000,4,1,1.000000,Spring,0
4,강원,건강보조식품 소매업,1,2019,5,13,1057200,15,3,1.153846,Spring,0
...,...,...,...,...,...,...,...,...,...,...,...,...
44977,충북,휴양콘도 운영업,5,2019,11,167,18735900,246,28,1.473054,Autumn,0
44978,충북,휴양콘도 운영업,5,2019,12,128,9446100,190,21,1.484375,Winter,0
44979,충북,휴양콘도 운영업,5,2020,1,149,12968610,233,21,1.563758,Winter,1
44980,충북,휴양콘도 운영업,5,2020,2,61,3895100,97,7,1.590164,Winter,2


In [20]:
df['FLC'] = df['FLC'].astype('object')
df['month'] = df['month'].astype('object')

In [21]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [22]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44982 entries, 0 to 44981
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CARD_SIDO_NM   44982 non-null  int32  
 1   STD_CLSS_NM    44982 non-null  int32  
 2   FLC            44982 non-null  int32  
 3   year           44982 non-null  int64  
 4   month          44982 non-null  int32  
 5   CSTMR_CNT      44982 non-null  int64  
 6   AMT            44982 non-null  int64  
 7   CNT            44982 non-null  int64  
 8   std_cnt        44982 non-null  int64  
 9   regular_power  44982 non-null  float64
 10  Season         44982 non-null  int32  
 11  CV             44982 non-null  int64  
dtypes: float64(1), int32(5), int64(6)
memory usage: 3.6 MB


In [23]:
df.CARD_SIDO_NM.unique()

array(['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산',
       '인천', '전남', '전북', '제주', '충남', '충북'], dtype=object)

## 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling  

In [26]:
# feature, target 설정
x = df_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = np.log1p(df_num['AMT'])

In [27]:
s_train_ind = s_train_ind = df[~((df['year']==2020) & (df['month']==3)) & (df['CARD_SIDO_NM']=='서울')].index

In [28]:
# 서울, 경기, 그외 지역으로 분리
s_train_ind = df[~((df['year']==2020) & (df['month']==3)) & (df['CARD_SIDO_NM']=='서울')].index
s_valid_ind = df[(df['year']==2020) & (df['month']==3) & (df['CARD_SIDO_NM']=='서울')].index

k_train_ind = df[~((df['year']==2020) & (df['month']==3)) & (df['CARD_SIDO_NM']=='경기')].index
k_valid_ind = df[(df['year']==2020) & (df['month']==3) & (df['CARD_SIDO_NM']=='경기')].index

j_train_ind = df[~((df['year']==2020) & (df['month']==3)) & (df['CARD_SIDO_NM']=='제주')].index
j_valid_ind = df[(df['year']==2020) & (df['month']==3) & (df['CARD_SIDO_NM']=='제주')].index

other_train_ind = df[~((df['year'] == 2020) & (df['month'] == 3)) & ~(df['CARD_SIDO_NM'].isin(['서울', '경기', '제주']))].index
other_valid_ind = df[~df['CARD_SIDO_NM'].isin(['서울', '경기', '제주']) & (df['year'] == 2020) & (df['month'] == 3)].index

In [29]:
x_s_train = x.iloc[s_train_ind]
x_s_val = x.iloc[s_valid_ind]
y_s_train = y.iloc[s_train_ind]
y_s_val = y.iloc[s_valid_ind]

x_k_train = x.iloc[k_train_ind]
x_k_val = x.iloc[k_valid_ind]
y_k_train = y.iloc[k_train_ind]
y_k_val = y.iloc[k_valid_ind]

x_j_train = x.iloc[j_train_ind]
x_j_val = x.iloc[j_valid_ind]
y_j_train = y.iloc[j_train_ind]
y_j_val = y.iloc[j_valid_ind]

x_oth_train = x.iloc[other_train_ind]
x_oth_val = x.iloc[other_valid_ind]
y_oth_train = y.iloc[other_train_ind]
y_oth_val = y.iloc[other_valid_ind]

In [30]:
print(len(x_s_train))
print(len(y_s_train))
print(len(x_s_val))
print(len(y_s_val))

2710
2710
192
192


In [31]:
XGB_model = XGBRegressor(learning_rate=0.3, max_depth=6, n_estimators=500)
XGB_model.fit(x_s_train, y_s_train)
y_XGB_predict = XGB_model.predict(x_s_val)

In [32]:
mean_squared_error(y_XGB_predict, y_s_val)

0.171100881231924

In [33]:
y_XGB_predict

array([16.310707, 18.414347, 18.418999, 19.402185, 19.560858, 13.849648,
       16.632502, 18.033276, 19.030632, 18.521885, 19.84083 , 21.435293,
       21.13557 , 22.166647, 22.059439, 16.475061, 16.8193  , 15.8172  ,
       16.223171, 16.546776, 14.71153 , 14.732451, 14.889422, 13.897188,
       11.803713, 21.707321, 23.529276, 22.782612, 22.835949, 22.391802,
       22.822538, 23.705297, 23.203386, 23.759031, 23.218279, 14.625119,
       14.004339, 13.283588, 11.064463, 21.01227 , 21.298172, 21.039614,
       21.570726, 20.755268, 18.728514, 19.185207, 18.714361, 20.063076,
       19.697653, 23.439865, 22.734674, 21.431362, 22.006685, 20.989592,
       18.438498, 19.297857, 19.244724, 20.035732, 19.438349, 16.42789 ,
       17.42209 , 17.2213  , 17.0502  , 16.750021, 15.216626, 17.032782,
       16.159264, 15.991626, 20.822563, 20.461645, 20.187824, 20.515354,
       20.781752, 22.913118, 22.749744, 22.214422, 22.093842, 21.780527,
       21.57746 , 22.162842, 21.850693, 21.657724, 

  ## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [177]:
import lightgbm as lgb

In [185]:
def rmsle_lgbm(y_pred, data):
    y_true = np.array(data.get_label())
    score = np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))
  
    return 'rmsle', score, False

In [202]:
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
          # 'lambda_l1' : 0.1,
            'lambda_l2' : 0.1,
          # 'max_depth' : -1,
          # 'bagging_freq' : 5,
          # 'max_bin': 128
        }

In [203]:
# 서울
categorical_features = list(df.dtypes[df.dtypes=='object'].index)
train_s = lgb.Dataset(x_s_train, label=y_s_train, categorical_feature=categorical_features)
val_s = lgb.Dataset(x_s_val, label=y_s_val)

In [204]:
model_s = lgb.train(params,
                  train_s,
                  20000,
                  val_s,
                  verbose_eval = 100,
                  early_stopping_rounds = 100,
                  feval=rmsle_lgbm
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.581347	valid_0's rmsle: 0.0307954
[200]	valid_0's rmse: 0.539808	valid_0's rmsle: 0.0287269
[300]	valid_0's rmse: 0.535693	valid_0's rmsle: 0.028477
[400]	valid_0's rmse: 0.535967	valid_0's rmsle: 0.0285085
[500]	valid_0's rmse: 0.535819	valid_0's rmsle: 0.0284761
Early stopping, best iteration is:
[429]	valid_0's rmse: 0.534556	valid_0's rmsle: 0.0284377


In [205]:
# 경기
categorical_features = list(df.dtypes[df.dtypes=='object'].index)
train_k = lgb.Dataset(x_k_train, label=y_k_train, categorical_feature=categorical_features)
val_k = lgb.Dataset(x_k_val, label=y_k_val)

In [206]:
model_k = lgb.train(params,
                  train_k,
                  20000,
                  val_k,
                  verbose_eval = 100,
                  early_stopping_rounds = 100,
                  feval=rmsle_lgbm
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.370306	valid_0's rmsle: 0.0215524
[200]	valid_0's rmse: 0.363244	valid_0's rmsle: 0.0212331
[300]	valid_0's rmse: 0.360985	valid_0's rmsle: 0.0212317
Early stopping, best iteration is:
[219]	valid_0's rmse: 0.361739	valid_0's rmsle: 0.0211595


In [207]:
# 제주
categorical_features = list(df.dtypes[df.dtypes=='object'].index)
train_j = lgb.Dataset(x_j_train, label=y_j_train, categorical_feature=categorical_features)
val_j = lgb.Dataset(x_j_val, label=y_j_val)

In [208]:
model_j = lgb.train(params,
                  train_j,
                  20000,
                  val_j,
                  verbose_eval = 100,
                  early_stopping_rounds = 100,
                  feval=rmsle_lgbm
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.53614	valid_0's rmsle: 0.0311135
[200]	valid_0's rmse: 0.470563	valid_0's rmsle: 0.028472
[300]	valid_0's rmse: 0.444586	valid_0's rmsle: 0.0270741
[400]	valid_0's rmse: 0.436168	valid_0's rmsle: 0.0265152
[500]	valid_0's rmse: 0.431651	valid_0's rmsle: 0.0262495
[600]	valid_0's rmse: 0.428663	valid_0's rmsle: 0.026081
[700]	valid_0's rmse: 0.425419	valid_0's rmsle: 0.0258633
[800]	valid_0's rmse: 0.42364	valid_0's rmsle: 0.0257863
[900]	valid_0's rmse: 0.421449	valid_0's rmsle: 0.0256217
[1000]	valid_0's rmse: 0.420491	valid_0's rmsle: 0.0255487
[1100]	valid_0's rmse: 0.418918	valid_0's rmsle: 0.0254653
[1200]	valid_0's rmse: 0.417848	valid_0's rmsle: 0.0253899
[1300]	valid_0's rmse: 0.416861	valid_0's rmsle: 0.0253345
[1400]	valid_0's rmse: 0.416766	valid_0's rmsle: 0.0253179
[1500]	valid_0's rmse: 0.415616	valid_0's rmsle: 0.0252608
[1600]	valid_0's rmse: 0.415911	valid_0's rmsle: 0.0252985
Early s

In [209]:
# 그 외 지역
categorical_features = list(df.dtypes[df.dtypes=='object'].index)
train_oth = lgb.Dataset(x_oth_train, label=y_oth_train, categorical_feature=categorical_features)
val_oth = lgb.Dataset(x_oth_val, label=y_oth_val)

In [210]:
model_oth = lgb.train(params,
                  train_oth,
                  20000,
                  val_oth,
                  verbose_eval = 100,
                  early_stopping_rounds = 100,
                  feval=rmsle_lgbm
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.413954	valid_0's rmsle: 0.0256424
[200]	valid_0's rmse: 0.381746	valid_0's rmsle: 0.023897
[300]	valid_0's rmse: 0.366914	valid_0's rmsle: 0.0231998
[400]	valid_0's rmse: 0.364619	valid_0's rmsle: 0.0232819
Early stopping, best iteration is:
[333]	valid_0's rmse: 0.36533	valid_0's rmsle: 0.0231604


## 6. 결과 및 결언
## Conclusion & Discussion

In [211]:
x.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'year', 'month', 'CV', 'std_cnt',
       'regular_power', 'Season'],
      dtype='object')

In [222]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
FLCs          = df_num['FLC'].unique()
CVs           = [3]
Seasons       = df_num['Season'].unique()
years         = [2020]
months        = [3, 6]

temp = []
from itertools import product
comb_list = [CARD_SIDO_NMs, STD_CLSS_NMs, FLCs, years, months, CVs, Seasons]
temp = np.array(list(product(*comb_list)))
temp = pd.DataFrame(data=temp, columns=x.columns[:7])

In [223]:
temp.drop(['std_cnt'], axis=1, inplace=True)

In [224]:
# 7월은 // 2하면 안됨. 수정 필요
import math

tmp = df_num.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'month'])[['CSTMR_CNT', 'CNT', 'std_cnt']].sum().reset_index(drop=False)
tmp['regular_power'] = tmp.apply(lambda x : math.ceil(regular(x['CSTMR_CNT'], x['CNT'])/2), axis=1)
tmp['std_cnt'] = tmp['std_cnt'] // 2 + 1
tmp = tmp[tmp['month'].isin(months)].reset_index(drop=True)
tmp.drop(['CSTMR_CNT', 'CNT'], axis=1, inplace=True)
tmp.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,month,std_cnt,regular_power
0,0,0,0,3,1,1
1,0,0,0,6,3,1
2,0,0,1,3,9,1
3,0,0,1,6,8,1
4,0,0,2,3,8,1


In [225]:
temp = temp.merge(tmp, on=['CARD_SIDO_NM', 'STD_CLSS_NM', 'FLC', 'month'], how='left')
temp.fillna(1.0, inplace=True)

In [228]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,FLC,year,month,CV,std_cnt,regular_power
0,0,0,0,2020,3,3,1.0,1.0
1,0,0,0,2020,3,3,1.0,1.0
2,0,0,0,2020,3,3,1.0,1.0
3,0,0,0,2020,3,3,1.0,1.0
4,0,0,0,2020,6,3,3.0,1.0
...,...,...,...,...,...,...,...,...
27875,16,30,4,2020,3,3,1.0,1.0
27876,16,30,4,2020,6,3,1.0,1.0
27877,16,30,4,2020,6,3,1.0,1.0
27878,16,30,4,2020,6,3,1.0,1.0


In [229]:
df.CARD_SIDO_NM.unique()

array(['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산',
       '인천', '전남', '전북', '제주', '충남', '충북'], dtype=object)

In [230]:
# 예측
pred_s = model_s.predict(temp[temp['CARD_SIDO_NM'] == 8])
pred_s = np.expm1(pred_s)

pred_k = model_k.predict(temp[temp['CARD_SIDO_NM'] == 1])
pred_k = np.expm1(pred_k)

pred_j = model_j.predict(temp[temp['CARD_SIDO_NM'] == 14])
pred_j = np.expm1(pred_j)

pred_oth = model_oth.predict(temp[~temp['CARD_SIDO_NM'].isin([1,8,14])])
pred_oth = np.expm1(pred_oth)

In [231]:
# 디코딩
temp.loc[temp['CARD_SIDO_NM'] == 8, 'AMT'] = np.round(pred_s, 0)
temp.loc[temp['CARD_SIDO_NM'] == 1, 'AMT'] = np.round(pred_k, 0)
temp.loc[temp['CARD_SIDO_NM'] == 14, 'AMT'] = np.round(pred_j, 0)
temp.loc[~temp['CARD_SIDO_NM'].isin([1,8,14]), 'AMT'] = np.round(pred_oth, 0)

temp['REG_YYMM'] = temp['year']*100 + temp['month'] + 1
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])['AMT'].sum().reset_index(drop=False)
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [232]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,건강보조식품 소매업,2.139321e+08
1,202004,강원,골프장 운영업,3.432951e+09
2,202004,강원,과실 및 채소 소매업,2.099542e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,2.822536e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,2.492472e+06
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,2.981553e+09
1390,202007,충북,한식 음식점업,5.796735e+10
1391,202007,충북,호텔업,5.042792e+07
1392,202007,충북,화장품 및 방향제 소매업,1.201859e+09


In [233]:
# 2020년 3월 기준 특정 업종이 없는 지역
sido = df['CARD_SIDO_NM'].unique()
std = df['STD_CLSS_NM'].unique()

tmp = df[(df['month'] == 3) & (df['year'] == 2020)]
tmp = tmp[['CARD_SIDO_NM', 'STD_CLSS_NM']]
tmp = tmp.drop_duplicates(['CARD_SIDO_NM', 'STD_CLSS_NM'], keep='first').reset_index(drop=True)

d = {}

for s in sido:
    d[s] = []
    for c in std:
        d[s].append(c)

for i in tmp.index:
    d[tmp.loc[i, 'CARD_SIDO_NM']].remove(tmp.loc[i, 'STD_CLSS_NM'])

In [234]:
for i in temp.index:
    for k, v in d.items():
        if k == temp.loc[i, 'CARD_SIDO_NM']:
            if temp.loc[i, 'STD_CLSS_NM'] in v:
                temp.loc[i, 'AMT'] = 0

In [237]:
# 제출 파일 만들기
submission = pd.read_csv('../data/jeju_data_ver1/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'

In [238]:
submission

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,2.139321e+08
1,202004,강원,골프장 운영업,3.432951e+09
2,202004,강원,과실 및 채소 소매업,2.099542e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,2.822536e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.000000e+00
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,2.981553e+09
1390,202007,충북,한식 음식점업,5.796735e+10
1391,202007,충북,호텔업,5.042792e+07
1392,202007,충북,화장품 및 방향제 소매업,1.201859e+09


In [239]:
submission.to_csv('submission.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,213932100.0
1,202004,강원,골프장 운영업,3432951000.0
2,202004,강원,과실 및 채소 소매업,2099542000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,28225360.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.0
