In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the source CSV from the mounted Google Drive folder into `data`.
raw_csv_path = '/content/drive/My Drive/SAMDASOODA_DACON/data/201901-202003.csv'
data = pd.read_csv(raw_csv_path)

In [None]:
def grap_year(data):
    """Return the year portion of a YYYYMM-style value as an int.

    The argument is converted with ``str()`` and its first four characters
    are parsed, so ``201901`` and ``'201901'`` both yield ``2019``.
    """
    yyyymm = str(data)
    year_digits = yyyymm[:4]
    return int(year_digits)

def grap_month(data):
    """Return the month portion of a YYYYMM-style value as an int.

    Everything after the fourth character of ``str(data)`` is parsed, so
    ``201901`` yields ``1`` and ``'201912'`` yields ``12``.
    """
    return int(str(data)[4:])

In [None]:
# Replace missing values with empty strings, then split REG_YYMM into integer
# `year` and `month` columns and discard the original column.
# NOTE(review): this cell rebinds `data`, so running it a second time fails
# because REG_YYMM has already been dropped.
data = data.fillna('')
reg_yymm = data['REG_YYMM']
data['year'] = reg_yymm.map(grap_year)
data['month'] = reg_yymm.map(grap_month)
data = data.drop(columns=['REG_YYMM'])

In [None]:
# Data cleaning: drop the CARD_CCG_NM / HOM_CCG_NM columns and sum the
# remaining value columns for every combination of the grouping keys below.
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = (
    data
    .drop(columns=['CARD_CCG_NM', 'HOM_CCG_NM'])
    .groupby(columns)
    .sum()
    .reset_index()
)

In [None]:
# Encoding: fit one LabelEncoder per object-dtype column and keep it in
# `encoders` (keyed by column name) so the codes can be decoded again later.
encoders = {
    name: LabelEncoder().fit(df[name])
    for name, dtype in df.dtypes.items()
    if str(dtype) == 'object'
}

# `df_num` is a copy of `df` in which each encoded column holds integer codes.
df_num = df.copy()
for name, fitted in encoders.items():
    df_num[name] = fitted.transform(df[name])

In [None]:
# SIDOCHAI is 1 where the CARD_SIDO_NM code differs from the HOM_SIDO_NM code,
# otherwise 0.
# NOTE(review): the two columns are encoded by separate LabelEncoders, so the
# comparison is only meaningful if both columns contain the same set of
# values - confirm.
a = df_num['CARD_SIDO_NM'] != df_num['HOM_SIDO_NM']
df_num['SIDOCHAI'] = a.astype('int64')

In [None]:
# PERSON takes CNT where CSTMR_CNT exceeds CNT and CSTMR_CNT otherwise, i.e.
# the smaller of the two counts; rows matching neither mask stay 0.
b = df_num['CSTMR_CNT'] > df_num['CNT']
c = df_num['CSTMR_CNT'] <= df_num['CNT']
df_num['PERSON'] = np.select([b, c], [df_num['CNT'], df_num['CSTMR_CNT']], default=0)

In [None]:
# Replace PERSON with the index of its quantile bin (five quantile-based bins).
# NOTE(review): the column is overwritten in place, so re-running this cell
# bins the bin indices rather than the original counts.
df_num['PERSON'] = pd.qcut(df_num['PERSON'], q=5, labels=False)

In [None]:
# Inspect the encoded frame, including the derived SIDOCHAI and PERSON columns.
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,SIDOCHAI,PERSON
0,0,0,0,1,1,1,2019,1,4,311200,4,0,0
1,0,0,0,1,1,1,2019,2,3,605000,3,0,0
2,0,0,0,1,1,1,2019,6,3,139000,3,0,0
3,0,0,0,1,1,1,2019,8,3,27500,3,0,0
4,0,0,0,1,1,1,2019,9,3,395500,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057389,16,40,16,6,2,5,2019,3,3,148000,4,0,0
1057390,16,40,16,6,2,5,2019,5,5,329800,7,0,0
1057391,16,40,16,6,2,5,2019,10,7,557800,7,0,1
1057392,16,40,16,6,2,5,2019,12,3,247800,3,0,0


In [None]:
# Feature / target setup.
# Shuffle the rows reproducibly; the train/validation split later in the
# notebook is a plain positional cut of this shuffled frame.
train_num = df_num.sample(frac=1, random_state=0)

# Train only on columns that the prediction template (built later in this
# notebook) can supply. SIDOCHAI needs HOM_SIDO_NM and PERSON is derived from
# CSTMR_CNT / CNT, none of which are present in that template, so a model
# trained on them would be handed a feature matrix of a different width at
# predict time. PERSON is additionally computed from the same rows whose AMT
# is being predicted, which inflates the validation score.
feature_columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
x = train_num[feature_columns]

# The target is log1p(AMT); predictions are mapped back with expm1.
y = np.log1p(train_num['AMT'])

In [None]:
# Number of leading rows used for training: 90% of the shuffled rows.
train_fraction = 0.9
k = int(len(x) * train_fraction)

In [None]:
# Positional hold-out split: the first k rows train, the remainder validate.
x_train, x_val = x.iloc[:k], x.iloc[k:]
y_train, y_val = y.iloc[:k], y.iloc[k:]

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the training and validation splits as LightGBM Dataset objects.
train_ds = lgb.Dataset(data=x_train, label=y_train)
val_ds = lgb.Dataset(data=x_val, label=y_val)

In [None]:
# LightGBM training parameters.
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    # `sub_row` is an alias of `bagging_fraction`. LightGBM only applies row
    # subsampling when `bagging_freq` is non-zero, so without the next entry
    # the 0.75 fraction is silently ignored.
    'sub_row': 0.75,
    'bagging_freq': 1,
    'lambda_l2': 0.1,
}

In [None]:
# Train for up to 1000 boosting rounds, evaluating on the validation set.
# Early stopping and periodic logging are configured through callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
# accepted by `lgb.train` in LightGBM 4.
model = lgb.train(
    params,
    train_ds,
    num_boost_round=1000,
    valid_sets=[val_ds],
    callbacks=[
        # Stop once the validation metric has not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100),
        # Print the validation metric every 100 rounds.
        lgb.log_evaluation(period=100),
    ],
)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 0.815128
[200]	valid_0's rmse: 0.746982
[300]	valid_0's rmse: 0.720203
[400]	valid_0's rmse: 0.704419
[500]	valid_0's rmse: 0.691268
[600]	valid_0's rmse: 0.679819
[700]	valid_0's rmse: 0.672386
[800]	valid_0's rmse: 0.665343
[900]	valid_0's rmse: 0.660949
[1000]	valid_0's rmse: 0.656698
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.656698


In [None]:
# Only the last expression of a cell is rendered, so this cell shows `x`
# (the feature frame); the bare `df_num` line produces no output.
df_num
x

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month,SIDOCHAI,PERSON
149245,2,4,2,1,2,2019,4,0,0
554956,8,26,1,1,2,2020,2,1,0
918516,14,33,4,2,4,2019,5,0,4
425414,7,18,2,2,1,2019,10,1,3
640949,9,39,5,2,5,2020,1,1,1
...,...,...,...,...,...,...,...,...,...
359783,6,16,1,1,1,2019,12,1,4
152315,2,7,3,2,3,2020,3,1,0
963395,15,19,5,1,5,2020,2,1,1
117952,1,31,1,1,2,2019,7,1,0


In [None]:
# Build the prediction template: one row for every combination of the observed
# CARD_SIDO_NM, STD_CLSS_NM, AGE, SEX_CTGO_CD and FLC codes with each target
# year/month listed below.
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

# Insertion order fixes the column order; the last level varies fastest,
# which matches a nested loop written in this same order.
template_levels = {
    'CARD_SIDO_NM': CARD_SIDO_NMs,
    'STD_CLSS_NM': STD_CLSS_NMs,
    'AGE': AGEs,
    'SEX_CTGO_CD': SEX_CTGO_CDs,
    'FLC': FLCs,
    'year': years,
    'month': months,
}
temp = pd.MultiIndex.from_product(
    list(template_levels.values()),
    names=list(template_levels.keys()),
).to_frame(index=False)

In [None]:
# Preview the prediction template built in the previous cell.
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month
0,0,0,1,1,1,2020,4
1,0,0,1,1,1,2020,7
2,0,0,1,1,2,2020,4
3,0,0,1,1,2,2020,7
4,0,0,1,1,3,2020,4
...,...,...,...,...,...,...,...
97575,16,30,0,2,3,2020,7
97576,16,30,0,2,4,2020,4
97577,16,30,0,2,4,2020,7
97578,16,30,0,2,5,2020,4


In [None]:
# Prediction: score every template row, undo the log1p applied to the training
# target, then sum the rounded amounts per (REG_YYMM, CARD_SIDO_NM, STD_CLSS_NM).
# NOTE(review): `temp` must carry the same feature columns, in the same order,
# as the frame the model was trained on - confirm before relying on the output.
pred = np.expm1(model.predict(temp))
temp = (
    temp
    .assign(
        AMT=np.round(pred, 0),
        REG_YYMM=lambda frame: frame['year'] * 100 + frame['month'],
    )
    .loc[:, ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
    .groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])
    .sum()
    .reset_index()
)

In [None]:
# Preview the predictions summed per REG_YYMM / CARD_SIDO_NM / STD_CLSS_NM.
temp

In [None]:
# Decoding: map the integer codes of both key columns back to their original
# labels using the encoders fitted earlier.
for key_column in ('CARD_SIDO_NM', 'STD_CLSS_NM'):
    temp[key_column] = encoders[key_column].inverse_transform(temp[key_column])

In [None]:
# Build the submission file: take the key columns from the submission CSV,
# attach the predicted AMT with a left join, and write the result out.
merge_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = (
    pd.read_csv('/content/drive/My Drive/SAMDASOODA_DACON/data/submission.csv', index_col=0)
    .drop(columns=['AMT'])
    .merge(temp, on=merge_keys, how='left')
)
submission.index.name = 'id'
submission.to_csv('/content/drive/My Drive/SAMDASOODA_DACON/data/submission2.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,15367181.0
1,202004,강원,골프장 운영업,20867661.0
2,202004,강원,과실 및 채소 소매업,7995668.0
3,202004,강원,관광 민예품 및 선물용품 소매업,5846756.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,7228213.0


In [None]:
# Overwrite the predicted AMT with 0 for every row of this one STD_CLSS_NM value.
# NOTE(review): the reason for this manual override is not recorded in the
# notebook - confirm it is still intended.
zeroed_rows = submission['STD_CLSS_NM'] == '그외 기타 분류안된 오락관련 서비스업'
submission.loc[zeroed_rows, 'AMT'] = 0

In [None]:
# Write `submission` to submission2.csv again, replacing the file at that path.
submission_path = '/content/drive/My Drive/SAMDASOODA_DACON/data/submission2.csv'
submission.to_csv(submission_path, encoding='utf-8-sig')