In [47]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family='NanumGothic')
rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [48]:
df_raw = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [49]:
df_raw.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


In [50]:
df_raw.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

#### 1, 2, 3월 데이터만 추출

In [51]:
df = df_raw[(df_raw['REG_YYMM'] == 202001) | (df_raw['REG_YYMM'] == 202002) | (df_raw['REG_YYMM'] == 202003)]
df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
20425415,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,2,1,3,345000,3
20425416,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,3,1903450,3
20425417,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,14,1520500,15
20425418,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,9,1239200,9
20425419,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,606700,4


In [52]:
df.fillna('세종시', inplace=True)

In [53]:
df.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

In [54]:
df.drop(['CARD_CCG_NM','HOM_SIDO_NM','HOM_CCG_NM'], axis=1, inplace=True)

In [55]:
df = df.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC']).sum().reset_index()

In [56]:
df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,202001,강원,건강보조식품 소매업,20s,1,1,6,168500,6
1,202001,강원,건강보조식품 소매업,20s,2,1,13,764646,13
2,202001,강원,건강보조식품 소매업,30s,1,2,72,12017410,73
3,202001,강원,건강보조식품 소매업,30s,2,2,88,9867625,89
4,202001,강원,건강보조식품 소매업,40s,1,3,90,10645775,91


# 군집 데이터 불러오기

In [61]:
cluster_info = pd.read_csv("./data/class_cluster.txt")

In [62]:
for i in cluster_info.iloc[1]:
    print(i)

['과실 및 채소 소매업' '기타 대형 종합 소매업' '기타 외국식 음식점업' '기타음식료품위주종합소매업' '비알콜 음료점업'
 '빵 및 과자류 소매업' '서양식 음식점업' '수산물 소매업' '슈퍼마켓' '욕탕업' '육류 소매업' '일식 음식점업'
 '중식 음식점업' '차량용 가스 충전업' '차량용 주유소 운영업' '피자 햄버거 샌드위치 및 유사 음식점업' '한식 음식점업'
 '호텔업' '화장품 및 방향제 소매업']


# 2번 군집

In [63]:
class_list = ['과실 및 채소 소매업', '기타 대형 종합 소매업', '기타 외국식 음식점업', '기타음식료품위주종합소매업', '비알콜 음료점업',
 '빵 및 과자류 소매업', '서양식 음식점업', '수산물 소매업', '슈퍼마켓', '욕탕업', '육류 소매업', '일식 음식점업',
 '중식 음식점업', '차량용 가스 충전업', '차량용 주유소 운영업', '피자 햄버거 샌드위치 및 유사 음식점업', '한식 음식점업',
 '호텔업', '화장품 및 방향제 소매업']

In [64]:
df = df[df['STD_CLSS_NM'].map(lambda x: x in class_list)]

In [65]:
df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
38,202001,강원,과실 및 채소 소매업,20s,1,1,393,11932800,488
39,202001,강원,과실 및 채소 소매업,20s,1,2,25,1069850,47
40,202001,강원,과실 및 채소 소매업,20s,2,1,403,13157240,482
41,202001,강원,과실 및 채소 소매업,20s,2,2,60,2262430,80
42,202001,강원,과실 및 채소 소매업,30s,1,1,41,4812250,76


In [66]:
df.STD_CLSS_NM.unique()

array(['과실 및 채소 소매업', '기타 대형 종합 소매업', '기타 외국식 음식점업', '기타음식료품위주종합소매업',
       '비알콜 음료점업', '빵 및 과자류 소매업', '서양식 음식점업', '수산물 소매업', '슈퍼마켓', '욕탕업',
       '육류 소매업', '일식 음식점업', '중식 음식점업', '차량용 가스 충전업', '차량용 주유소 운영업',
       '피자 햄버거 샌드위치 및 유사 음식점업', '한식 음식점업', '호텔업', '화장품 및 방향제 소매업'],
      dtype=object)

In [67]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [68]:
df['year'] = df['REG_YYMM'].apply(lambda x: grap_year(x))
df['month'] = df['REG_YYMM'].apply(lambda x: grap_month(x))
df = df.drop(['REG_YYMM'], axis=1)

# Label Encoding

In [69]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df[column] = encoder.transform(df[column])

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24093 entries, 38 to 38498
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   CARD_SIDO_NM  24093 non-null  int32
 1   STD_CLSS_NM   24093 non-null  int32
 2   AGE           24093 non-null  int32
 3   SEX_CTGO_CD   24093 non-null  int64
 4   FLC           24093 non-null  int64
 5   CSTMR_CNT     24093 non-null  int64
 6   AMT           24093 non-null  int64
 7   CNT           24093 non-null  int64
 8   year          24093 non-null  int64
 9   month         24093 non-null  int64
dtypes: int32(3), int64(7)
memory usage: 1.7 MB


In [71]:
df.STD_CLSS_NM.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18])

# Preprocessing

In [72]:
# 변수명 지정
X_cols = list(df.columns)
X_cols.remove('AMT')

In [73]:
X = df.drop(['CSTMR_CNT','AMT','CNT'], axis=1)
y = np.log1p(df['AMT'])

In [74]:
# X 변수 Scale 적용
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [76]:
X_train

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month
3955,4,16,3,2,2,2020,1
9083,11,11,0,2,1,2020,1
36100,13,12,3,1,3,2020,3
31145,6,18,2,2,1,2020,3
12100,15,6,3,1,3,2020,1
...,...,...,...,...,...,...,...
7193,8,15,2,2,3,2020,1
28985,3,14,3,1,4,2020,3
37869,16,0,1,1,2,2020,3
2941,3,10,0,1,1,2020,1


# Light Gradient Boosting Machine

In [77]:
import lightgbm as lgb

In [78]:
train_ds = lgb.Dataset(X_train, label=y_train)
val_ds = lgb.Dataset(X_test, label=y_test)

In [79]:
params = {
            'learning_rate' : 0.1,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [80]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.673482
[200]	valid_0's rmse: 0.55039
[300]	valid_0's rmse: 0.501036
[400]	valid_0's rmse: 0.467673
[500]	valid_0's rmse: 0.444994
[600]	valid_0's rmse: 0.430275
[700]	valid_0's rmse: 0.418826
[800]	valid_0's rmse: 0.409999
[900]	valid_0's rmse: 0.404457
[1000]	valid_0's rmse: 0.398762
[1100]	valid_0's rmse: 0.392406
[1200]	valid_0's rmse: 0.38931
[1300]	valid_0's rmse: 0.386431
[1400]	valid_0's rmse: 0.384358
[1500]	valid_0's rmse: 0.382666
[1600]	valid_0's rmse: 0.380584
[1700]	valid_0's rmse: 0.37837
[1800]	valid_0's rmse: 0.376092
[1900]	valid_0's rmse: 0.374164
[2000]	valid_0's rmse: 0.372993
[2100]	valid_0's rmse: 0.371813
[2200]	valid_0's rmse: 0.370806
[2300]	valid_0's rmse: 0.370032
[2400]	valid_0's rmse: 0.369723
[2500]	valid_0's rmse: 0.368749
[2600]	valid_0's rmse: 0.367913
[2700]	valid_0's rmse: 0.367472
[2800]	valid_0's rmse: 0.36687
[2900]	valid_0's rmse: 0.366575
[3000]	valid_0's rmse: 

In [81]:
X.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year',
       'month'],
      dtype='object')

In [82]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df['STD_CLSS_NM'].unique()
#CARD_CCG_NMs  = df['CARD_CCG_NM'].unique()
AGEs          = df['AGE'].unique()
SEX_CTGO_CDs  = df['SEX_CTGO_CD'].unique()
FLCs          = df['FLC'].unique()
years         = [2020]
months        = [4]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for AGE in AGEs:
            for SEX_CTGO_CD in SEX_CTGO_CDs:
                for FLC in FLCs:
                    for year in years:
                        for month in months:
                            temp.append([CARD_SIDO_NM, STD_CLSS_NM,  AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=X.columns)

In [83]:
X.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year',
       'month'],
      dtype='object')

In [84]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month
0,0,0,1,1,1,2020,4
1,0,0,1,1,2,2020,4
2,0,0,1,1,3,2020,4
3,0,0,1,1,4,2020,4
4,0,0,1,1,5,2020,4
...,...,...,...,...,...,...,...
22605,16,18,0,2,1,2020,4
22606,16,18,0,2,2,2020,4
22607,16,18,0,2,3,2020,4
22608,16,18,0,2,4,2020,4


In [85]:
# 예측
pred = model.predict(temp)

In [86]:
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [87]:
encoders

{'CARD_SIDO_NM': LabelEncoder(),
 'STD_CLSS_NM': LabelEncoder(),
 'AGE': LabelEncoder()}

In [88]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [89]:
temp.to_csv('./data/clusters/cluster2.csv')