In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family='NanumGothic')
rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [2]:
df_raw = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [3]:
df_raw.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


In [4]:
df_raw.STD_CLSS_NM.unique().__len__()

41

In [5]:
df_raw.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

#### 1, 2, 3월 데이터만 추출

In [6]:
# df = df_raw[(df_raw['REG_YYMM'] == 202001) | (df_raw['REG_YYMM'] == 202002) | (df_raw['REG_YYMM'] == 202003)]
# df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
20425415,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,2,1,3,345000,3
20425416,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,3,1903450,3
20425417,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,14,1520500,15
20425418,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,9,1239200,9
20425419,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,606700,4


In [8]:
df = df_raw

In [9]:
df.fillna('세종시', inplace=True)

In [10]:
df.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

In [11]:
df.drop(['CARD_CCG_NM','HOM_SIDO_NM','HOM_CCG_NM'], axis=1, inplace=True)

In [12]:
df = df.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC']).sum().reset_index()

In [14]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [15]:
df['year'] = df['REG_YYMM'].apply(lambda x: grap_year(x))
df['month'] = df['REG_YYMM'].apply(lambda x: grap_month(x))
df = df.drop(['REG_YYMM'], axis=1)

In [20]:
d = 1
if d == 1 or 2 or 3:
    print('a')

a


In [35]:
def make_season(data):
    if data in [12, 1, 2]:
        return 'Winter'
    elif data in [3, 4, 5]:
        return 'Spring'
    elif data in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

In [36]:
df['Season'] = df['month'].apply(lambda x: make_season(x))

In [37]:
df.Season.unique()

array(['Winter', 'Spring', 'Summer', 'Autumn'], dtype=object)

# Label Encoding

In [39]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df[column] = encoder.transform(df[column])

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197601 entries, 0 to 197600
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   CARD_SIDO_NM  197601 non-null  int32
 1   STD_CLSS_NM   197601 non-null  int32
 2   AGE           197601 non-null  int32
 3   SEX_CTGO_CD   197601 non-null  int64
 4   FLC           197601 non-null  int64
 5   CSTMR_CNT     197601 non-null  int64
 6   AMT           197601 non-null  int64
 7   CNT           197601 non-null  int64
 8   year          197601 non-null  int64
 9   month         197601 non-null  int64
 10  Season        197601 non-null  int32
dtypes: int32(4), int64(7)
memory usage: 13.6 MB


In [41]:
df.STD_CLSS_NM.unique()

array([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40,  4, 30])

# Preprocessing

In [42]:
# 변수명 지정
X_cols = list(df.columns)
X_cols.remove('AMT')

In [43]:
X = df.drop(['CSTMR_CNT','AMT','CNT'], axis=1)
y = np.log1p(df['AMT'])

In [44]:
# X 변수 Scale 적용
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [46]:
X_train

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month,Season
55707,3,32,2,2,1,2019,5,1
108316,2,35,0,1,1,2019,9,0
112032,7,29,3,2,3,2019,9,0
98642,7,27,3,2,3,2019,8,2
1166,1,18,4,2,4,2019,1,3
...,...,...,...,...,...,...,...,...
115937,12,31,4,1,3,2019,9,0
172520,0,7,5,1,5,2020,2,3
175562,4,0,5,2,5,2020,2,3
67916,2,16,3,1,3,2019,6,2


# Light Gradient Boosting Machine

In [48]:
import lightgbm as lgb

In [49]:
train_ds = lgb.Dataset(X_train, label=y_train)
val_ds = lgb.Dataset(X_test, label=y_test)

In [50]:
params = {
            'learning_rate' : 0.1,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [51]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.968421
[200]	valid_0's rmse: 0.812719
[300]	valid_0's rmse: 0.736406
[400]	valid_0's rmse: 0.689506
[500]	valid_0's rmse: 0.652058
[600]	valid_0's rmse: 0.627332
[700]	valid_0's rmse: 0.604992
[800]	valid_0's rmse: 0.587834
[900]	valid_0's rmse: 0.572083
[1000]	valid_0's rmse: 0.559519
[1100]	valid_0's rmse: 0.546631
[1200]	valid_0's rmse: 0.534704
[1300]	valid_0's rmse: 0.52533
[1400]	valid_0's rmse: 0.514307
[1500]	valid_0's rmse: 0.506834
[1600]	valid_0's rmse: 0.499993
[1700]	valid_0's rmse: 0.495252
[1800]	valid_0's rmse: 0.49021
[1900]	valid_0's rmse: 0.483807
[2000]	valid_0's rmse: 0.479818
[2100]	valid_0's rmse: 0.476782
[2200]	valid_0's rmse: 0.473923
[2300]	valid_0's rmse: 0.470761
[2400]	valid_0's rmse: 0.468445
[2500]	valid_0's rmse: 0.466467
[2600]	valid_0's rmse: 0.464626
[2700]	valid_0's rmse: 0.462223
[2800]	valid_0's rmse: 0.460472
[2900]	valid_0's rmse: 0.45858
[3000]	valid_0's rmse:

In [56]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df['STD_CLSS_NM'].unique()
#CARD_CCG_NMs  = df['CARD_CCG_NM'].unique()
AGEs          = df['AGE'].unique()
SEX_CTGO_CDs  = df['SEX_CTGO_CD'].unique()
FLCs          = df['FLC'].unique()
Seasons = df['Season'].unique()
years         = [2020]
months        = [4]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for AGE in AGEs:
            for SEX_CTGO_CD in SEX_CTGO_CDs:
                for FLC in FLCs:
                    for Season in Seasons:    
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM,  AGE, SEX_CTGO_CD, FLC, Season, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=X.columns)

In [57]:
X.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year',
       'month', 'Season'],
      dtype='object')

In [59]:
# 예측
pred = model.predict(temp)

In [117]:
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [118]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,0,0,27668297.0
1,202004,0,1,3796918.0
2,202004,0,2,6518359.0
3,202004,0,3,4996608.0
4,202004,0,4,2791240.0
...,...,...,...,...
148,202004,16,4,15461825.0
149,202004,16,5,261179207.0
150,202004,16,6,5082474.0
151,202004,16,7,4450106.0


In [119]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [120]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,관광 민예품 및 선물용품 소매업,27668297.0
1,202004,강원,그외 기타 분류안된 오락관련 서비스업,3796918.0
2,202004,강원,그외 기타 스포츠시설 운영업,6518359.0
3,202004,강원,기타 수상오락 서비스업,4996608.0
4,202004,강원,내항 여객 운송업,2791240.0
...,...,...,...,...
148,202004,충북,내항 여객 운송업,15461825.0
149,202004,충북,여관업,261179207.0
150,202004,충북,여행사업,5082474.0
151,202004,충북,전시 및 행사 대행업,4450106.0


In [121]:
temp.to_csv('./data/clusters/cluster1.csv')