In [276]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family='NanumGothic')
rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [277]:
df_raw = pd.read_csv('cl1.csv')

In [438]:
df = df_raw
df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month
0,201901,서울,건강보조식품 소매업,20s,1,1,115,7809970,140,2019,1
1,201901,서울,건강보조식품 소매업,20s,2,1,407,19074744,516,2019,1
2,201901,서울,건강보조식품 소매업,20s,2,2,11,868200,12,2019,1
3,201901,서울,건강보조식품 소매업,30s,1,1,10,4767250,11,2019,1
4,201901,서울,건강보조식품 소매업,30s,1,2,874,74320644,1045,2019,1


# Data Preprocessing

In [440]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [441]:
df['year'] = df['REG_YYMM'].apply(lambda x: grap_year(x))
df['month'] = df['REG_YYMM'].apply(lambda x: grap_month(x))
df = df.drop(['REG_YYMM'], axis=1)

In [442]:
def make_season(data):
    if data in [12, 1, 2]:
        return 'Winter'
    elif data in [3, 4, 5]:
        return 'Spring'
    elif data in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

In [443]:
df['Season'] = df['month'].apply(lambda x: make_season(x))

In [444]:
df.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month,Season
0,서울,건강보조식품 소매업,20s,1,1,115,7809970,140,2019,1,Winter
1,서울,건강보조식품 소매업,20s,2,1,407,19074744,516,2019,1,Winter
2,서울,건강보조식품 소매업,20s,2,2,11,868200,12,2019,1,Winter
3,서울,건강보조식품 소매업,30s,1,1,10,4767250,11,2019,1,Winter
4,서울,건강보조식품 소매업,30s,1,2,874,74320644,1045,2019,1,Winter


# Label Encoding

In [445]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df[column] = encoder.transform(df[column])

In [446]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11302 entries, 0 to 11301
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   CARD_SIDO_NM  11302 non-null  int32
 1   STD_CLSS_NM   11302 non-null  int32
 2   AGE           11302 non-null  int32
 3   SEX_CTGO_CD   11302 non-null  int64
 4   FLC           11302 non-null  int64
 5   CSTMR_CNT     11302 non-null  int64
 6   AMT           11302 non-null  int64
 7   CNT           11302 non-null  int64
 8   year          11302 non-null  int64
 9   month         11302 non-null  int64
 10  Season        11302 non-null  int32
dtypes: int32(4), int64(7)
memory usage: 794.8 KB


In [None]:
X_list

In [452]:
# 데이터 표준화
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()
scaler1.fit(df['CNT'])
scaler2.fit(df['CSTMR_CNT'])
# 목표변수 y를 trip_total로 설정

df['CNT'] = scaler1.fit_transform(df['CNT'])
df['CSTMR_CNT'] = scaler2.fit_transform(df['CSTMR_CNT'])

ValueError: Expected 2D array, got 1D array instead:
array=[1.4000e+02 5.1600e+02 1.2000e+01 ... 2.2611e+04 2.1460e+03 6.2980e+03].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [341]:
df.STD_CLSS_NM.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [342]:
df['month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

# Preprocessing

In [343]:
# 변수명 지정
X_cols = list(df.columns)
X_cols.remove('AMT')

In [344]:
X = df.drop(['AMT'], axis=1)
y = np.log1p(df['AMT'])

In [345]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [346]:
X_train

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,CNT,year,month,Season
4759,0,10,5,1,5,3529,7468,2019,7,2
813,0,3,1,2,2,40,47,2019,2,3
3630,0,26,4,1,4,73505,98781,2019,5,1
3992,0,10,3,1,2,370,715,2019,6,2
6552,0,22,0,1,1,1110,1548,2019,9,0
...,...,...,...,...,...,...,...,...,...,...
6502,0,19,6,2,5,2147,3646,2019,9,0
4507,0,31,3,1,3,9065,12586,2019,6,2
7273,0,20,2,1,3,118,203,2019,10,0
407,0,17,6,1,5,229,367,2019,1,3


In [347]:
rf = RandomForestRegressor(max_depth=8, max_features='auto', n_estimators=300)
rf.fit(X_train, y_train)
# Train 데이터 설명력
print("Score on training set : {:.3f}".format(rf.score(X_train,y_train)))
print("Score on test set : {:.3f}".format(rf.score(X_test,y_test)))

Score on training set : 0.957
Score on test set : 0.950


In [294]:
# param_grid = { 
#     'n_estimators': [200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
# }
# CV_rf = GridSearchCV(estimator=rf, cv= 5, param_grid=param_grid)
# CV_rf.fit(X_train, y_train)

In [233]:
CV_rf.best_params_

{'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}

In [394]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
# Train 데이터 설명력
print("Score on training set : {:.3f}".format(gb.score(X_train,y_train)))
print("Score on test set : {:.3f}".format(gb.score(X_test,y_test)))

Score on training set : 0.967
Score on test set : 0.965


In [395]:
estimators = [('rf', RandomForestRegressor(n_estimators=300)),
              ('gb', GradientBoostingRegressor(n_estimators=300))]

In [396]:
# build model
Stacking_Model = StackingRegressor(estimators=estimators)

labels = y_train
features = X_train

# Fit the RF model with features and labels.
st_reg=Stacking_Model.fit(features, labels)

In [397]:
print("Score on training set : {:.3f}".format(st_reg.score(X_train,y_train)))
print("Score on test set : {:.3f}".format(st_reg.score(X_test,y_test)))

Score on training set : 0.990
Score on test set : 0.983


In [398]:
# 모델 실행 및 결과 확인
X_test_predict=pd.DataFrame(
    st_reg.predict(X_test)).rename(
    columns={0:'predict'})
X_train_predict=pd.DataFrame(
    st_reg.predict(X_train)).rename(
    columns={0:'predict'})

# combine the training and testing dataframes to visualize
# and compare.
SR_predict = X_train_predict.append(X_test_predict)

In [399]:
y_train

4759   19.63
813    13.34
3630   21.73
3992   17.28
6552   16.66
        ... 
6502   17.92
4507   20.93
7273   15.29
407    17.90
8859   20.84
Name: AMT, Length: 7911, dtype: float64

In [400]:
st_reg.predict(X_test)

array([18.70697658, 11.76849074, 17.76406061, ..., 18.9451514 ,
       10.05446624, 14.5404819 ])

In [401]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [402]:
np.sqrt(mean_squared_error(y_test, st_reg.predict(X_test)))

0.3583259807764765

In [412]:
# 예측
pred = st_reg.predict(temp)
pred = np.expm1(pred)

In [413]:
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [414]:
pd.options.display.float_format = '{:.2f}'.format
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,서울,건강보조식품 소매업,19957753716.00
1,202004,서울,골프장 운영업,16716055604.00
2,202004,서울,과실 및 채소 소매업,16262097168.00
3,202004,서울,관광 민예품 및 선물용품 소매업,11138645180.00
4,202004,서울,그외 기타 스포츠시설 운영업,8254408636.00
...,...,...,...,...
59,202007,서울,차량용 가스 충전업,15652174776.00
60,202007,서울,차량용 주유소 운영업,27529807888.00
61,202007,서울,피자 햄버거 샌드위치 및 유사 음식점업,10150773440.00
62,202007,서울,호텔업,33701633420.00


In [416]:
19957753716 - 1178972099

18778781617

In [415]:
df_raw[df_raw['REG_YYMM']==202003].groupby(['STD_CLSS_NM'])['AMT'].sum()

STD_CLSS_NM
건강보조식품 소매업                1178972099
골프장 운영업                    364764050
과실 및 채소 소매업               8005379734
관광 민예품 및 선물용품 소매업           53586140
그외 기타 스포츠시설 운영업             15888500
그외 기타 종합 소매업             40433017255
기타 수상오락 서비스업                 3946900
기타 외국식 음식점업               7348439049
기타 주점업                    1463780040
기타음식료품위주종합소매업            25422139867
내항 여객 운송업                 1103089720
마사지업                       144174546
면세점                         35502010
버스 운송업                    2798198940
빵 및 과자류 소매업              14350083255
수산물 소매업                   3342043120
스포츠 및 레크레이션 용품 임대업       13713309096
여관업                       2293624372
여행사업                       313930866
욕탕업                       1368908045
육류 소매업                   16014475439
일반유흥 주점업                  2897164720
일식 음식점업                  14155708402
자동차 임대업                    144784043
전시 및 행사 대행업                164414858
정기 항공 운송업                 2284378045
중식 음식점업                  1

In [316]:
361398501.00 - 53586140

307812361.0

# Light Gradient Boosting Machine

In [417]:
import lightgbm as lgb

In [418]:
train_ds = lgb.Dataset(X_train, label=y_train)
val_ds = lgb.Dataset(X_test, label=y_test)

In [436]:
params = {
            'learning_rate' : 0.1,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1        }

In [437]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.343543
[200]	valid_0's rmse: 0.316347
[300]	valid_0's rmse: 0.314684
[400]	valid_0's rmse: 0.313226
Early stopping, best iteration is:
[392]	valid_0's rmse: 0.31303


# Template

In [422]:
X.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT',
       'CNT', 'year', 'month', 'Season'],
      dtype='object')

In [423]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df['STD_CLSS_NM'].unique()
AGEs          = df['AGE'].unique()
SEX_CTGO_CDs  = df['SEX_CTGO_CD'].unique()
FLCs          = df['FLC'].unique()
Seasons = df['Season'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for AGE in AGEs:
            for SEX_CTGO_CD in SEX_CTGO_CDs:
                for FLC in FLCs:
                    for year in years:
                        for month in months:
                            for Season in Seasons:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, AGE, SEX_CTGO_CD, FLC, year, month, Season])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month', 'Season'])

In [424]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month,Season
0,0,0,1,1,1,2020,4,3
1,0,0,1,1,1,2020,4,1
2,0,0,1,1,1,2020,4,2
3,0,0,1,1,1,2020,4,0
4,0,0,1,1,1,2020,7,3
...,...,...,...,...,...,...,...,...
17915,0,31,0,2,5,2020,4,0
17916,0,31,0,2,5,2020,7,3
17917,0,31,0,2,5,2020,7,1
17918,0,31,0,2,5,2020,7,2


In [425]:
df2 = df[(df['year']==2020) & (df['month']==2)]
df3 = df[(df['year']==2020) & (df['month']==3)]
df4 = pd.concat([df2, df3], axis=0)

In [426]:
df4.drop(['AMT', 'year', 'month', 'Season'], axis=1, inplace=True)

In [427]:
df4

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,CNT
9847,0,0,0,1,1,4,14
9848,0,0,1,1,1,113,180
9849,0,0,1,2,1,249,327
9850,0,0,2,1,1,6,6
9851,0,0,2,1,2,509,624
...,...,...,...,...,...,...,...
11297,0,31,4,2,4,26021,36726
11298,0,31,5,1,5,5471,7073
11299,0,31,5,2,5,16836,22611
11300,0,31,6,1,5,1776,2146


In [428]:
temp = pd.merge(temp, df4, on=['CARD_SIDO_NM','STD_CLSS_NM','AGE','SEX_CTGO_CD','FLC'])

In [429]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)

In [430]:
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [431]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,0,0,26539123072.00
1,202004,0,1,23814092292.00
2,202004,0,2,19833665608.00
3,202004,0,3,15711575608.00
4,202004,0,4,10659575056.00
...,...,...,...,...
59,202007,0,27,12263231640.00
60,202007,0,28,22482797812.00
61,202007,0,29,6206765024.00
62,202007,0,30,25834113296.00


In [432]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [433]:
pd.options.display.float_format = '{:.2f}'.format
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,서울,건강보조식품 소매업,26539123072.00
1,202004,서울,골프장 운영업,23814092292.00
2,202004,서울,과실 및 채소 소매업,19833665608.00
3,202004,서울,관광 민예품 및 선물용품 소매업,15711575608.00
4,202004,서울,그외 기타 스포츠시설 운영업,10659575056.00
...,...,...,...,...
59,202007,서울,차량용 가스 충전업,12263231640.00
60,202007,서울,차량용 주유소 운영업,22482797812.00
61,202007,서울,피자 햄버거 샌드위치 및 유사 음식점업,6206765024.00
62,202007,서울,호텔업,25834113296.00


In [435]:
26539123072 - 1178972099

25360150973

In [434]:
df_raw[df_raw['REG_YYMM']==202003].groupby(['STD_CLSS_NM'])['AMT'].sum()

STD_CLSS_NM
건강보조식품 소매업                1178972099
골프장 운영업                    364764050
과실 및 채소 소매업               8005379734
관광 민예품 및 선물용품 소매업           53586140
그외 기타 스포츠시설 운영업             15888500
그외 기타 종합 소매업             40433017255
기타 수상오락 서비스업                 3946900
기타 외국식 음식점업               7348439049
기타 주점업                    1463780040
기타음식료품위주종합소매업            25422139867
내항 여객 운송업                 1103089720
마사지업                       144174546
면세점                         35502010
버스 운송업                    2798198940
빵 및 과자류 소매업              14350083255
수산물 소매업                   3342043120
스포츠 및 레크레이션 용품 임대업       13713309096
여관업                       2293624372
여행사업                       313930866
욕탕업                       1368908045
육류 소매업                   16014475439
일반유흥 주점업                  2897164720
일식 음식점업                  14155708402
자동차 임대업                    144784043
전시 및 행사 대행업                164414858
정기 항공 운송업                 2284378045
중식 음식점업                  1

In [194]:
214160875732 - 85944403598

128216472134

In [393]:
np.sqrt(mean_squared_error(df_raw[df_raw['REG_YYMM']==202003].groupby(['STD_CLSS_NM'])['AMT'].sum().values,temp[temp['REG_YYMM']==202004]['AMT'].values))

26018340129.942093

In [275]:
np.sqrt(mean_squared_error(df_raw[df_raw['REG_YYMM']==202003].groupby(['STD_CLSS_NM'])['AMT'].sum().values,temp[temp['REG_YYMM']==202004]['AMT'].values))

66125091098.94609