In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
# Plot configuration.
# The preferred font is loaded from a Windows-specific absolute path. On machines where
# that file cannot be loaded, fall back to the 'NanumGothic' family instead of aborting
# the very first cell of the notebook.
plt.rc('font', family='NanumGothic')
try:
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
except (OSError, RuntimeError):
    # Font file missing or unreadable: keep the family selected just above.
    font_name = 'NanumGothic'
rc('font', family=font_name)
# Draw the minus sign as an ASCII hyphen rather than the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
# Eight-colour palette; not referenced in the visible part of the notebook.
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

# NOTE(review): this silences every warning for the rest of the session, including pandas'
# SettingWithCopyWarning — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw CSV; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests the data spans 2019-01 through 2020-03 — confirm.
df_raw = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [3]:
# Preview the first rows of the raw frame.
df_raw.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


In [4]:
# List the column names of the raw frame.
df_raw.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

#### 1, 2, 3월 데이터만 추출

In [5]:
# Keep only the rows whose REG_YYMM (integer YYYYMM code) is 2020-01, 2020-02 or 2020-03.
target_months = [202001, 202002, 202003]
df = df_raw[df_raw['REG_YYMM'].isin(target_months)]
df.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
20425415,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,2,1,3,345000,3
20425416,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,3,1903450,3
20425417,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,14,1520500,15
20425418,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,9,1239200,9
20425419,202001,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,606700,4


In [6]:
# Collapse rows that share the same key combination, summing the remaining columns
# (CSTMR_CNT, AMT, CNT), then turn the group keys back into ordinary columns.
group_keys = [
    'REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM',
    'HOM_SIDO_NM', 'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC',
]
df = df.groupby(group_keys).sum().reset_index()

# 군집 데이터 불러오기

In [7]:
# Load the cluster data CSV.
# NOTE(review): `cluster_info` is not referenced again in the visible cells — the category
# list for the cluster is hard-coded further down. Confirm whether it should be derived from this file.
cluster_info = pd.read_csv("./data/class_cluster.csv")

"['관광 민예품 및 선물용품 소매업' '그외 기타 분류안된 오락관련 서비스업' '그외 기타 스포츠시설 운영업'\n '기타 수상오락 서비스업' '내항 여객 운송업' '수산물 소매업' '여관업' '여행사업' '전시 및 행사 대행업' '호텔업'\n '휴양콘도 운영업']"

# 1번 군집

In [17]:
# STD_CLSS_NM values that make up cluster 1; used further down to filter `df` by category.
class_list = ['관광 민예품 및 선물용품 소매업', 
              '그외 기타 분류안된 오락관련 서비스업',
              '그외 기타 스포츠시설 운영업',
              '기타 수상오락 서비스업',
              '내항 여객 운송업',
              '수산물 소매업',
              '여관업',
              '여행사업',
              '전시 및 행사 대행업',
              '호텔업',
              '휴양콘도 운영업']

In [23]:
# Sanity check: confirm that a known category is present in the list.
'여관업' in class_list

True

In [None]:
# Boolean mask selecting the rows whose category belongs to `class_list`.
# The cell previously evaluated the bare name `cond`, which is never defined, so a fresh
# kernel stopped here with a NameError; define the mask before displaying it.
cond = df['STD_CLSS_NM'].isin(class_list)
cond

In [30]:
# Boolean Series: True where the row's category is one of the entries in `class_list`.
df['STD_CLSS_NM'].isin(class_list)

0          False
1          False
2          False
3          False
4          False
           ...  
4234862     True
4234863     True
4234864     True
4234865     True
4234866     True
Name: STD_CLSS_NM, Length: 4234867, dtype: bool

In [34]:
# Keep only the rows whose category is in `class_list`.
# The explicit .copy() makes `df` an independent frame: later cells drop columns in place and
# re-assign columns on it, which on a filtered slice is pandas' SettingWithCopyWarning case
# (and the global warnings filter would hide that warning).
df = df[df['STD_CLSS_NM'].isin(class_list)].copy()

In [35]:
# Distinct categories remaining after the filter.
df['STD_CLSS_NM'].unique()

array(['관광 민예품 및 선물용품 소매업', '그외 기타 스포츠시설 운영업', '수산물 소매업', '여관업',
       '전시 및 행사 대행업', '호텔업', '휴양콘도 운영업', '여행사업', '기타 수상오락 서비스업',
       '내항 여객 운송업', '그외 기타 분류안된 오락관련 서비스업'], dtype=object)

# Label Encoding

In [37]:
# Create a scikit-learn LabelEncoder instance.
le = LabelEncoder()

In [36]:
# Remove the two HOM_* columns so they are not carried into the feature set.
# The result is re-bound rather than dropped with inplace=True: `df` comes from a boolean
# filter, and mutating such a frame in place is what pandas flags with SettingWithCopyWarning.
df = df.drop(columns=['HOM_SIDO_NM', 'HOM_CCG_NM'])

In [38]:
# Columns to be label-encoded into integer codes in the next cell.
categorical_cols = ['CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'AGE']

In [39]:
# Replace each categorical column with integer codes.
# One encoder is fitted per column and stored in `label_encoders`, keyed by column name, so
# every column's code -> label mapping stays available (e.g. for inverse_transform).
# Re-fitting a single shared encoder would retain only the last column's classes.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()  # after the loop `le` is the last column's encoder, as before
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [40]:
# Check dtypes and non-null counts after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211750 entries, 464 to 4234866
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   REG_YYMM      211750 non-null  int64
 1   CARD_SIDO_NM  211750 non-null  int32
 2   CARD_CCG_NM   211750 non-null  int32
 3   STD_CLSS_NM   211750 non-null  int32
 4   AGE           211750 non-null  int32
 5   SEX_CTGO_CD   211750 non-null  int64
 6   FLC           211750 non-null  int64
 7   CSTMR_CNT     211750 non-null  int64
 8   AMT           211750 non-null  int64
 9   CNT           211750 non-null  int64
dtypes: int32(4), int64(6)
memory usage: 14.5 MB


# Preprocessing

In [42]:
# 변수명 지정
# Feature column names: every column of `df` except the target 'AMT'.
X_cols = df.columns.tolist()
X_cols.remove('AMT')

In [43]:
# Feature matrix and target vector.
X, y = df[X_cols], df['AMT']

In [44]:
# Apply scaling to the X variables (currently disabled: the scaler lines below are commented out)
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [45]:
# Hold out 30% of the rows as a test set.
# A fixed random_state makes the shuffled split identical on every run; without it the
# train/test membership changes each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

In [48]:
# Display the training features.
X_train

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,CNT
27097,202001,0,101,8,2,1,2,3,3
220599,202001,1,105,5,2,2,3,6,9
246732,202001,1,108,6,1,1,1,12,13
3945671,202003,8,178,6,1,1,1,4,8
1498211,202001,13,90,6,3,1,4,3,3
...,...,...,...,...,...,...,...,...,...
1315966,202001,10,48,5,2,1,2,4,6
2583050,202002,8,102,5,4,1,4,85,106
844578,202001,7,219,5,3,1,4,4,7
2586117,202002,8,102,8,1,1,1,6,7


# Random Forest

In [47]:
# Baseline random forest with default hyperparameters.
# random_state fixes the estimator's internal randomness so that repeated fits on the same
# training data produce the same model.
rf_uncustomized = RandomForestRegressor(random_state=42)
rf_uncustomized.fit(X_train, y_train)

RandomForestRegressor()

In [49]:
# Explanatory power of the random forest (value returned by .score) on the training
# and test splits.
rf_train_score = rf_uncustomized.score(X_train, y_train)
rf_test_score = rf_uncustomized.score(X_test, y_test)
print("Score on training set : {:.3f}".format(rf_train_score))
print("Score on test set : {:.3f}".format(rf_test_score))

Score on training set : 0.939
Score on test set : 0.630


# SVM Regressor

In [51]:
# Baseline support-vector regressor with default hyperparameters.
# NOTE(review): the features are passed unscaled (the StandardScaler lines in the preprocessing
# section are commented out); SVR is generally sensitive to feature scale — confirm this is intended.
svm_uncustomized = SVR()
svm_uncustomized.fit(X_train, y_train)
# Explanatory power on the training data

SVR()

In [None]:
# Explanatory power of the SVR model (value returned by .score) on the training
# and test splits.
svm_train_score = svm_uncustomized.score(X_train, y_train)
svm_test_score = svm_uncustomized.score(X_test, y_test)
print("Score on training set : {:.3f}".format(svm_train_score))
print("Score on test set : {:.3f}".format(svm_test_score))