In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns, warnings

%matplotlib inline
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family='NanumGothic')
rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False

plt.style.use('ggplot')
color_pal = ["#F8766D", "#D39200", "#93AA00", "#00BA38", "#00C19F", "#00B9E3", "#619CFF", "#DB72FB"]

warnings.filterwarnings(action='ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [2]:
df_raw = pd.read_csv('./data/jeju_data_ver1/201901-202003.csv')

In [3]:
df_raw.head()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


In [4]:
df_raw.STD_CLSS_NM.unique().__len__()

41

In [5]:
df_raw.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

#### 1, 2, 3월 데이터만 추출

In [6]:
# df = df_raw[(df_raw['REG_YYMM'] == 202001) | (df_raw['REG_YYMM'] == 202002) | (df_raw['REG_YYMM'] == 202003)]
# df.head()

In [7]:
df = df_raw

In [8]:
df.fillna('세종시', inplace=True)

In [9]:
df.columns

Index(['REG_YYMM', 'CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
       'HOM_CCG_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'CSTMR_CNT', 'AMT', 'CNT'],
      dtype='object')

In [10]:
df.drop(['CARD_CCG_NM','HOM_SIDO_NM','HOM_CCG_NM'], axis=1, inplace=True)

In [11]:
df = df.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC']).sum().reset_index()

In [12]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [13]:
df['year'] = df['REG_YYMM'].apply(lambda x: grap_year(x))
df['month'] = df['REG_YYMM'].apply(lambda x: grap_month(x))
df = df.drop(['REG_YYMM'], axis=1)

In [15]:
def make_season(data):
    if data in [12, 1, 2]:
        return 'Winter'
    elif data in [3, 4, 5]:
        return 'Spring'
    elif data in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

In [16]:
df['Season'] = df['month'].apply(lambda x: make_season(x))

In [17]:
df.Season.unique()

array(['Winter', 'Spring', 'Summer', 'Autumn'], dtype=object)

# Label Encoding

In [18]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df[column] = encoder.transform(df[column])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197601 entries, 0 to 197600
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   CARD_SIDO_NM  197601 non-null  int32
 1   STD_CLSS_NM   197601 non-null  int32
 2   AGE           197601 non-null  int32
 3   SEX_CTGO_CD   197601 non-null  int64
 4   FLC           197601 non-null  int64
 5   CSTMR_CNT     197601 non-null  int64
 6   AMT           197601 non-null  int64
 7   CNT           197601 non-null  int64
 8   year          197601 non-null  int64
 9   month         197601 non-null  int64
 10  Season        197601 non-null  int32
dtypes: int32(4), int64(7)
memory usage: 13.6 MB


In [20]:
df.STD_CLSS_NM.unique()

array([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40,  4, 30])

# Preprocessing

In [50]:
# 변수명 지정
X_cols = list(df.columns)
X_cols.remove('AMT')

In [51]:
X = df.drop(['CSTMR_CNT','AMT','CNT'], axis=1)
y = np.log1p(df['AMT'])

In [52]:
# X 변수 Scale 적용
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [54]:
X_train

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month,Season
68254,2,33,2,1,1,2019,6,2
152207,8,2,2,2,2,2019,12,3
80026,1,1,3,2,2,2019,7,2
162883,4,28,4,2,4,2020,1,3
529,0,28,1,1,1,2019,1,3
...,...,...,...,...,...,...,...,...
136622,5,2,3,1,3,2019,11,0
103392,13,33,0,1,1,2019,8,2
57428,6,0,3,1,2,2019,5,1
155747,12,29,3,1,3,2019,12,3


# Light Gradient Boosting Machine

In [55]:
import lightgbm as lgb

In [56]:
train_ds = lgb.Dataset(X_train, label=y_train)
val_ds = lgb.Dataset(X_test, label=y_test)

In [57]:
params = {
            'learning_rate' : 0.1,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [58]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.978593
[200]	valid_0's rmse: 0.809112
[300]	valid_0's rmse: 0.732968
[400]	valid_0's rmse: 0.681674
[500]	valid_0's rmse: 0.64563
[600]	valid_0's rmse: 0.616457
[700]	valid_0's rmse: 0.593034
[800]	valid_0's rmse: 0.573656
[900]	valid_0's rmse: 0.559482
[1000]	valid_0's rmse: 0.549686
[1100]	valid_0's rmse: 0.539936
[1200]	valid_0's rmse: 0.526672
[1300]	valid_0's rmse: 0.517091
[1400]	valid_0's rmse: 0.508674
[1500]	valid_0's rmse: 0.501191
[1600]	valid_0's rmse: 0.496585
[1700]	valid_0's rmse: 0.492397
[1800]	valid_0's rmse: 0.486878
[1900]	valid_0's rmse: 0.480964
[2000]	valid_0's rmse: 0.476187
[2100]	valid_0's rmse: 0.473337
[2200]	valid_0's rmse: 0.470348
[2300]	valid_0's rmse: 0.466878
[2400]	valid_0's rmse: 0.463704
[2500]	valid_0's rmse: 0.461331
[2600]	valid_0's rmse: 0.458977
[2700]	valid_0's rmse: 0.456752
[2800]	valid_0's rmse: 0.454281
[2900]	valid_0's rmse: 0.451539
[3000]	valid_0's rms

In [79]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df['STD_CLSS_NM'].unique()
AGEs          = df['AGE'].unique()
SEX_CTGO_CDs  = df['SEX_CTGO_CD'].unique()
FLCs          = df['FLC'].unique()
Seasons = df['Season'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for AGE in AGEs:
            for SEX_CTGO_CD in SEX_CTGO_CDs:
                for FLC in FLCs:
                    for year in years:
                        for month in months:
                            for Season in Seasons:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, AGE, SEX_CTGO_CD, FLC, year, month, Season])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=X.columns)

In [80]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,AGE,SEX_CTGO_CD,FLC,year,month,Season
0,0,0,1,1,1,2020,4,3
1,0,0,1,1,1,2020,4,1
2,0,0,1,1,1,2020,4,2
3,0,0,1,1,1,2020,4,0
4,0,0,1,1,1,2020,7,3
...,...,...,...,...,...,...,...,...
390315,16,30,0,2,5,2020,4,0
390316,16,30,0,2,5,2020,7,3
390317,16,30,0,2,5,2020,7,1
390318,16,30,0,2,5,2020,7,2


In [81]:
X.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year',
       'month', 'Season'],
      dtype='object')

In [82]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)

In [83]:
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [84]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,0,0,6.233222e+08
1,202004,0,1,1.830099e+10
2,202004,0,2,5.577815e+09
3,202004,0,3,1.128000e+08
4,202004,0,4,4.169540e+06
...,...,...,...,...
1389,202007,16,36,8.615044e+09
1390,202007,16,37,1.398722e+11
1391,202007,16,38,8.530561e+07
1392,202007,16,39,2.839321e+09


In [85]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [86]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,건강보조식품 소매업,6.233222e+08
1,202004,강원,골프장 운영업,1.830099e+10
2,202004,강원,과실 및 채소 소매업,5.577815e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,1.128000e+08
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,4.169540e+06
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,8.615044e+09
1390,202007,충북,한식 음식점업,1.398722e+11
1391,202007,충북,호텔업,8.530561e+07
1392,202007,충북,화장품 및 방향제 소매업,2.839321e+09


In [98]:
submission = pd.read_csv('./data/jeju_data_ver1/submission.csv')

In [99]:
submission.columns

Index(['id', 'REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT'], dtype='object')

In [100]:
sub = submission.merge(temp,on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'],how='outer').drop(['AMT_x'], axis=1).rename(columns={'AMT_y':'AMT'})

In [101]:
sub.isnull().sum()

id              0
REG_YYMM        0
CARD_SIDO_NM    0
STD_CLSS_NM     0
AMT             0
dtype: int64

# 없는 업종 0값 넣기

In [92]:
df1 = df_raw.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])['AMT'].sum().reset_index()

In [93]:
sido_list = list(df_raw['CARD_SIDO_NM'].unique())
clss_list = list(df_raw['STD_CLSS_NM'].unique())

In [94]:
dict1 = dict()
for i in sido_list:
    res = df1[df1['CARD_SIDO_NM'] =='{}'.format(i)]['STD_CLSS_NM'].unique()
    result = set(clss_list) - set(res)
    dict1['{}'.format(i)] = list(result)

In [95]:
dict1

{'강원': ['정기 항공 운송업'],
 '경기': ['면세점', '정기 항공 운송업'],
 '경남': ['면세점', '정기 항공 운송업'],
 '경북': ['면세점', '정기 항공 운송업'],
 '광주': ['그외 기타 분류안된 오락관련 서비스업', '휴양콘도 운영업', '면세점', '정기 항공 운송업'],
 '대구': ['그외 기타 분류안된 오락관련 서비스업', '휴양콘도 운영업', '내항 여객 운송업', '정기 항공 운송업'],
 '대전': ['휴양콘도 운영업',
  '그외 기타 분류안된 오락관련 서비스업',
  '내항 여객 운송업',
  '버스 운송업',
  '면세점',
  '택시 운송업',
  '정기 항공 운송업'],
 '부산': ['그외 기타 분류안된 오락관련 서비스업'],
 '서울': ['그외 기타 분류안된 오락관련 서비스업', '휴양콘도 운영업'],
 '세종': ['휴양콘도 운영업',
  '여행사업',
  '내항 여객 운송업',
  '버스 운송업',
  '호텔업',
  '자동차 임대업',
  '면세점',
  '택시 운송업',
  '정기 항공 운송업'],
 '울산': ['휴양콘도 운영업', '내항 여객 운송업', '정기 항공 운송업'],
 '인천': ['그외 기타 분류안된 오락관련 서비스업', '휴양콘도 운영업', '버스 운송업'],
 '전남': ['정기 항공 운송업'],
 '전북': ['그외 기타 분류안된 오락관련 서비스업', '면세점'],
 '제주': ['버스 운송업'],
 '충남': ['면세점'],
 '충북': ['그외 기타 분류안된 오락관련 서비스업', '정기 항공 운송업']}

In [102]:
for idx in range(len(sub)):
    for k,v in dict1.items():
        if sub.iloc[idx,2] == k:
            for clss in v:
                if sub.iloc[idx,3] == clss:
                    sub.iloc[idx,4] = 0

In [103]:
sub[sub['STD_CLSS_NM']=='정기 항공 운송업']

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
30,30,202004,강원,정기 항공 운송업,0.0
71,71,202004,경기,정기 항공 운송업,0.0
112,112,202004,경남,정기 항공 운송업,0.0
153,153,202004,경북,정기 항공 운송업,0.0
194,194,202004,광주,정기 항공 운송업,0.0
235,235,202004,대구,정기 항공 운송업,0.0
276,276,202004,대전,정기 항공 운송업,0.0
317,317,202004,부산,정기 항공 운송업,2566564000.0
358,358,202004,서울,정기 항공 운송업,23630450000.0
399,399,202004,세종,정기 항공 운송업,0.0


In [104]:
sub.to_csv('./data/clusters/submission_season.csv', index=False, encoding='utf-8-sig')