In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from lightgbm import LGBMClassifier
import lightgbm as lgb

import warnings
 
warnings.filterwarnings("ignore")

In [2]:
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
!python --version

Pandas : 0.25.3
Numpy : 1.16.5
Scikit-Learn : 0.23.1


Python 3.6.9 :: Anaconda, Inc.


## 1. Dataset Load

In [3]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [5]:
# 201901-202003 과 202004 병합
data1 = pd.read_csv('C:/Users/kyh20/Downloads/jeju_data_ver1/201901-202003.csv')
data2 = pd.read_csv('C:/Users/kyh20/Downloads/jeju_data_ver1/202004.csv')
data = pd.concat([data1, data2], ignore_index = True)

In [6]:
sample = data.sample(frac=0.1, random_state=2019)
sample = sample.fillna('')
sample['year'] = sample['REG_YYMM'].apply(lambda x: grap_year(x))
sample['month'] = sample['REG_YYMM'].apply(lambda x: grap_month(x))
sample = sample.sort_values(by=['REG_YYMM'], axis=0)
for idx,date in enumerate(sample['REG_YYMM'].unique()):
    sample.loc[sample['REG_YYMM']==date,'new_date'] = idx+1

## 2. Feature Engineering & Initial Modeling

In [5]:
df = sample.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month', 'REG_YYMM']
df = df.groupby(columns).sum().reset_index(drop=False)

In [6]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        

df_num = df.copy()
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [7]:
train_num = df_num.sample(frac=1, random_state=0)
x = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = train_num.drop(['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month','CSTMR_CNT', 'CNT'], axis=1)

In [8]:
x = x.reset_index()
y = y.reset_index()

In [10]:
x = x.drop(['index'], axis=1)
y = y.drop(['index'], axis=1)

In [34]:
train_X = x.loc[x['REG_YYMM']!=202003]
train_y = y.loc[y['REG_YYMM']!=202003]
test_X = x.loc[x['REG_YYMM']==202003]
test_y = y.loc[y['REG_YYMM']==202003]

In [35]:
train_X['year'] = train_X['year'].apply(lambda x : 0 if x == 2019 else 1)
test_X['year'] = test_X['year'].apply(lambda x : 0 if x == 2019 else 1)

In [36]:
train_X = train_X.drop(['REG_YYMM'], axis=1)
train_y = train_y.drop(['REG_YYMM'], axis=1)
test_X = test_X.drop(['REG_YYMM'], axis=1)
test_y = test_y.drop(['REG_YYMM'], axis=1)

In [37]:
Cardsido = df_num['CARD_SIDO_NM'].unique()
Stdclss = df_num['STD_CLSS_NM'].unique()

In [38]:
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT', 'REG_YYMM', 'AGE', 'SEX_CTGO_CD', 'FLC'], axis=1)

In [39]:
train_features.columns

Index(['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year', 'month'], dtype='object')

## 3. Modeling

In [47]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
years         = [1]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for year in years:
                for month in months:
                    temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=train_features.columns)

In [41]:
case = pd.DataFrame(columns=['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year', 'month', 'AMT'])
train_X = train_X.drop(['AGE', 'SEX_CTGO_CD', 'FLC'], axis=1)
test_X = test_X.drop(['AGE', 'SEX_CTGO_CD', 'FLC'], axis=1)

In [42]:
for i in Cardsido:
    for j in Stdclss:
        #시와 종류가 같은 항목끼리 묶어서 따로따로 예측
        trainset = train_X.loc[train_X['CARD_SIDO_NM'] == i]
        trainset = trainset.loc[trainset['STD_CLSS_NM'] == j]
        #train_y에서 i,j 행만 trainset과 병합
        train_merge = pd.merge(trainset, train_y, how='outer', left_index = True, right_index = True).dropna(axis=0)
        new_train_X = train_merge.drop(['AMT'], axis=1).astype(int)
        new_train_y = np.log1p(train_merge['AMT']).astype(int)
        
        testset = test_X.loc[test_X['CARD_SIDO_NM'] == i]
        testset = testset.loc[testset['STD_CLSS_NM'] == j]
        test_merge = pd.merge(testset, test_y, how='outer', left_index = True, right_index = True).dropna(axis=0)
        new_test_X = test_merge.drop(['AMT'], axis=1).astype(int)
        new_test_y = np.log1p(test_merge['AMT']).astype(int)
        
        
        if len(train_merge) or len(test_merge) != 0:
            lgb = LGBMClassifier(n_estimators=200)
            lgb.fit(new_train_X, new_train_y)
            #model = RandomForestRegressor(n_estimators=100,criterion='mse', random_state=777)
            #model.fit(new_train_X, new_train_y)
            
            temp1 = temp.loc[temp['CARD_SIDO_NM'] == i]
            temp1 = temp1.loc[temp1['STD_CLSS_NM'] == j]
            # 예측
            pred = lgb.predict(temp1)
            pred = np.expm1(pred)
            temp1['AMT'] = np.round(pred, 0)
            case = pd.concat([case, temp1])
            
#temp['REG_YYMM'] = temp['year']*100 + temp['month']
#temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
#temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)
            
            
            #print("훈련 세트의 정확도 : {:.2f}".format(model.score(new_train_X, new_train_y)))
            #print("테스트 세트의 정확도 : {:.2f}".format(model.score(new_test_X, new_test_y)))
            
            #lasso = Lasso().fit(new_train_X, new_train_y)
        
            #print("훈련 세트의 정확도 : {:.2f}".format(lasso.score(new_train_X, new_train_y)))
            #print("테스트 세트의 정확도 : {:.2f}".format(lasso.score(new_test_X, new_test_y)))

In [22]:
# 빈case에 예측값 삽입
case['REG_YYMM'] = case['year']*100 + case['month']
case = case[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
case = case.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

# param_test 에 실데이터 삽입
param_test = pd.merge(test_X, test_y, how='outer', left_index = True, right_index = True).dropna(axis=0)
param_test['REG_YYMM'] = (param_test['year']+2019)*100 + param_test['month']
param_test = param_test[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
param_test = param_test.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

# 두 데이터셋 병합
total_merge = pd.merge(case, param_test, how='outer', left_index = True, right_index = True, on=['CARD_SIDO_NM', 'STD_CLSS_NM']).dropna(axis=0)

# 예측값과 실데이터값 rmsle 비교
from sklearn.metrics import mean_squared_log_error
rmsle = np.sqrt(mean_squared_log_error(total_merge['AMT_x'], total_merge['AMT_y']))

print(rmsle)


# 안지운값   3.558605188043879
# hom_sido_nm   3.6938316890017133
# age   3.643511587631316
# SEX_CTGO_CD   3.5445778872947615
# FLC   3.6582534629098964
# hom_sido_nm, age  3.7523989747241586
# 'HOM_SIDO_NM', 'SEX_CTGO_CD'   3.6471233688402265
# 'HOM_SIDO_NM, FLC   3.76090413624393
# 'AGE', 'SEX_CTGO_CD'   3.6173784835646954
# 'AGE', 'FLC'    3.4564127536127804
# 'SEX_CTGO_CD', 'FLC'    3.6231624717275013
# 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD'   3.6983336706301317
# 'HOM_SIDO_NM', 'AGE', 'FLC'   3.565750581551223
# 'HOM_SIDO_NM', 'SEX_CTGO_CD', 'FLC'    3.699553528734567
# 'AGE', 'SEX_CTGO_CD', 'FLC'   3.4386450073516523
# 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC'   3.5602021922691827

# AGE', 'SEX_CTGO_CD', 'FLC' drop 하는게 베스트

3.534037045318257


In [24]:
# 디코딩 
case['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(case['CARD_SIDO_NM'])
case['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(case['STD_CLSS_NM'])

In [26]:
submission = pd.read_csv("C:/Users/kyh20/Downloads/jeju_data_ver1/submission.csv", index_col = 0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(case, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left').fillna(0)
submission.index.name = 'id'
submission.to_csv('C:/Users/kyh20/Downloads/jeju_data_ver1/lgbm.csv', encoding='utf-8-sig')
submission

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,5.264703e+08
1,202004,강원,골프장 운영업,2.386900e+09
2,202004,강원,과실 및 채소 소매업,3.279040e+08
3,202004,강원,관광 민예품 및 선물용품 소매업,5.535271e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.000000e+00
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,1.247464e+08
1390,202007,충북,한식 음식점업,5.202751e+09
1391,202007,충북,호텔업,1.124047e+08
1392,202007,충북,화장품 및 방향제 소매업,1.162358e+08
