In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
# Load the raw input table; the file name suggests it covers 2019-01 through 2020-03 (not verified here).
data = pd.read_csv('./jeju_data_201901-202003/201901-202003.csv')

In [3]:
def grap_year(data):
    """Return the first four characters of ``str(data)`` as an int (the year part)."""
    year_digits = str(data)[:4]
    return int(year_digits)

def grap_month(data):
    """Return everything after the fourth character of ``str(data)`` as an int (the month of a YYYYMM value)."""
    month_digits = str(data)[4:]
    return int(month_digits)

def grap_month_covid(data):
    """Return characters 5-6 of ``str(data)`` as an int (the month of a 'YYYY-MM-DD' string)."""
    month_digits = str(data)[5:7]
    return int(month_digits)

def grap_date_covid(data):
    """Return characters 8-9 of ``str(data)`` as an int (the day of a 'YYYY-MM-DD' string)."""
    day_digits = str(data)[8:10]
    return int(day_digits)

In [18]:
# COVID-19 data processing
# Keep only the month-end snapshots, split the date into month / year columns and
# sort so that each province's rows are contiguous and in chronological order.
# Built as one chain so every step works on a fresh frame: assigning columns on a
# boolean-filtered slice (as before) triggers pandas' SettingWithCopyWarning.
covid = (
    pd.read_csv('./Covid19.csv')
    .drop(['time','released','deceased'],axis=1)
    .loc[lambda frame: frame['date'].isin(['2020-01-31','2020-02-29','2020-03-31','2020-04-30'])]
    .assign(
        month=lambda frame: frame['date'].apply(grap_month_covid),
        year=lambda frame: frame['date'].apply(grap_year),
    )
    .drop(['date'],axis=1)
    .sort_values(by=['province','year','month'])
)

# Province names as they appear in the COVID file (be) and their Korean short
# forms (af); the two lists are paired position by position.
be = ['Seoul', 'Busan', 'Daegu', 'Incheon', 'Gwangju', 'Daejeon',
       'Ulsan', 'Sejong', 'Gyeonggi-do', 'Gangwon-do',
       'Chungcheongbuk-do', 'Chungcheongnam-do', 'Jeollabuk-do',
       'Jeollanam-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do']
af = ['서울', '부산', '대구', '인천', '광주', '대전', '울산', '세종', '경기', '강원', '충북',
       '충남', '전북', '전남', '경북', '경남', '제주']


In [19]:
# Translate the English province names to their Korean short forms in one pass.
# The previous loop called `.replace(..., inplace=True)` on `covid['province']`,
# a chained in-place update that does not write back under pandas Copy-on-Write,
# and it indexed `be`/`af` by the number of unique provinces, which raises
# IndexError if the file contains more provinces than the lists do.
covid['province'] = covid['province'].replace(dict(zip(be, af)))

# Working copy that later cells modify; `covid` itself keeps the original counts.
covid_eda = covid.copy()

In [23]:
# Persist the preprocessed COVID table.
# NOTE(review): this cell's execution count (In [23]) is higher than that of the
# differencing cell placed after it (In [21]), so the saved file was presumably
# written after differencing. A top-to-bottom "Run All" would instead save the
# un-differenced values — confirm the intended order and move the cell if needed.
covid_eda.to_csv("./preprocissing_covid19.csv",index=False)

In [21]:
# Replace each month-end `confirmed` value with its difference from the previous
# row whenever that previous row belongs to the same province (rows are already
# sorted by province, year, month). The first row of each province keeps its
# original value. Columns are addressed by name rather than by position, and the
# row-by-row iloc loop is replaced by vectorised shift/where operations.
previous_province = covid['province'].shift()
previous_confirmed = covid['confirmed'].shift()
same_province = covid['province'].eq(previous_province)
monthly_confirmed = covid['confirmed'].where(~same_province, covid['confirmed'] - previous_confirmed)
# `where` upcasts to float because of the NaN introduced by shift(); restore the source dtype.
covid_eda['confirmed'] = monthly_confirmed.astype(covid['confirmed'].dtype)

# Use the card data's column name so the two tables can be joined on it.
covid_eda = covid_eda.rename(columns={'province':'CARD_SIDO_NM'})

In [22]:
# Display the per-province COVID table after differencing and renaming.
covid_eda

Unnamed: 0,CARD_SIDO_NM,confirmed,month,year
188,부산,0,1,2020
681,부산,80,2,2020
1208,부산,39,3,2020
1718,부산,18,4,2020
197,충북,0,1,2020
...,...,...,...,...
1717,서울,183,4,2020
193,울산,0,1,2020
686,울산,17,2,2020
1213,울산,22,3,2020


In [None]:
# 날짜 처리
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [None]:
# Data cleaning: drop the district-level and sex columns, sum the remaining
# value columns per key combination, and order the rows chronologically.
# `year` stays in the frame: it is one of the columns shared with covid_eda
# that the later merge joins on.
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE','year','FLC','month']
df = (
    data
    .drop(columns=['CARD_CCG_NM', 'HOM_CCG_NM','SEX_CTGO_CD'])
    .groupby(columns)
    .sum()
    .reset_index()
    .sort_values(by=['year', 'month'])
)

In [None]:
# Attach the monthly confirmed-case counts to every card-spending row.
# The join keys are spelled out instead of being inferred from the shared column
# names: if covid_eda were not renamed yet, an inferred join would silently fall
# back to (year, month) only, whereas an explicit `on` fails loudly.
df_covid = pd.merge(df, covid_eda, how='left', on=['CARD_SIDO_NM', 'year', 'month'])
# Rows without a matching COVID record get 0.
df_covid = df_covid.fillna(0)

In [None]:
# Encoding: fit one LabelEncoder per object-typed column, then replace each such
# column with its integer codes. The fitted encoders are kept in `encoders`
# (keyed by column name) so the codes can be mapped back later.
dtypes = df_covid.dtypes
object_columns = [name for name in df_covid.columns if str(dtypes[name]) == 'object']
encoders = {name: LabelEncoder().fit(df_covid[name]) for name in object_columns}

for name, fitted_encoder in encoders.items():
    df_covid[name] = fitted_encoder.transform(df_covid[name])

In [None]:
# Set features and target
# Features: every column except the three value columns; target: raw AMT
# (no log1p transform is applied here).
y = df_covid.loc[:, 'AMT']
x = df_covid.drop(columns=['CSTMR_CNT', 'AMT', 'CNT'])

In [None]:
# Display the merged frame (card data joined with COVID counts).
df_covid

In [None]:
# Convert features and target to plain NumPy arrays (column labels are lost from here on).
x, y = np.array(x), np.array(y)

In [None]:
## Loss definition
# RMSLE is computed with log1p() instead of log() because of NaN-type issues in the log transform
def rmsle(y, pred):
    """Return the root mean squared logarithmic error between `y` and `pred`.

    Both inputs are passed through ``np.log1p`` before the squared differences
    are averaged and square-rooted.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_gap ** 2))

# def mse_AIFrenz(y_true, y_pred, **kwargs):
#     '''
#     y_true: 실제 값
#     y_pred: 예측 값
#     '''
#     diff = abs(y_true - y_pred)
    
#     less_then_one = np.where(diff < 1, 0, diff)
    
#     score = np.average(np.average(less_then_one ** 2, axis = 0))
#     return score

## Custom LightGBM evaluation metric
def rmsle_score(pred, dataset):
    """Evaluate RMSLE of `pred` against the labels stored in `dataset`.

    Returns a ``(name, value, is_higher_better)`` tuple: the name is 'score' and
    the flag is False.
    """
    labels = dataset.get_label()
    log_gap = np.log1p(labels) - np.log1p(pred)
    value = np.sqrt(np.mean(log_gap ** 2))
    return 'score', value, False

In [None]:
def lgb_train(x_train,y_train,params):
    """Train a LightGBM booster on the given data and return it.

    Args:
        x_train: feature matrix passed to ``lgb.Dataset``.
        y_train: target values passed to ``lgb.Dataset``.
        params: LightGBM parameter dict forwarded to ``lgb.train``.

    Returns:
        The booster returned by ``lgb.train``. The training set itself is used
        as the only validation set and is scored with ``rmsle_score``.
    """
    # Named `train_set` so the local no longer shadows this function's own name.
    train_set = lgb.Dataset(x_train,y_train)

    # `verbose_eval` was removed from lgb.train in LightGBM 4.0; newer releases
    # silence per-iteration evaluation output through the log_evaluation callback.
    # Pick whichever mechanism the installed version provides.
    if hasattr(lgb, 'log_evaluation'):
        quiet_kwargs = {'callbacks': [lgb.log_evaluation(period=0)]}
    else:
        quiet_kwargs = {'verbose_eval': False}

    lgb_model = lgb.train(params,
                          train_set,
                          feval=rmsle_score,
                          valid_sets=[train_set],
                          **quiet_kwargs)
    return lgb_model

# Basic LightGBM parameters
params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    # 'rmsle_score' is not a built-in LightGBM metric name; the RMSLE metric is
    # supplied to lgb.train through `feval`. The string 'None' registers no
    # built-in metric, so only the custom one is evaluated.
    'metric' : 'None',
    'boosting_type' : 'gbdt',
    'n_jobs': -1,
}



In [None]:
def k_cross_validation(k,t,delta):
    """Expanding-window validation over the module-level `x` / `y` arrays.

    Fold ``i`` (0-based) trains on rows ``[0, t + (i+1)*delta)`` and validates on
    the next ``delta`` rows, ``[t + (i+1)*delta, t + (i+2)*delta)``. The previous
    upper bound ``t + 2*(i+1)*delta`` made later validation windows grow and
    overlap instead of sliding forward by one ``delta`` per fold.

    Args:
        k: number of folds.
        t: base offset (in rows) of the training window.
        delta: number of rows by which the training window grows per fold, and
            the size of each validation window.

    Returns:
        Mean RMSLE over the ``k`` folds.

    Raises:
        ValueError: if a validation window contains no rows.
    """
    total_error = 0
    for i in range(k):
        train_end = t + (i + 1) * delta
        val_end = train_end + delta

        x_train, y_train = x[:train_end], y[:train_end]
        x_val, y_val = x[train_end:val_end], y[train_end:val_end]
        if len(y_val) == 0:
            # An empty window would make the mean in rmsle() NaN and poison the average.
            raise ValueError(
                'validation window for fold {} is empty (rows {}:{})'.format(i, train_end, val_end)
            )

        lgb_model = lgb_train(x_train, y_train, params)
        pred = lgb_model.predict(x_val)

        # Score once per fold and reuse the value for both the log line and the sum.
        fold_error = rmsle(y_val, pred)
        print('RMSLE : {}'.format(fold_error))
        total_error += fold_error
    return total_error / k

In [None]:
# Validation settings: 3 folds, base offset at 11/15 of the rows, step of 1/15 of the rows.
k, t, delta = 3, int(len(x)*(11/15)), int(len(x)*(1/15))

# Mean RMSLE across the folds.
error = k_cross_validation(k,t,delta)

In [None]:
# Build the prediction template
# The previous version referenced `df_num` (never defined), the column
# SEX_CTGO_CD (dropped during cleaning) and `x.columns` (x is a NumPy array by
# now), and it lacked the `confirmed` feature the model is trained on.

# Feature columns in the same order as the training matrix (df_covid minus the value columns).
feature_columns = [name for name in df_covid.columns if name not in ('CSTMR_CNT', 'AMT', 'CNT')]

years         = [2020]
months        = [4, 7]

# Cartesian product of every encoded category level with the target periods.
category_columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'FLC']
levels = [df_covid[name].unique() for name in category_columns] + [years, months]
temp = pd.MultiIndex.from_product(
    levels, names=category_columns + ['year', 'month']
).to_frame(index=False)

# Attach monthly confirmed cases. covid_eda still holds province names as
# strings, so encode them with the fitted encoder first; provinces the encoder
# has never seen are dropped because transform() would raise on them.
province_encoder = encoders['CARD_SIDO_NM']
covid_lookup = covid_eda[covid_eda['CARD_SIDO_NM'].isin(province_encoder.classes_)].copy()
covid_lookup['CARD_SIDO_NM'] = province_encoder.transform(covid_lookup['CARD_SIDO_NM'])
temp = temp.merge(covid_lookup, on=['CARD_SIDO_NM', 'year', 'month'], how='left')
# Periods without a COVID row (e.g. month 7, which is not among the loaded
# snapshot dates) get 0, matching how unmatched training rows were filled.
temp['confirmed'] = temp['confirmed'].fillna(0)

temp = temp[feature_columns]

In [None]:
# Prediction
# `model` was never defined in this notebook (the booster in the validation
# helper is local to that function), so fit the final model on all rows here.
model = lgb_train(x, y, params)
pred = model.predict(temp)
# The target is the raw AMT (no log1p was applied when y was built), so the
# predictions are already on the AMT scale and must not go through expm1.
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
# Keep only the submission keys and sum AMT over the remaining dimensions.
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [None]:
# Build the submission file: take the key columns from the template CSV,
# left-join the predicted AMT onto them and write the result back out.
join_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = (
    pd.read_csv('./submission.csv', index_col=0)
    .drop(columns=['AMT'])
    .merge(temp, on=join_keys, how='left')
)
submission.index.name = 'id'
submission.to_csv('submission.csv', encoding='utf-8-sig')
submission.head()