In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import re
info_df = pd.read_csv('/content/drive/MyDrive/jeju.csv', on_bad_lines='skip')

rev_df = pd.read_csv('/content/drive/MyDrive/jeju_Review.csv', on_bad_lines='skip')

tour_df = pd.read_csv('/content/drive/MyDrive/jeju_Tour.csv', on_bad_lines='skip')


print(info_df.columns.tolist())


['id', 'placeID', 'MCT_NM', 'MCT_NAVER_NAME', 'UE_CNT_GRP', 'UE_AMT_GRP', 'MON_UE_CNT_RAT', 'TUE_UE_CNT_RAT', 'WED_UE_CNT_RAT', 'THU_UE_CNT_RAT', 'FRI_UE_CNT_RAT', 'SAT_UE_CNT_RAT', 'SUN_UE_CNT_RAT', 'OP_YMD', 'HR_5_11_UE_CNT_RAT', 'HR_12_13_UE_CNT_RAT', 'HR_14_17_UE_CNT_RAT', 'HR_18_22_UE_CNT_RAT', 'HR_23_4_UE_CNT_RAT', 'UE_AMT_PER_TRSN_GRP', 'LOCAL_UE_CNT_RAT', 'RC_M12_MAL_CUS_CNT_RAT', 'RC_M12_FME_CUS_CNT_RAT', 'RC_M12_AGE_UND_20_CUS_CNT_RAT', 'RC_M12_AGE_30_CUS_CNT_RAT', 'RC_M12_AGE_40_CUS_CNT_RAT', 'RC_M12_AGE_50_CUS_CNT_RAT', 'RC_M12_AGE_OVR_60_CUS_CNT_RAT', 'MCT_TYPE', 'MCT_NAVER_TYPE', 'ADDR', 'WT', 'EWT', 'CD', 'NAVER_ADDR', 'PHONE', 'AMENITY', 'PAYMENT', 'TOTAL_REVIEW_NUM', 'TOTAL_BLOG_REVIEW_NUM', 'BOSS_TIP', 'AUTH_TITLE', 'crawling_main_ver', 'crawling_info_ver', 'crawling_review_ver', 'crawling_menu_ver', 'AUTH_CONTENT', 'keywords', 'keywords_embeddings', 'original_type', 'wheelchair_access', 'Closed', 'Latitude', 'Longitude']


In [4]:
def extract_date(x):
    try:
        if pd.isnull(x):
            return pd.NaT

        # 문자열 변환 후 숫자만 추출
        x_str = re.sub(r'\D', '', str(x))  # 숫자만 남기기

        if len(x_str) >= 8:
            y, m, d = x_str[:4], x_str[4:6], x_str[6:8]
            date_str = f"{y}-{m}-{d}"
            return pd.to_datetime(date_str, errors='coerce')
        else:
            return pd.NaT
    except:
        return pd.NaT


In [5]:

# OP_YMD 열 전처리
info_df['OP_YMD_CLEAN'] = info_df['OP_YMD'].apply(extract_date)

# 결과 확인
print(info_df['OP_YMD_CLEAN'].head())
print(info_df['OP_YMD_CLEAN'].dtype)  # datetime64[ns]

0   2015-03-27
1   2009-02-17
2   2021-12-09
3   2014-01-10
4   2023-11-29
Name: OP_YMD_CLEAN, dtype: datetime64[ns]
datetime64[ns]


In [6]:
print(info_df['OP_YMD_CLEAN'].isna().sum())  # NaT 개수 확인

info_df['OP_YEAR'] = info_df['OP_YMD_CLEAN'].dt.year
info_df['OP_MONTH'] = info_df['OP_YMD_CLEAN'].dt.month
info_df['OP_DAY'] = info_df['OP_YMD_CLEAN'].dt.day
info_df['OP_WEEKDAY'] = info_df['OP_YMD_CLEAN'].dt.weekday  # 월:0 ~ 일:6

0


In [7]:
mapping_ordinal = {
    '1_상위 10% 이하': 1,
    '2_10~25%': 2,
    '3_25~50%': 3,
    '4_50~75%': 4,
    '5_75~90%': 5,
    '6_90% 초과': 6,
}


info_df['UE_CNT_GRP_NUM'] = info_df['UE_CNT_GRP'].map(mapping_ordinal)
info_df['UE_AMT_GRP_NUM'] = info_df['UE_AMT_GRP'].map(mapping_ordinal)
info_df['UE_AMT_PER_TRSN_GRP_NUM'] = info_df['UE_AMT_PER_TRSN_GRP'].map(mapping_ordinal)

In [11]:
info_df['UE_CNT_GRP_NUM'].value_counts()
info_df['UE_AMT_GRP_NUM'].value_counts()

Unnamed: 0_level_0,count
UE_AMT_GRP_NUM,Unnamed: 1_level_1
3,2252
2,2096
1,1759
4,1686
5,898
6,561


In [12]:
info_df['UE_AMT_PER_TRSN_GRP_NUM'].value_counts()

Unnamed: 0_level_0,count
UE_AMT_PER_TRSN_GRP_NUM,Unnamed: 1_level_1
3,2304
4,2300
2,1358
5,1290
6,1161
1,839


In [13]:
info_df['UE_AMT_GRP_NUM'].value_counts()

Unnamed: 0_level_0,count
UE_AMT_GRP_NUM,Unnamed: 1_level_1
3,2252
2,2096
1,1759
4,1686
5,898
6,561


In [14]:
# 제거할 컬럼 리스트
columns_to_drop = ['TOTAL_REVIEW_NUM', 'TOTAL_BLOG_REVIEW_NUM', 'EWT', 'CD', 'NAVER_ADDR']

# 해당 컬럼 제거
info_df = info_df.drop(columns=columns_to_drop)

In [15]:
info_df

Unnamed: 0,id,placeID,MCT_NM,MCT_NAVER_NAME,UE_CNT_GRP,UE_AMT_GRP,MON_UE_CNT_RAT,TUE_UE_CNT_RAT,WED_UE_CNT_RAT,THU_UE_CNT_RAT,...,Latitude,Longitude,OP_YMD_CLEAN,OP_YEAR,OP_MONTH,OP_DAY,OP_WEEKDAY,UE_CNT_GRP_NUM,UE_AMT_GRP_NUM,UE_AMT_PER_TRSN_GRP_NUM
0,1,,(사)한국수상레저안전협회 제주제주시지부,,1_상위 10% 이하,1_상위 10% 이하,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,2015-03-27,2015,3,27,4,1,1,6
1,2,1.418973e+09,(유)아웃백스테이크하우스 제주아일랜드점,아웃백스테이크하우스 제주아일랜드점,6_90% 초과,6_90% 초과,0.188065,0.083183,0.108499,0.083183,...,33.481149,126.502462,2009-02-17,2009,2,17,1,6,6,5
2,3,,(유)케이디에셋 담앤루,,4_50~75%,1_상위 10% 이하,0.214286,0.204082,0.030612,0.000000,...,33.245852,126.451468,2021-12-09,2021,12,9,3,4,1,2
3,4,1.463425e+09,(주) 베이힐,베이힐풀앤빌라,2_10~25%,5_75~90%,0.086957,0.086957,0.260870,0.043478,...,33.237583,126.372076,2014-01-10,2014,1,10,4,2,5,6
4,5,1.247913e+09,(주) 비케이알 버거킹 제주화북DT점,버거킹 제주화북DT점,6_90% 초과,5_75~90%,0.115321,0.125206,0.146623,0.148270,...,33.517639,126.562626,2023-11-29,2023,11,29,2,6,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9247,9248,1.996719e+09,히아담,히아담,3_25~50%,2_10~25%,0.096774,0.112903,0.209677,0.161290,...,33.476084,126.486324,2019-01-24,2019,1,24,3,3,2,3
9248,9249,1.011247e+09,히치하이커스라운지,,2_10~25%,2_10~25%,0.000000,0.217391,0.173913,0.086957,...,33.499090,126.528797,2020-07-10,2020,7,10,4,2,2,4
9249,9250,1.762730e+09,히포파운드,,4_50~75%,2_10~25%,0.185185,0.000000,0.000000,0.000000,...,33.485546,126.460446,2021-12-22,2021,12,22,2,4,2,2
9250,9251,3.216359e+07,힘찬장어,,1_상위 10% 이하,1_상위 10% 이하,0.166667,0.000000,0.333333,0.000000,...,33.490559,126.492486,2015-09-10,2015,9,10,3,1,1,6
