In [1]:
# 행정동 별 실 거주 인구 데이터의 양이 많아 Huggingface에 업로드 해 두었습니다. 아래의 링크에 들어가 LocalPeople.zip 파일을 받아주셔야 합니다.
# 데이터를 다운 후 압축을 해제하시고 Data 폴더 안으로 옮겨주세요.
# https://huggingface.co/datasets/uhjin1130/LocalPeople/blob/main/LocalPeople.zip

In [2]:
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# 실거주 인구 데이터 로딩
def load_people_data(year, base_path):
    files = sorted([
        f for f in os.listdir(base_path)
        if f.startswith(f'LOCAL_PEOPLE_DONG_{year}') and f.endswith('.csv')
    ])
    df_list = []
    for f in files:
        path = os.path.join(base_path, f)
        try:
            df = pd.read_csv(path, encoding='utf-8')
        except UnicodeDecodeError:
            df = pd.read_csv(path, encoding='cp949')

        def looks_like_dongcode(series):
            return series.astype(str).str.match(r'^11\d{6}$').sum()

        score1 = looks_like_dongcode(df['행정동코드'])
        score2 = looks_like_dongcode(df['시간대구분'])

        if score2 > score1:
            df['행정동코드'] = df['시간대구분'].astype(int).astype(str).str.zfill(8)
        else:
            df['행정동코드'] = df['행정동코드'].astype(float).astype(int).astype(str).str.zfill(8)

        df_list.append(df)

    if not df_list:
        return pd.DataFrame()

    df_year = pd.concat(df_list)
    df_year = df_year[df_year['행정동코드'].notna()]
    df_grouped = df_year.groupby('행정동코드').mean(numeric_only=True).reset_index()
    return df_grouped

# 상권 분석 데이터 로딩
def load_biz_data(year, base_path):
    path = os.path.join(base_path, f'Trading_Area_{year}.csv')
    if not os.path.exists(path):
        return pd.DataFrame()
    df = pd.read_csv(path, encoding='utf-8')
    df['행정동_코드'] = df['행정동_코드'].astype(str).str.zfill(8)
    df_grouped = df.groupby(['행정동_코드', '서비스_업종_코드_명'])['당월_매출_금액'].mean().reset_index()
    return df_grouped

# 연도별 분석
def analyze_year(year, people_dir, biz_dir, save_dir):
    people_df = load_people_data(year, people_dir)
    biz_df = load_biz_data(year, biz_dir)
    if people_df.empty or biz_df.empty:
        return None

    merged = pd.merge(biz_df, people_df, left_on='행정동_코드', right_on='행정동코드', how='inner')
    if merged.empty:
        return None

    null_ratio = merged.isnull().mean()
    drop_cols = null_ratio[null_ratio > 0.5].index.tolist()
    merged = merged.drop(columns=drop_cols)

    X = merged.drop(columns=['행정동_코드', '서비스_업종_코드_명', '당월_매출_금액', '행정동코드'])
    y = merged['당월_매출_금액']
    info = merged[['행정동_코드', '서비스_업종_코드_명']].copy()

    # 결측값 처리
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # 이상치 클리핑 및 로그 변환
    y_clipped = np.clip(y, None, np.quantile(y, 0.99))
    y_log = np.log1p(y_clipped)

    # 학습/검증 데이터 분할
    X_train, X_test, y_train_log, y_test_log = train_test_split(
        X_imputed, y_log, test_size=0.2, random_state=42
    )

    # 모델 학습 및 예측 (로그 공간)
    model = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train_log)
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test = np.expm1(y_test_log)

    # 평가 지표 계산 (MAPE 제외)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"[{year}] RMSE: {rmse:.2f}, MAE: {mae:.2f}, MSE: {mse:.2f}")


    # 전체 데이터 재학습 및 예측
    model.fit(X_imputed, y_log)
    y_full_log_pred = model.predict(X_imputed)
    y_full_pred = np.expm1(y_full_log_pred)

    info['총매출'] = y.values
    info['예측_총매출'] = y_full_pred
    info['연도'] = year
    info['순위'] = info.groupby('행정동_코드')['총매출'].rank(method='dense', ascending=False)

    # 매출 증감률 계산
    if year > 2019:
        prev_path = os.path.join(save_dir, f'Analyze_{year - 1}_LocalPeople.csv')
        if os.path.exists(prev_path):
            prev_df = pd.read_csv(prev_path, encoding='utf-8-sig')
            prev_df['행정동코드'] = prev_df['행정동코드'].astype(str).str.zfill(8)
            prev_df['업종'] = prev_df['업종'].astype(str).str.strip()
            info['서비스_업종_코드_명'] = info['서비스_업종_코드_명'].astype(str).str.strip()

            info = pd.merge(
                info,
                prev_df[['행정동코드', '업종', '총매출']],
                left_on=['행정동_코드', '서비스_업종_코드_명'],
                right_on=['행정동코드', '업종'],
                how='left',
                suffixes=('', '_prev')
            )
            info['총매출_prev'] = info['총매출_prev'].replace(0, pd.NA)
            info['매출증감률'] = ((info['총매출'] - info['총매출_prev']) / info['총매출_prev']) * 100
            info['매출증감률'] = info['매출증감률'].fillna(0)
        else:
            info['매출증감률'] = 0
    else:
        info['매출증감률'] = 0

    result = info[['연도', '행정동_코드', '서비스_업종_코드_명', '총매출', '매출증감률', '순위']].copy()
    result.rename(columns={'행정동_코드': '행정동코드', '서비스_업종_코드_명': '업종'}, inplace=True)
    result = result.sort_values(by=['행정동코드', '순위'])
    result.to_csv(os.path.join(save_dir, f'Analyze_{year}_LocalPeople.csv'), index=False, encoding='utf-8-sig')
    return result

# 전체 실행
def run_all(people_dir, biz_dir, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    all_data = []

    for year in range(2019, 2025):
        result = analyze_year(year, people_dir, biz_dir, save_dir)
        if result is not None:
            all_data.append(result)
        else:
            print(f"{year}년 분석 실패")

    if not all_data:
        raise ValueError("분석 데이터가 존재하지 않아 학습할 수 없습니다.")

# 실행
run_all('../../Data/LocalPeople', '../../Data/Trading_Area', './outputs')


[2019] RMSE: 1923899024.78, MAE: 784828575.38, MSE: 3701387457540995584.00
[2020] RMSE: 1902084354.49, MAE: 785505968.67, MSE: 3617924891605251072.00
[2021] RMSE: 2031885846.87, MAE: 849122156.66, MSE: 4128560094716606976.00
[2022] RMSE: 2388144625.90, MAE: 976566169.52, MSE: 5703234754227623936.00
[2023] RMSE: 2581257932.75, MAE: 1023838211.72, MSE: 6662892515407363072.00
[2024] RMSE: 2530136928.64, MAE: 1021625960.92, MSE: 6401592877645485056.00


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# 1. 데이터 로드
years = list(range(2019, 2025))
df_list = []
for y in years:
    df = pd.read_csv(f'./outputs/Analyze_{y}_LocalPeople.csv')
    df["연도"] = y
    df["행정동코드"] = df["행정동코드"].astype(str).str.zfill(8)
    df_list.append(df)
data = pd.concat(df_list, ignore_index=True)

# 2. 매핑 테이블 로드
dong_map = pd.read_excel('../../Data/행정동코드_매핑정보.xlsx', header=1)
dong_map = dong_map[['H_DNG_CD', 'H_DNG_NM']]
dong_map.columns = ['행정동코드', '행정동명']
dong_map['행정동코드'] = dong_map['행정동코드'].astype(str).str.zfill(8)

# 3. 범주형 변수 인코딩
le_dong = LabelEncoder()
le_biz = LabelEncoder()
data['행정동코드_enc'] = le_dong.fit_transform(data['행정동코드'])
data['업종_enc'] = le_biz.fit_transform(data['업종'])

# 4. 로그 변환 적용
data['총매출_log'] = np.log1p(data['총매출'])

X = data[['연도', '행정동코드_enc', '업종_enc']]
y = data['총매출_log']

# 5. 모델 학습 (로그 공간)
model = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
model.fit(X, y)

# 6. 2025년 예측용 조합 생성
dong_list = sorted(data['행정동코드'].unique())
biz_list = sorted(data['업종'].unique())
pred_df = pd.DataFrame([
    {"연도": 2025, "행정동코드": d, "업종": b}
    for d in dong_list for b in biz_list
])
pred_df['행정동코드_enc'] = le_dong.transform(pred_df['행정동코드'])
pred_df['업종_enc'] = le_biz.transform(pred_df['업종'])

# 7. 예측 (로그 공간 → 역변환)
X_pred = pred_df[['연도', '행정동코드_enc', '업종_enc']]
pred_df['예측_총매출'] = np.expm1(model.predict(X_pred))

# 8. 행정동명 병합 및 매출 증감률 반영
pred_df = pred_df.merge(dong_map, on='행정동코드', how='left')
avg_rate = data.groupby(['행정동코드', '업종'])['매출증감률'].mean().reset_index()
avg_rate.columns = ['행정동코드', '업종', '과거_평균_증감률']
pred_df = pred_df.merge(avg_rate, on=['행정동코드', '업종'], how='left')
pred_df['과거_평균_증감률'] = pred_df['과거_평균_증감률'].fillna(0)

# 9. 점수 계산 및 클리핑
pred_df['예측_점수'] = pred_df['예측_총매출'] * (1 + pred_df['과거_평균_증감률'].clip(lower=-0.99))
pred_df['예측_점수'] = pred_df['예측_점수'].clip(lower=0)
pred_df['순위'] = pred_df.groupby('행정동코드')['예측_점수'].rank(ascending=False, method='min')
pred_df['연도'] = 2025

# 10. 최종 저장
final = pred_df[['연도', '행정동코드', '행정동명', '업종', '예측_총매출', '순위']]
final = final.sort_values(by=['행정동코드', '순위'])
final.to_csv('./Predicted_2025_Top_Business_LocalPeople_RF_LogApplied.csv', index=False, encoding='utf-8-sig')

final.head(10)


Unnamed: 0,연도,행정동코드,행정동명,업종,예측_총매출,순위
44,2025,11110515,청운효자동,전자상거래업,39671320.0,1.0
52,2025,11110515,청운효자동,커피-음료,2456314000.0,2.0
36,2025,11110515,청운효자동,인테리어,198566000.0,3.0
57,2025,11110515,청운효자동,한식음식점,5110614000.0,4.0
27,2025,11110515,청운효자동,양식음식점,2647965000.0,5.0
47,2025,11110515,청운효자동,중식음식점,348565200.0,6.0
62,2025,11110515,청운효자동,화초,280324700.0,7.0
46,2025,11110515,청운효자동,조명용품,35005610000.0,8.0
60,2025,11110515,청운효자동,호프-간이주점,560389000.0,9.0
37,2025,11110515,청운효자동,일반교습학원,537073300.0,10.0
