In [1]:
# 행정동별 실거주 인구 데이터는 용량이 커서 Hugging Face에 업로드해 두었습니다. 아래 링크에서 Data.zip 파일을 내려받아 주세요.
# 압축을 해제한 뒤, 내려받은 Data 폴더 안의 LocalPeople 폴더를 프로젝트의 Data 폴더 안(Trading_Area 폴더와 같은 위치)으로 옮겨 주세요.
# https://huggingface.co/datasets/uhjin1130/LocalPeople/blob/main/Data.zip

In [2]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# 실거주 인구 데이터 로딩
def load_people_data(year, base_path):
    """Load one year of resident-population CSVs and average them per dong code.

    Every file in ``base_path`` whose name starts with
    ``LOCAL_PEOPLE_DONG_{year}`` and ends with ``.csv`` is read (UTF-8 first,
    CP949 on a decode error). Its dong code is normalised to an 8-character
    zero-padded string and all numeric columns are averaged per dong code
    across the files.

    In some files the dong code seems to sit under the '시간대구분' header
    instead of '행정동코드' (presumably a shifted header - TODO confirm), so
    whichever of the two columns holds more values shaped like ``11xxxxxx``
    is used as the code source.
    NOTE(review): when '시간대구분' wins, the other columns are kept as read;
    if the whole header is shifted their labels may be off as well.

    Rows whose code is missing, non-numeric or non-finite are dropped instead
    of aborting the whole load.

    Args:
        year: Year used to select files by name prefix.
        base_path: Directory containing the population CSV files.

    Returns:
        DataFrame with a '행정동코드' column plus the per-code mean of every
        numeric column, or an empty DataFrame when no file matches.

    Raises:
        KeyError: If a file lacks the '행정동코드' or '시간대구분' column.
        OSError: If ``base_path`` cannot be listed.
    """
    dong_code_pattern = r'^11\d{6}$'

    def read_csv_any_encoding(path):
        # The files are not uniformly encoded: try UTF-8, then fall back to CP949.
        try:
            return pd.read_csv(path, encoding='utf-8')
        except UnicodeDecodeError:
            return pd.read_csv(path, encoding='cp949')

    def parse_codes(series):
        # Returns (row_mask, codes): which rows hold a finite numeric value,
        # and those values as zero-padded 8-character strings.
        numeric = pd.to_numeric(series, errors='coerce').to_numpy(dtype='float64')
        usable = ~pd.isna(numeric) & (abs(numeric) != float('inf'))
        codes = pd.Series(numeric[usable]).astype('int64').astype(str).str.zfill(8)
        return usable, codes

    files = sorted(
        f for f in os.listdir(base_path)
        if f.startswith(f'LOCAL_PEOPLE_DONG_{year}') and f.endswith('.csv')
    )

    frames = []
    for name in files:
        df = read_csv_any_encoding(os.path.join(base_path, name))

        own_mask, own_codes = parse_codes(df['행정동코드'])
        alt_mask, alt_codes = parse_codes(df['시간대구분'])
        own_score = own_codes.str.match(dong_code_pattern).sum()
        alt_score = alt_codes.str.match(dong_code_pattern).sum()

        if alt_score > own_score:
            mask, codes = alt_mask, alt_codes
        else:
            mask, codes = own_mask, own_codes

        # Positional mask/assignment: the frame's index may contain duplicates,
        # so label-based alignment is avoided on purpose.
        df = df.loc[mask].copy()
        df['행정동코드'] = codes.to_numpy()
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    df_year = pd.concat(frames)
    return df_year.groupby('행정동코드').mean(numeric_only=True).reset_index()

# 상권 분석 데이터 로딩
def load_biz_data(year, base_path):
    """Load one year of trading-area sales and average them per dong and category.

    Reads ``Trading_Area_{year}.csv`` from ``base_path`` (UTF-8 first, CP949 on
    a decode error, matching the population loader), normalises '행정동_코드'
    to an 8-character zero-padded string and returns the mean of
    '당월_매출_금액' for every ('행정동_코드', '서비스_업종_코드_명') pair.

    Rows without a dong code are dropped, and float-typed codes are cast to
    integers first so they do not turn into keys such as '11110515.0'.

    Args:
        year: Year used to build the file name.
        base_path: Directory containing the trading-area CSV files.

    Returns:
        DataFrame with columns '행정동_코드', '서비스_업종_코드_명' and
        '당월_매출_금액', or an empty DataFrame when the file does not exist.
    """
    path = os.path.join(base_path, f'Trading_Area_{year}.csv')
    if not os.path.exists(path):
        return pd.DataFrame()

    try:
        df = pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding='cp949')

    df = df[df['행정동_코드'].notna()].copy()
    codes = df['행정동_코드']
    if pd.api.types.is_float_dtype(codes):
        # A float column would stringify as '11110515.0' and never join.
        codes = codes.astype('int64')
    df['행정동_코드'] = codes.astype(str).str.zfill(8)

    return (
        df.groupby(['행정동_코드', '서비스_업종_코드_명'])['당월_매출_금액']
        .mean()
        .reset_index()
    )

# 연도별 분석
def analyze_year(year, people_dir, biz_dir, save_dir):
    """Build, save and return the per-dong / per-category sales summary for one year.

    The sales table from ``load_biz_data`` is inner-joined with the population
    table from ``load_people_data`` on the dong code, so only dongs present in
    both sources are kept. The population columns themselves are not used
    beyond that filtering. For each remaining (dong, category) row the function
    records the sales value, its dense rank within the dong (1 = highest) and
    the percentage change against the previous year's saved result.

    The previous year's result is looked up in ``save_dir``, first as
    ``Analyze_{year-1}.csv`` and then as ``Analyze_{year-1}_LocalPeople.csv``
    (the name this function itself writes). When neither exists, or for 2019
    (treated as the first year), the change is 0. Rows with no previous value
    or a previous value of 0 also get 0 rather than NaN/inf.

    Args:
        year: Year to analyse.
        people_dir: Directory passed to ``load_people_data``.
        biz_dir: Directory passed to ``load_biz_data``.
        save_dir: Directory where ``Analyze_{year}_LocalPeople.csv`` is written
            and previous results are looked up.

    Returns:
        DataFrame with columns '연도', '행정동코드', '업종', '총매출',
        '매출증감률' (percent) and '순위', or ``None`` when either input is
        empty or the join yields no rows.
    """
    people_df = load_people_data(year, people_dir)
    biz_df = load_biz_data(year, biz_dir)
    if people_df.empty or biz_df.empty:
        return None

    merged = pd.merge(biz_df, people_df, left_on='행정동_코드', right_on='행정동코드', how='inner')
    if merged.empty:
        return None

    info = merged[['행정동_코드', '서비스_업종_코드_명']].copy()
    info['총매출'] = merged['당월_매출_금액'].values
    info['연도'] = year
    info['순위'] = info.groupby('행정동_코드')['총매출'].rank(method='dense', ascending=False)

    # Locate last year's result; the plain name is tried first so existing
    # setups keep resolving the same file.
    prev_path = None
    if year > 2019:
        candidates = (
            os.path.join(save_dir, f'Analyze_{year - 1}.csv'),
            os.path.join(save_dir, f'Analyze_{year - 1}_LocalPeople.csv'),
        )
        prev_path = next((p for p in candidates if os.path.exists(p)), None)

    if prev_path is None:
        info['매출증감률'] = 0
    else:
        prev_df = pd.read_csv(prev_path, encoding='utf-8-sig')
        prev_df['행정동코드'] = prev_df['행정동코드'].astype(str).str.zfill(8)
        prev_df['업종'] = prev_df['업종'].astype(str)
        # One previous value per key, so the left join cannot multiply rows.
        prev_df = prev_df[['행정동코드', '업종', '총매출']].drop_duplicates(['행정동코드', '업종'])
        info = pd.merge(info, prev_df,
                        left_on=['행정동_코드', '서비스_업종_코드_명'],
                        right_on=['행정동코드', '업종'], how='left', suffixes=('', '_prev'))
        prev_total = info['총매출_prev']
        rate = (info['총매출'] - prev_total) / prev_total * 100
        # A previous total of 0 yields inf/NaN; treat it like "no previous data".
        info['매출증감률'] = rate.where(prev_total != 0).fillna(0)

    result = info[['연도', '행정동_코드', '서비스_업종_코드_명', '총매출', '매출증감률', '순위']].rename(
        columns={'행정동_코드': '행정동코드', '서비스_업종_코드_명': '업종'}
    )
    result.to_csv(os.path.join(save_dir, f'Analyze_{year}_LocalPeople.csv'), index=False, encoding='utf-8-sig')
    return result

# 실행
def run_all(people_dir, biz_dir, save_dir, years=range(2019, 2025)):
    """Run ``analyze_year`` for every requested year and write the results to ``save_dir``.

    Years are processed in ascending order because each year's growth rate is
    computed from the previous year's saved file. A year for which
    ``analyze_year`` returns ``None`` is reported on stdout and skipped.

    Args:
        people_dir: Directory with the resident-population CSV files.
        biz_dir: Directory with the trading-area CSV files.
        save_dir: Output directory; created if it does not exist.
        years: Iterable of years to analyse. Defaults to 2019-2024.
            ``analyze_year`` treats 2019 as the first year, with no
            previous-year comparison.

    Raises:
        ValueError: If no year produced a result.
    """
    os.makedirs(save_dir, exist_ok=True)

    succeeded = 0
    for year in sorted(years):
        if analyze_year(year, people_dir, biz_dir, save_dir) is None:
            print(f"{year}년 분석 실패")
        else:
            succeeded += 1

    if succeeded == 0:
        raise ValueError("분석 데이터가 존재하지 않아 학습할 수 없습니다.")

# Run the yearly analysis; paths are relative to this notebook's directory.
run_all(
    people_dir='../../Data/LocalPeople',
    biz_dir='../../Data/Trading_Area',
    save_dir='./outputs',
)


In [3]:
# %pip install catboost
# catboost가 설치되어 있지 않다면 위 줄의 주석을 해제하고 실행해 설치하세요.

In [6]:
import pandas as pd
from catboost import CatBoostRegressor, Pool

# 1. Load the yearly analysis results written by the previous step
years = list(range(2019, 2025))
df_list = []
for y in years:
    try:
        # The files are written with encoding='utf-8-sig'; read them the same way.
        df = pd.read_csv(f'./outputs/Analyze_{y}_LocalPeople.csv', encoding='utf-8-sig')
    except FileNotFoundError:
        # A year whose analysis failed has no output file; train on the rest.
        print(f"{y}년 분석 결과 파일이 없어 건너뜁니다.")
        continue
    df["연도"] = y
    df["행정동코드"] = df["행정동코드"].astype(str).str.zfill(8)
    df_list.append(df)

if not df_list:
    raise FileNotFoundError("./outputs 폴더에 분석 결과 파일이 없습니다.")

data = pd.concat(df_list, ignore_index=True)

# 2. Load the dong code -> dong name mapping table
# (header=1: the second row of the sheet is used as the header row)
dong_map = (
    pd.read_excel('../../Data/행정동코드_매핑정보.xlsx', header=1)
    .loc[:, ['H_DNG_CD', 'H_DNG_NM']]
    .rename(columns={'H_DNG_CD': '행정동코드', 'H_DNG_NM': '행정동명'})
    .assign(행정동코드=lambda frame: frame['행정동코드'].astype(str).str.zfill(8))
)

# 3. Build the training set: year plus two categorical keys -> sales
X = data[['연도', '행정동코드', '업종']]
y = data['총매출']
cat_features = ['행정동코드', '업종']

train_pool = Pool(X, y, cat_features=cat_features)

# 4. Train the model
# Use the GPU only when CatBoost can see one, so the cell also runs on
# CPU-only machines instead of failing at fit time.
from catboost.utils import get_gpu_device_count

task_type = 'GPU' if get_gpu_device_count() > 0 else 'CPU'

model = CatBoostRegressor(
    iterations=3000,           # number of trees
    learning_rate=0.01,        # learning rate
    depth=8,                   # maximum depth of each tree
    l2_leaf_reg=3,             # L2 regularisation to curb overfitting
    loss_function='RMSE',      # minimise root mean squared error
    random_seed=42,            # fixed seed for reproducibility
    cat_features=cat_features, # columns CatBoost must treat as categorical
    task_type=task_type,       # 'GPU' when available, otherwise 'CPU'
    verbose=100                # log every 100 iterations
)

model.fit(train_pool)

# 5. Build every (dong, category) combination to predict for 2025
dong_list = sorted(X['행정동코드'].unique())
biz_list = sorted(X['업종'].unique())
pred_df = pd.DataFrame([
    {"연도": 2025, "행정동코드": d, "업종": b}
    for d in dong_list for b in biz_list
])
pred_pool = Pool(pred_df, cat_features=cat_features)

# 6. Predict
pred_df['예측_총매출'] = model.predict(pred_pool)

# 7. Attach dong names (one name per code, so the merge cannot duplicate rows)
pred_df = pred_df.merge(dong_map.drop_duplicates('행정동코드'), on='행정동코드', how='left')

# 7-1. Historical mean growth rate per (dong, category)
# Infinite rates (previous sales of 0) are ignored so they cannot dominate the mean.
finite_rate = data['매출증감률'].replace([float('inf'), float('-inf')], float('nan'))
avg_rate = (
    data.assign(매출증감률=finite_rate)
    .groupby(['행정동코드', '업종'])['매출증감률']
    .mean()
    .reset_index()
)
avg_rate.columns = ['행정동코드', '업종', '과거_평균_증감률']
pred_df = pred_df.merge(avg_rate, on=['행정동코드', '업종'], how='left')
pred_df['과거_평균_증감률'] = pred_df['과거_평균_증감률'].fillna(0)

# 7-2. Score and rank within each dong
# '매출증감률' is stored in percent, so convert it to a fraction before using
# it as a growth multiplier.
pred_df['예측_점수'] = pred_df['예측_총매출'] * (1 + pred_df['과거_평균_증감률'] / 100)
pred_df['순위'] = pred_df.groupby('행정동코드')['예측_점수'].rank(ascending=False, method='min')

# 8. Save only the columns needed downstream
final = pred_df[['연도', '행정동코드', '행정동명', '업종', '예측_총매출', '순위']]
final = final.sort_values(by=['행정동코드', '순위'])
final.to_csv('./Predicted_2025_Top_Busines_LocalPeople.csv', index=False, encoding='utf-8-sig')





0:	learn: 9950239427.2318611	total: 24.9ms	remaining: 1m 14s
100:	learn: 9580750808.7116680	total: 794ms	remaining: 22.8s
200:	learn: 9331525618.3543968	total: 1.65s	remaining: 23s
300:	learn: 9145962899.9264030	total: 2.59s	remaining: 23.2s
400:	learn: 9006477475.3072128	total: 3.42s	remaining: 22.2s
500:	learn: 8939055665.6176357	total: 4.38s	remaining: 21.9s
600:	learn: 8884272499.0890579	total: 5.27s	remaining: 21s
700:	learn: 8833889367.1339779	total: 6.14s	remaining: 20.1s
800:	learn: 8782153168.2238407	total: 7.02s	remaining: 19.3s
900:	learn: 8739105678.5781441	total: 7.91s	remaining: 18.4s
1000:	learn: 8701525949.3048630	total: 8.81s	remaining: 17.6s
1100:	learn: 8660786663.7086010	total: 9.75s	remaining: 16.8s
1200:	learn: 8625238533.8831081	total: 10.8s	remaining: 16.2s
1300:	learn: 8600414322.8645535	total: 11.7s	remaining: 15.3s
1400:	learn: 8560154245.4460058	total: 12.6s	remaining: 14.4s
1500:	learn: 8516365316.0007772	total: 13.6s	remaining: 13.5s
1600:	learn: 848557648