In [1]:
#  데이터 처리 모듈
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 행렬 등 통계 연산 모듈
import numpy as np
from scipy.stats import iqr
# 지수형 표기법 e를 연속형 변환
pd.options.display.float_format = '{:.4f}'.format
# 타입 어노테이션(Any, Sequence 등의 메서드 활용)
from typing import *
import os

# 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
import os

# 조합 찾는 반복문 메서드
from itertools import permutations
from itertools import combinations

# 데이터 시각화
import matplotlib.pyplot as plt
# from matplotlib.pyplot import
import matplotlib.font_manager as fm
import missingno as msno
import seaborn as sns

# pd.set_option('display.max_seq_items',5)

Mounted at /content/drive


In [2]:
class LoadGoogleDriveData():
    def __init__(self, data = None):
    self.data = data

    def loadData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame():
    self.data = pd.read_csv(os.path.join(file_path + file_name_extension),
                            index_col = columnTF,
                            sep = ",",
                            na_values = "NaN",
                            encoding = unicode)
      return self.data

    def loadTxTData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame():
    self.data = pd.read_csv(os.path.join(file_path + file_name_extension),
                            index_col = columnTF,
                            sep = "|",
                            na_values = "NaN",
                            encoding = unicode)
      return self.data

    def loadExcelData(self, file_path: str, file_name_extension,
               columnTF: bool) -> pd.DataFrame():
    self.data = pd.read_excel(os.path.join(file_path + file_name_extension),
                              index_col = columnTF)
      return self.data

  # 용량이 큰 csv 파일 읽어오기(fopen - fread와 유사한 방식)
    def loadDataWithChunking(self, file_path: str, file_name_extension,
                           chunking_row_num: int, columnTF: bool, unicode: str) -> pd.DataFrame():
    chunkdata = pd.read_csv(os.path.join(file_path + file_name_extension),
                            chunksize = chunking_row_num,
                            index_col = columnTF,
                            sep = ",",
                            na_values = "NaN",
                            encoding = unicode)
    self.data = list(chunkdata)
    self.data = pd.concat(self.data)

      return self.data

In [3]:
mountInstance = LoadGoogleDriveData()

In [4]:
KCD_MARKET_CONTEST_JOIN = mountInstance.loadData(
    file_path = "/content/drive/MyDrive/2023BigContest/data/", file_name_extension = "KCD_MARKET_CONTEST_JOIN.csv",
    unicode = "utf-8-sig", columnTF = False)

KCD_SEASON_PATTERN = mountInstance.loadData(
    file_path = "/content/drive/MyDrive/2023BigContest/data/", file_name_extension = "KCD_SEASON_PATTERN.csv",
    unicode = "utf-8-sig", columnTF = False)

CONTEST_2022 = mountInstance.loadDataWithChunking(
    file_path = "/content/drive/MyDrive/2023BigContest/data/contest data/", file_name_extension = "필지단위 소상공인 매출등급 정보.csv",
    unicode = "cp949", chunking_row_num = 10**5, columnTF = False)

CONTEST_2023 = pd.read_excel('/content/drive/MyDrive/2023BigContest/data/contest data/소상공인 매출등급 예상 대상 필지.xlsx')

CONTEST = mountInstance.loadDataWithChunking(
    file_path = "/content/drive/MyDrive/2023BigContest/data/", file_name_extension = "CONTEST_2022_YLABEL.csv",
    unicode = "utf-8-sig", chunking_row_num = 10**5, columnTF = False)

### 1. 이상치 처리

In [5]:
# 필요없는 칼럼 삭제 및 칼럼 정리
KCD_MARKET_CONTEST_JOIN = KCD_MARKET_CONTEST_JOIN.drop(['상권_식별자','상권명','외식업종_중분류','상권_코드_명','상권_코드_2022','상권_코드_2023'],axis=1)
KCD_MARKET_CONTEST_JOIN['기준_년_코드'] = KCD_MARKET_CONTEST_JOIN['기준일자'].str[:4].astype('int64')
KCD_MARKET_CONTEST_JOIN['기준_분기_코드'] = KCD_MARKET_CONTEST_JOIN['기준일자'].str[4:5].astype('int64')
KCD_MARKET_CONTEST_JOIN = KCD_MARKET_CONTEST_JOIN.drop(['기준일자'],axis=1)

In [6]:
# 이상치 처리 함수
def preprocess_outlier(dataframe):
    for column in dataframe.columns:
        column_data = dataframe[column]

        # 1사분위수와 3사분위수 계산
        q1 = column_data.quantile(0.25)
        q3 = column_data.quantile(0.75)

        # IQR 계산
        iqr = q3 - q1

        # 이상치 경계값 계산
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # 이상치를 NaN으로 대체
        dataframe[column] = dataframe[column].apply(
            lambda x: x if lower_bound <= x <= upper_bound else None)

    return dataframe

In [7]:
preprocess_outlier(KCD_MARKET_CONTEST_JOIN)

Unnamed: 0,상권코드,STDG_EMD_CD,배달매출액_변동계수,주말배달매출액_변동계수,손익분기점매출액_변동계수,사업장방문고객수_평균,정규고용인원_평균,주말카드매출액_변동계수,매입액_변동계수,사업장임대면적_변동계수,...,목요일_생활인구_수,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,운영_영업_개월_평균,폐업_영업_개월_평균,서울_운영_영업_개월_평균,서울_폐업_영업_개월_평균,기준_년_코드,기준_분기_코드
0,2120098.0000,11440127.0000,6.6481,7.9662,1.6407,17.8384,2.4877,3.9457,2.6117,1.2383,...,56139.0000,52455.0000,27161.0000,24683.0000,0.0000,0.0000,0.0000,0.0000,2022,1
1,2120098.0000,11440127.0000,,,0.9338,12.4025,1.6792,3.0789,0.9359,1.2653,...,56139.0000,52455.0000,27161.0000,24683.0000,0.0000,0.0000,0.0000,0.0000,2022,1
2,2120234.0000,11710107.0000,5.8697,6.1537,1.2814,14.1810,6.4333,4.2607,1.5821,1.4835,...,214831.0000,215912.0000,195928.0000,180363.0000,0.0000,0.0000,0.0000,0.0000,2022,1
3,2120186.0000,11680107.0000,7.3501,7.3485,2.5290,10.4700,3.1739,,2.5521,1.5978,...,300683.0000,308841.0000,274353.0000,235313.0000,0.0000,0.0000,0.0000,0.0000,2022,1
4,2120186.0000,11680107.0000,7.1350,7.1350,1.1558,19.9781,3.1739,1.3306,1.9152,1.8144,...,300683.0000,308841.0000,274353.0000,235313.0000,0.0000,0.0000,0.0000,0.0000,2022,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3359,,11380110.0000,,,,,,,,,...,,,,,,,,,2023,1
3360,,11380101.0000,,,,,,,,,...,,,,,,,,,2023,1
3361,,11200109.0000,,,,,,,,,...,,,,,,,,,2023,1
3362,,11170132.0000,,,,,,,,,...,,,,,,,,,2023,1


### 2. 결측치 처리

In [8]:
# 결측치 처리할 칼럼 리스트
num_col = KCD_MARKET_CONTEST_JOIN.columns.tolist()
num_col.remove('기준_년_코드')
num_col.remove('기준_분기_코드')

# KNNImputer로 결측치 대체
imputer = KNNImputer(n_neighbors=5)
KCD_MARKET_CONTEST_JOIN[num_col] = imputer.fit_transform(KCD_MARKET_CONTEST_JOIN[num_col])

KCD_MARKET_CONTEST_JOIN[KCD_MARKET_CONTEST_JOIN.isna().any(axis=1)]

Unnamed: 0,상권코드,STDG_EMD_CD,배달매출액_변동계수,주말배달매출액_변동계수,손익분기점매출액_변동계수,사업장방문고객수_평균,정규고용인원_평균,주말카드매출액_변동계수,매입액_변동계수,사업장임대면적_변동계수,...,목요일_생활인구_수,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,운영_영업_개월_평균,폐업_영업_개월_평균,서울_운영_영업_개월_평균,서울_폐업_영업_개월_평균,기준_년_코드,기준_분기_코드


### 3. 분기 데이터 => 월별 데이터 확장

In [9]:
# 분기별 상권데이터를 소호 신용데이터를 활용하여 월별로 확장(계절성 지수)
## 기준_분기_코드 칼럼의 값을 변경하여 월 칼럼 생성
KCD_MARKET_CONTEST_JOIN['월'] = KCD_MARKET_CONTEST_JOIN['기준_분기_코드'].apply(lambda x: '1 2 3' if x == 1 else ('4 5 6' if x == 2 else ('7 8 9' if x == 3 else ('10 11 12' if x == 4 else x))))

## '월' 칼럼의 값을 공백을 기준으로 분리
KCD_MARKET_CONTEST_JOIN['월'] = KCD_MARKET_CONTEST_JOIN['월'].str.split()

## '월' 칼럼을 분리하기 위해 데이터프레임을 재구성
tmp = pd.DataFrame({'기준_월_코드': np.concatenate(KCD_MARKET_CONTEST_JOIN['월'].values)})

## 데이터를 인덱스 기준으로 확장
EXPAND_MON = KCD_MARKET_CONTEST_JOIN.loc[KCD_MARKET_CONTEST_JOIN.index.repeat(3)].reset_index(drop=True)
EXPAND_MON = EXPAND_MON.drop(columns = '월')

## 확장된 데이터 병합
KCD_MARKET_CONTEST_JOIN_MON = pd.concat([EXPAND_MON,tmp],axis=1)

## DATA_CRTR_YM 칼럼 생성(기준년도+기준월)
KCD_MARKET_CONTEST_JOIN_MON['기준_월_코드'] = KCD_MARKET_CONTEST_JOIN_MON['기준_월_코드'].apply(lambda x : '0' + x if len(x) == 1 else x)
KCD_MARKET_CONTEST_JOIN_MON['기준_월_코드'] = KCD_MARKET_CONTEST_JOIN_MON['기준_월_코드'].astype('int64')
KCD_MARKET_CONTEST_JOIN_MON['DATA_CRTR_YM'] = KCD_MARKET_CONTEST_JOIN_MON['기준_년_코드'].astype('str') + '0' + KCD_MARKET_CONTEST_JOIN_MON['기준_월_코드'].astype('str')
KCD_MARKET_CONTEST_JOIN_MON['DATA_CRTR_YM'] = KCD_MARKET_CONTEST_JOIN_MON['DATA_CRTR_YM'].astype('int64')
KCD_MARKET_CONTEST_JOIN_MON = KCD_MARKET_CONTEST_JOIN_MON.drop(['기준_분기_코드'],axis=1)
# KCD_MARKET_CONTEST_JOIN_MON.to_csv('KCD_MARKET_CONTEST_JOIN_MON.csv',encoding='utf-8-sig',index=False)
KCD_MARKET_CONTEST_JOIN_MON.head(3)

Unnamed: 0,상권코드,STDG_EMD_CD,배달매출액_변동계수,주말배달매출액_변동계수,손익분기점매출액_변동계수,사업장방문고객수_평균,정규고용인원_평균,주말카드매출액_변동계수,매입액_변동계수,사업장임대면적_변동계수,...,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,운영_영업_개월_평균,폐업_영업_개월_평균,서울_운영_영업_개월_평균,서울_폐업_영업_개월_평균,기준_년_코드,기준_월_코드,DATA_CRTR_YM
0,2120098.0,11440127.0,6.6481,7.9662,1.6407,17.8384,2.4877,3.9457,2.6117,1.2383,...,52455.0,27161.0,24683.0,0.0,0.0,0.0,0.0,2022,1,202201
1,2120098.0,11440127.0,6.6481,7.9662,1.6407,17.8384,2.4877,3.9457,2.6117,1.2383,...,52455.0,27161.0,24683.0,0.0,0.0,0.0,0.0,2022,2,202202
2,2120098.0,11440127.0,6.6481,7.9662,1.6407,17.8384,2.4877,3.9457,2.6117,1.2383,...,52455.0,27161.0,24683.0,0.0,0.0,0.0,0.0,2022,3,202203


In [10]:
## 분모가 0으로 되는 문제를 해결해주기 위해서 부동소수점을 더해줍니다.
col_li = KCD_MARKET_CONTEST_JOIN_MON.columns
col_li = [item for item in col_li if item not in ['상권코드','STDG_EMD_CD','기준_년_코드', '기준_월_코드', 'DATA_CRTR_YM']]

floating_point = 0.0001
KCD_MARKET_CONTEST_JOIN_MON[col_li] = KCD_MARKET_CONTEST_JOIN_MON[col_li] + floating_point
corr = KCD_MARKET_CONTEST_JOIN_MON.corr()
corr = corr.fillna(0)

pd.set_option('display.max_rows',None)
soho_col_name = KCD_SEASON_PATTERN.columns.tolist()
corr_market_soho = corr.loc[:,~corr.columns.isin(soho_col_name)]
corr_market_soho_matrix = corr_market_soho.iloc[2:25,3:]

## 상권_신용 데이터와 계절성 지수의 상관계수가 0.15~0.7인 경우 매핑

data = []
for feature in corr_market_soho_matrix.columns:
    mask = (0.15 <= abs(corr_market_soho_matrix[feature])) & (abs(corr_market_soho_matrix[feature]) <= 0.7)
    data.append({'feature':feature,'season':corr_market_soho_matrix.loc[mask,feature].index.tolist()})

FEATURE_SEASON_MAP = pd.DataFrame(data)

In [11]:
# 빈 데이터프레임을 생성하여 결과를 저장할 변수를 초기화합니다.
pca_dataframe = pd.DataFrame()

# FEATURE_SEASON_MAP을 순회하며 주성분 분석을 수행합니다.
for idx in range(len(FEATURE_SEASON_MAP['season'])):
    try:
        col = FEATURE_SEASON_MAP['season'][idx]

        # 주성분 분석 전에 표준화 (평균 0, 표준 편차 1의 표준 정규 분포로 스케일링)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(KCD_SEASON_PATTERN[col])

        # 주성분 분석
        pca = PCA(n_components=1)
        pca_data = pca.fit_transform(scaled_data)

        # PCA화된 데이터 칼럼명을 'pca_col'로 명명
        col_name = FEATURE_SEASON_MAP['feature'][idx]
        pca_df = pd.DataFrame(pca_data, columns=[col_name])

        # 결과를 pca_dataframe에 추가합니다.
        pca_dataframe = pd.concat([pca_dataframe, pca_df], axis=1)
    except Exception as e:
        # 예외가 발생한 경우 빈 열을 추가합니다.
        col_name = FEATURE_SEASON_MAP['feature'][idx]
        pca_dataframe[col_name] = np.nan

In [12]:
col_to_drop = ['기준_년_코드', '기준_월_코드']
pca_dataframe.drop(columns=col_to_drop,inplace=True)
pca_dataframe['DATA_CRTR_YM'] = [202201,202202,202203,202204,202205,202206,202207,202208,202209,202210,202211,202212,202301,202302,202303]
pca_dataframe = pca_dataframe.dropna(axis=1)
# pca_dataframe.to_csv('pca_dataframe.csv',encoding='utf-8-sig',index=False)
pca_dataframe = pca_dataframe.abs()
pca_dataframe.head(3)

Unnamed: 0,월_평균_소득_금액,소득_구간_코드,지출_총금액,식료품_지출_총금액,의류_신발_지출_총금액,생활용품_지출_총금액,의료비_지출_총금액,교통_지출_총금액,여가_지출_총금액,문화_지출_총금액,...,시간대_5_생활인구_수,시간대_6_생활인구_수,월요일_생활인구_수,화요일_생활인구_수,수요일_생활인구_수,목요일_생활인구_수,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,DATA_CRTR_YM
0,1.4644,1.458,0.7188,0.7188,0.61,0.739,0.739,1.6785,0.5654,0.642,...,2.7838,1.7562,2.7838,2.7838,2.7838,2.7838,2.7838,2.7838,2.7838,202201
1,2.3749,2.3749,1.98,1.98,1.965,1.7661,1.7661,1.9835,1.6963,0.8364,...,2.1658,1.0949,2.1658,2.1658,2.1658,2.1658,2.1658,2.1658,2.1658,202202
2,1.3421,1.2565,1.0572,1.0572,1.1772,0.9614,0.9614,1.216,0.8338,0.147,...,0.8963,0.3066,0.8963,0.8963,0.8963,0.8963,0.8963,0.8963,0.8963,202203


In [None]:
# KCD, MARKET, CONTEST 데이터에 계절성 지수를 곱했습니다.
tmp = pd.merge(KCD_MARKET_CONTEST_JOIN_MON,pca_dataframe,on=['DATA_CRTR_YM'],how='left')

tmp['월_평균_소득_금액'] = tmp['월_평균_소득_금액_x'] * tmp['월_평균_소득_금액_y']
tmp['소득_구간_코드'] = tmp['소득_구간_코드_x'] * tmp['소득_구간_코드_y']
tmp['지출_총금액'] = tmp['지출_총금액_x'] * tmp['지출_총금액_y']
tmp['식료품_지출_총금액'] = tmp['식료품_지출_총금액_x'] * tmp['식료품_지출_총금액_y']
tmp['의류_신발_지출_총금액'] = tmp['의류_신발_지출_총금액_x'] * tmp['의류_신발_지출_총금액_y']
tmp['교통_지출_총금액'] = tmp['교통_지출_총금액_x'] * tmp['교통_지출_총금액_y']
tmp['여가_지출_총금액'] = tmp['여가_지출_총금액_x'] * tmp['여가_지출_총금액_y']
tmp['문화_지출_총금액'] = tmp['문화_지출_총금액_x'] * tmp['문화_지출_총금액_y']
tmp['교육_지출_총금액'] = tmp['교육_지출_총금액_x'] * tmp['교육_지출_총금액_y']
tmp['유흥_지출_총금액'] = tmp['유흥_지출_총금액_x'] * tmp['유흥_지출_총금액_y']
tmp['아파트_단지_수'] = tmp['아파트_단지_수_x'] * tmp['아파트_단지_수_y']
tmp['아파트_가격_1_억_미만_세대_수'] = tmp['아파트_가격_1_억_미만_세대_수_x'] * tmp['아파트_가격_1_억_미만_세대_수_y']
tmp['아파트_가격_1_억_세대_수'] = tmp['아파트_가격_1_억_세대_수_x'] * tmp['아파트_가격_1_억_세대_수_y']
tmp['아파트_가격_2_억_세대_수'] = tmp['아파트_가격_2_억_세대_수_x'] * tmp['아파트_가격_2_억_세대_수_y']
tmp['아파트_가격_4_억_세대_수'] = tmp['아파트_가격_4_억_세대_수_x'] * tmp['아파트_가격_4_억_세대_수_y']
tmp['아파트_가격_5_억_세대_수'] = tmp['아파트_가격_5_억_세대_수_x'] * tmp['아파트_가격_5_억_세대_수_y']
tmp['점포_수'] = tmp['점포_수_x'] * tmp['점포_수_y']
tmp['유사_업종_점포_수'] = tmp['유사_업종_점포_수_x'] * tmp['유사_업종_점포_수_y']
tmp['개업_점포_수'] = tmp['개업_점포_수_x'] * tmp['개업_점포_수_y']
tmp['폐업_점포_수'] = tmp['폐업_점포_수_x'] * tmp['폐업_점포_수_y']
tmp['프랜차이즈_점포_수'] = tmp['프랜차이즈_점포_수_x'] * tmp['프랜차이즈_점포_수_y']
tmp['총_생활인구_수'] = tmp['총_생활인구_수_x'] * tmp['총_생활인구_수_y']
tmp['남성_생활인구_수'] = tmp['남성_생활인구_수_x'] * tmp['남성_생활인구_수_y']
tmp['여성_생활인구_수'] = tmp['여성_생활인구_수_x'] * tmp['여성_생활인구_수_y']
tmp['연령대_10_생활인구_수'] = tmp['연령대_10_생활인구_수_x'] * tmp['연령대_10_생활인구_수_y']
tmp['연령대_20_생활인구_수'] = tmp['연령대_20_생활인구_수_x'] * tmp['연령대_20_생활인구_수_y']
tmp['연령대_30_생활인구_수'] = tmp['연령대_30_생활인구_수_x'] * tmp['연령대_30_생활인구_수_y']
tmp['연령대_40_생활인구_수'] = tmp['연령대_40_생활인구_수_x'] * tmp['연령대_40_생활인구_수_y']
tmp['연령대_50_생활인구_수'] = tmp['연령대_50_생활인구_수_x'] * tmp['연령대_50_생활인구_수_y']
tmp['연령대_60_이상_생활인구_수'] = tmp['연령대_60_이상_생활인구_수_x'] * tmp['연령대_60_이상_생활인구_수_y']
tmp['시간대_1_생활인구_수'] = tmp['시간대_1_생활인구_수_x'] * tmp['시간대_1_생활인구_수_y']
tmp['시간대_2_생활인구_수'] = tmp['시간대_2_생활인구_수_x'] * tmp['시간대_2_생활인구_수_y']
tmp['시간대_3_생활인구_수'] = tmp['시간대_3_생활인구_수_x'] * tmp['시간대_3_생활인구_수_y']
tmp['시간대_4_생활인구_수'] = tmp['시간대_4_생활인구_수_x'] * tmp['시간대_4_생활인구_수_y']
tmp['시간대_5_생활인구_수'] = tmp['시간대_5_생활인구_수_x'] * tmp['시간대_5_생활인구_수_y']
tmp['시간대_6_생활인구_수'] = tmp['시간대_6_생활인구_수_x'] * tmp['시간대_6_생활인구_수_y']
tmp['월요일_생활인구_수'] = tmp['월요일_생활인구_수_x'] * tmp['월요일_생활인구_수_y']
tmp['화요일_생활인구_수'] = tmp['화요일_생활인구_수_x'] * tmp['화요일_생활인구_수_y']
tmp['수요일_생활인구_수'] = tmp['수요일_생활인구_수_x'] * tmp['수요일_생활인구_수_y']
tmp['목요일_생활인구_수'] = tmp['목요일_생활인구_수_x'] * tmp['목요일_생활인구_수_y']
tmp['금요일_생활인구_수'] = tmp['금요일_생활인구_수_x'] * tmp['금요일_생활인구_수_y']
tmp['토요일_생활인구_수'] = tmp['토요일_생활인구_수_x'] * tmp['토요일_생활인구_수_y']
tmp['일요일_생활인구_수'] = tmp['일요일_생활인구_수_x'] * tmp['일요일_생활인구_수_y']

# 필요없는 칼럼 삭제
drop_col = ['월_평균_소득_금액_x', '소득_구간_코드_x', '지출_총금액_x', '식료품_지출_총금액_x','의류_신발_지출_총금액_x', '생활용품_지출_총금액_x', '의료비_지출_총금액_x',
            '교통_지출_총금액_x','여가_지출_총금액_x', '문화_지출_총금액_x', '교육_지출_총금액_x', '유흥_지출_총금액_x','아파트_단지_수_x', '아파트_가격_1_억_미만_세대_수_x',
            '아파트_가격_1_억_세대_수_x','아파트_가격_2_억_세대_수_x','아파트_가격_4_억_세대_수_x', '아파트_가격_5_억_세대_수_x','점포_수_x', '유사_업종_점포_수_x',
            '개업_점포_수_x','프랜차이즈_점포_수_x','총_생활인구_수_x', '남성_생활인구_수_x','여성_생활인구_수_x', '연령대_10_생활인구_수_x','연령대_20_생활인구_수_x',
            '연령대_30_생활인구_수_x','연령대_40_생활인구_수_x', '연령대_50_생활인구_수_x', '연령대_60_이상_생활인구_수_x','시간대_1_생활인구_수_x', '시간대_2_생활인구_수_x',
            '시간대_3_생활인구_수_x', '시간대_4_생활인구_수_x','시간대_5_생활인구_수_x', '시간대_6_생활인구_수_x', '월요일_생활인구_수_x', '화요일_생활인구_수_x',
            '수요일_생활인구_수_x', '목요일_생활인구_수_x', '금요일_생활인구_수_x', '토요일_생활인구_수_x','일요일_생활인구_수_x']
tmp.drop(tmp.columns[102:147], axis=1, inplace = True)
tmp.drop(columns = drop_col, inplace=True)
KCD_MARKET_CONTEST_JOIN_MON_SEASON = tmp
# KCD_MARKET_CONTEST_JOIN_MON_SEASON.to_csv('KCD_MARKET_CONTEST_JOIN_MON_SEASON.csv',encoding='utf-8-sig',index=False)

### 4. 필지고유번호 기준 label데이터 결합

In [14]:
# 데이터 연도별 분리
KCD_MARKET_CONTEST_JOIN_MON_SEASON['STDG_EMD_CD']= KCD_MARKET_CONTEST_JOIN_MON_SEASON['STDG_EMD_CD'].astype(str).str[:8]
KCD_MARKET_CONTEST_JOIN_MON_SEASON['STDG_EMD_CD'] = KCD_MARKET_CONTEST_JOIN_MON_SEASON['STDG_EMD_CD'].astype('int64')

# 연도별 분리
KCD_MARKET_CONTEST_2022 = KCD_MARKET_CONTEST_JOIN_MON_SEASON[KCD_MARKET_CONTEST_JOIN_MON_SEASON['기준_년_코드'] == 2022]
KCD_MARKET_CONTEST_2023 = KCD_MARKET_CONTEST_JOIN_MON_SEASON[KCD_MARKET_CONTEST_JOIN_MON_SEASON['기준_년_코드'] == 2023]

# label 데이터 결합
CONTEST = CONTEST.rename(columns = {'법정동코드':'STDG_EMD_CD'})
KCD_MARKET_CONTEST_2022 = pd.merge(KCD_MARKET_CONTEST_2022,CONTEST,right_on = "STDG_EMD_CD",left_on = "STDG_EMD_CD",how = 'right')
KCD_MARKET_CONTEST_2022 = KCD_MARKET_CONTEST_2022.dropna()
KCD_MARKET_CONTEST_2022 = KCD_MARKET_CONTEST_2022.drop(columns = ['업종코드'])

print(KCD_MARKET_CONTEST_2022['DATA_CRTR_YM'].unique())
print(KCD_MARKET_CONTEST_2023['DATA_CRTR_YM'].unique())

[202201. 202202. 202203. 202204. 202205. 202206. 202207. 202208. 202209.]
[202301 202302 202303]


In [15]:
# 2022년 데이터 병합
CONTEST_2022 = CONTEST_2022.drop(columns = ['SLS_GRD'])
KCD_MARKET_CONTEST_SEASON_2022_TOTAL = pd.merge(KCD_MARKET_CONTEST_2022, CONTEST_2022, right_index=True, left_index=True, how = 'left')
KCD_MARKET_CONTEST_SEASON_2022_TOTAL = KCD_MARKET_CONTEST_SEASON_2022_TOTAL.drop(columns = ['STDG_EMD_CD_x','DATA_CRTR_YM_x'])
KCD_MARKET_CONTEST_SEASON_2022_TOTAL = KCD_MARKET_CONTEST_SEASON_2022_TOTAL.rename(columns = {'STDG_EMD_CD_y':'STDG_EMD_CD','DATA_CRTR_YM_y':'DATA_CRTR_YM'})

# 2023년 데이터 병합
KCD_MARKET_CONTEST_SEASON_2023_TOTAL = pd.merge(KCD_MARKET_CONTEST_2023, CONTEST_2023, right_index=True, left_index=True, how = 'right')
KCD_MARKET_CONTEST_SEASON_2023_TOTAL = KCD_MARKET_CONTEST_SEASON_2023_TOTAL.drop(columns = 'STDG_EMD_CD_x')
KCD_MARKET_CONTEST_SEASON_2023_TOTAL = KCD_MARKET_CONTEST_SEASON_2023_TOTAL.rename(columns = {'STDG_EMD_CD_y':'STDG_EMD_CD'})
KCD_MARKET_CONTEST_SEASON_2023_TOTAL = KCD_MARKET_CONTEST_SEASON_2023_TOTAL.fillna(0)

# KCD_MARKET_CONTEST_SEASON_2022_TOTAL.to_csv('KCD_MARKET_CONTEST_SEASON_2022_TOTAL.csv',encoding='utf-8-sig',index=False)
# KCD_MARKET_CONTEST_SEASON_2023_TOTAL.to_csv('KCD_MARKET_CONTEST_SEASON_2023_TOTAL.csv',encoding='utf-8-sig',index=False)

In [16]:
KCD_MARKET_CONTEST_SEASON_2022_TOTAL.head(3)

Unnamed: 0,상권코드,배달매출액_변동계수,주말배달매출액_변동계수,손익분기점매출액_변동계수,사업장방문고객수_평균,정규고용인원_평균,주말카드매출액_변동계수,매입액_변동계수,사업장임대면적_변동계수,부가가치세_변동계수,...,수요일_생활인구_수,목요일_생활인구_수,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,매출등급x로짓추정확률,DATA_CRTR_YM,LT_UNQ_NO,STDG_EMD_CD,INDUSTRY_CD
0,2110008.0,5.9366,5.9046,1.6103,17.7064,1.3426,1.8547,2.7658,2.1284,-2.2768,...,444558.0746,445109.259,437643.2155,447144.1874,449735.8676,2,202201,1111010100100030100,11110101,A03
1,2110008.0,5.9366,5.9046,1.6103,17.7064,1.3426,1.8547,2.7658,2.1284,-2.2768,...,345866.8018,346295.624,340487.0318,347878.8011,349895.132,2,202201,1111010100100480000,11110101,A01
2,2110008.0,5.9366,5.9046,1.6103,17.7064,1.3426,1.8547,2.7658,2.1284,-2.2768,...,143141.7468,143319.2208,140915.2547,143974.4404,144808.9268,2,202201,1111010100100590001,11110101,A11


In [17]:
KCD_MARKET_CONTEST_SEASON_2023_TOTAL.head(3)

Unnamed: 0,상권코드,배달매출액_변동계수,주말배달매출액_변동계수,손익분기점매출액_변동계수,사업장방문고객수_평균,정규고용인원_평균,주말카드매출액_변동계수,매입액_변동계수,사업장임대면적_변동계수,부가가치세_변동계수,...,수요일_생활인구_수,목요일_생활인구_수,금요일_생활인구_수,토요일_생활인구_수,일요일_생활인구_수,LT_UNQ_NO,INDUSTRY_CD,STDG_EMD_CD,SLS_GRD_2301,SLS_GRD_2302
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1168010800102090000,A01,11680108,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1135010300106330015,A05,11350103,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1154510200109830004,A02,11545102,0.0,0.0
