***

# Environment
- Colab Pro+, GPU: A100, 고용량 RAM (high-RAM runtime)

***

## Library version check

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read in later
# cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install catboost

In [142]:
import sys
import tqdm as tq
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
# Print the interpreter and library versions, one line per entry, between two rulers.
version_report = [
    ("Python version: {}", sys.version),
    ("pandas version: {}", pd.__version__),
    ("numpy version: {}", np.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("tqdm version: {}", tq.__version__),
    ("xgboost version: {}", xgb.__version__),
    ("lightgbm version: {}", lgb.__version__),
    ("catboost version: {}", cat.__version__),
    ("seaborn version: {}", sns.__version__),
    ("scikit-learn version: {}", skl.__version__),
]

print("-------------------------- Python & library version --------------------------")
for template, version in version_report:
    print(template.format(version))
print("------------------------------------------------------------------------------")

-------------------------- Python & library version --------------------------
Python version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
pandas version: 1.5.3
numpy version: 1.23.5
matplotlib version: 3.7.1
tqdm version: 4.66.1
xgboost version: 2.0.2
lightgbm version: 4.1.0
catboost version: 1.2.2
seaborn version: 0.12.2
scikit-learn version: 1.2.2
------------------------------------------------------------------------------


## 0. Load the Libraries

In [143]:
import os
import re
import random
import geopandas as gpd
from shapely.geometry import MultiPolygon
from pyproj import Proj, transform
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / dtype warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

# Fix the random seeds
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

# Evaluation metric
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Computed as the square root of scikit-learn's ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return msle ** 0.5

#시각화 설정
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 15

# 운영체제별 한글 폰트 설정
if os.name == 'posix': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif os.name == 'nt': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


# 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

***

## 1. Load Data (외부데이터(공공데이터 포털), CCTV, 보안등, 주차장, 어린이보호구역)

In [None]:
# Additional Daegu traffic-accident data (external data files 1-4).
# The four files are read with the same column subset and stacked into one frame.
additional_cols = ['사고번호','사고일시','요일','기상상태','시군구','도로형태','노면상태','사고유형','사망자수','중상자수','경상자수','부상신고자수']
additional_paths = [
    f'/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/외부데이터{file_no}_csv.csv'
    for file_no in range(1, 5)
]

additional = pd.concat(
    [pd.read_csv(csv_path, encoding='utf-8')[additional_cols] for csv_path in additional_paths],
    axis=0,
).reset_index(drop=True)

# Align column names with the competition frames: accident number -> ID, reported-injury count -> 부상자수.
additional = additional.rename(columns={'사고번호': 'ID', '부상신고자수': '부상자수'})

# Timestamps are formatted like "2019년 01월 01일 00시".
additional['사고일시'] = pd.to_datetime(additional['사고일시'], format='%Y년 %m월 %d일 %H시')
# Keep only the part of the accident type before the first ' - '.
additional['사고유형'] = additional['사고유형'].str.split(' - ').str[0]

# ECLO target: weighted casualty count (10 / 5 / 3 / 1).
additional['ECLO'] = additional['사망자수'] * 10 + additional['중상자수'] * 5 + additional['경상자수'] * 3 + additional['부상자수'] * 1

additional.head()

In [145]:
# Extract time features from the accident timestamp
accident_time = additional['사고일시']
additional['연'] = accident_time.dt.year
additional['월'] = accident_time.dt.month
additional['일'] = accident_time.dt.day
# "month-day" key (e.g. "1-1"), built with vectorised string ops instead of a row-wise apply;
# it is only needed for the holiday flag, so it is never stored as a column.
monthday = additional['월'].astype(str) + '-' + additional['일'].astype(str)
additional['시간'] = accident_time.dt.hour
additional['weekday'] = accident_time.dt.weekday
additional['weekofyear'] = (accident_time.dt.isocalendar().week).astype(int)

additional['새벽'] = additional['시간'].isin([0,1,2,3,4,5,6]).astype(int)   # hours 0-6
additional['밤'] = additional['시간'].isin([21,22,23]).astype(int)          # hours 21-23
additional['주말'] = additional['weekday'].isin([5,6]).astype(int)          # Saturday / Sunday
additional['주중'] = additional['weekday'].isin([0,1,2,3,4]).astype(int)    # Monday - Friday
# Fixed month-day keys flagged as holidays (list kept exactly as in the original feature).
additional['국가공휴일_상준'] = monthday.isin(['1-1','3-1','5-5','6-6','8-15','10-3','10-9','12-25','12-31']).astype(int)

additional = additional.drop(columns=['사고일시'])

# Extract location: the first three space-separated tokens become city / district / neighbourhood
location_pattern = r'(\S+) (\S+) (\S+)'

additional[['도시', '구', '동']] = additional['시군구'].str.extract(location_pattern)
additional = additional.drop(columns=['시군구'])

# Extract road type: "<part 1> - <part 2>"
road_pattern = r'(.+) - (.+)'

additional[['도로형태1', '도로형태2']] = additional['도로형태'].str.extract(road_pattern)
additional = additional.drop(columns=['도로형태'])

# Interaction feature: accident type combined with the second road-type part
additional['사고유형_도로형태2'] = additional['사고유형'] + '_' + additional['도로형태2']

# Drop rows whose address did not yield all three location parts
additional = additional.dropna(subset=['도시','구','동'])

In [None]:
# Security-light data: clean it, then aggregate to one row per (도시, 구, 동)
light_df = pd.read_csv('/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '위도', '경도', '설치연도', '설치형태', '소재지지번주소']]

# Impute missing coordinates with the column mean, missing install year / type with the mode.
light_df['위도'] = light_df['위도'].fillna(light_df['위도'].mean())
light_df['경도'] = light_df['경도'].fillna(light_df['경도'].mean())
light_df['설치연도'] = light_df['설치연도'].fillna(light_df['설치연도'].mode()[0])
light_df['설치형태'] = light_df['설치형태'].fillna(light_df['설치형태'].mode()[0])

# Split the lot-number address into city / district / neighbourhood / lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)

# Addresses the four-token pattern did not match: retry with three tokens.
location_pattern = r'(\S+) (\S+) (\S+)'
for position, column in enumerate(['도시', '구', '동']):
    missing = light_df[column].isna()
    light_df.loc[missing, column] = light_df.loc[missing, '소재지지번주소'].str.extract(location_pattern).iloc[:, position]

# Still unmatched: retry with two tokens (city / district only).
location_pattern = r'(\S+) (\S+)'
for position, column in enumerate(['도시', '구']):
    missing = light_df[column].isna()
    light_df.loc[missing, column] = light_df.loc[missing, '소재지지번주소'].str.extract(location_pattern).iloc[:, position]
# Rows that still have no neighbourhood get the hard-coded default.
light_df['동'] = light_df['동'].fillna("신암동")

# Remove lot numbers ("12-3") that ended up in the neighbourhood column ...
has_lot_number = light_df['동'].str.contains("-")
light_df.loc[has_lot_number, '동'] = light_df.loc[has_lot_number, '동'].apply(lambda x : re.sub(r'\d+-\d+', '', x))
# ... and runs of two or more digits inside values that contain "동".
has_dong_suffix = light_df['동'].str.contains("동")
light_df.loc[has_dong_suffix, '동'] = light_df.loc[has_dong_suffix, '동'].apply(lambda x : re.sub(r'\d\d+', '', x))
light_df.loc[light_df['동'] == '', '동'] = "신암동"

light_df = light_df.drop(columns=['소재지지번주소', '번지'])

# Group neighbourhood names: map each listed value onto the name it is merged into.
# No target name is also a source name, so a single exact-match replace is equivalent
# to applying the rewrites one by one.
light_dong_aliases = {
    "내당1동": "내당동", "내당2·3동": "내당동", "내당4동": "내당동",
    "대현1동": "대현동", "대현2동": "대현동",
    "도평로131": "도동",
    "만촌1동": "만촌동", "만촌2동": "만촌동", "만촌3동": "만촌동",
    "범물1동": "범물동", "범물2동": "범물동",
    "범어1동": "범어동", "범어2동": "범어동", "범어3동": "범어동", "범어4동": "범어동",
    "복현1동": "복현동",
    "부동길": "부동",
    "비산1동": "비산동", "비산2·3동": "비산동", "비산4동": "비산동", "비산7동": "비산동",
    "산격1동": "산격동", "산격2동": "산격동", "산격3동": "산격동", "산격4동": "산격동",
    "송촌리": "옥포읍",
    "수성1가동": "수성동1가", "수성2.3가동": "수성동2가", "수성4가동": "수성동4가",
    "신암5동": "신암동",
    "옥포면": "옥포읍",
    "원교리": "현풍읍",
    "유가면": "유가읍", "유곡리": "유가읍",
    "입석로": "입석동",
    "지산1동": "지산동", "지산2동": "지산동",
    "침산1동": "침산동", "침산3동": "침산동",
    "태전1동": "태전동", "태전2동": "태전동",
    "팔공로": "봉무동",
    "평리1동": "평리동", "평리2동": "평리동", "평리3동": "평리동",
    "평리4동": "평리동", "평리5동": "평리동", "평리6동": "평리동",
    "한정리": "유가읍",
    "현풍면": "현풍읍",
    "황금1동": "황금동", "황금2동": "황금동",
}
light_df['동'] = light_df['동'].replace(light_dong_aliases)

# Per-neighbourhood features: (output column, source column, groupby reduction).
# Each entry is reduced on its own and the results are joined side by side on the shared
# group index, replacing one groupby + merge per feature.
group_keys = ['도시', '구', '동']
light_feature_specs = [
    ('보안등_설치총개수', '설치개수', 'sum'),
    ('보안등_평균설치개수', '설치개수', 'mean'),
    ('보안등_평균위도', '위도', 'mean'),
    ('보안등_최저위도', '위도', 'min'),
    ('보안등_최고위도', '위도', 'max'),
    ('보안등_평균경도', '경도', 'mean'),
    ('보안등_최저경도', '경도', 'min'),
    ('보안등_최고경도', '경도', 'max'),
    ('보안등_평균설치연도', '설치연도', 'mean'),
    ('보안등_최소설치연도', '설치연도', 'min'),
    ('보안등_최대설치연도', '설치연도', 'max'),
    ('보안등_설치형태종류수', '설치형태', 'nunique'),
]
light_by_dong = light_df.groupby(group_keys)
light_df = pd.concat(
    [
        getattr(light_by_dong[[source]], reducer)().rename(columns={source: feature})
        for feature, source, reducer in light_feature_specs
    ],
    axis=1,
).reset_index()

light_df.head()

In [None]:
# Child protection zone data: clean it, then aggregate to one row per (도시, 구, 동)
child_area_df = pd.read_csv('/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['CCTV설치대수', '위도', '경도','보호구역도로폭','소재지지번주소']]
child_area_df = child_area_df.dropna(subset = ['소재지지번주소'])
child_area_df['cnt'] = 1  # one per zone, summed later into a zone count

# Impute: missing CCTV count -> 0, coordinates -> column mean, road width -> mode.
child_area_df['CCTV설치대수'] = child_area_df['CCTV설치대수'].fillna(0)
child_area_df['위도'] = child_area_df['위도'].fillna(child_area_df['위도'].mean())
child_area_df['경도'] = child_area_df['경도'].fillna(child_area_df['경도'].mean())
child_area_df['보호구역도로폭'] = child_area_df['보호구역도로폭'].fillna(child_area_df['보호구역도로폭'].mode()[0])

# Split the lot-number address into city / district / neighbourhood / lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)

# Addresses the pattern did not match get these hard-coded values.
child_area_df['도시'] = child_area_df['도시'].fillna("대구광역시")
child_area_df['구'] = child_area_df['구'].fillna("서구")
child_area_df['동'] = child_area_df['동'].fillna("원대동3가")

child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

# Road width is either a single number ("6") or a range ("6~8").
# Splitting on "~" gives [value] or [low, ..., high]; first / last element are min / max.
width_bounds = child_area_df['보호구역도로폭'].str.split("~")
child_area_df['도로폭_min'] = width_bounds.str[0].astype(float)
child_area_df['도로폭_max'] = width_bounds.str[-1].astype(float)
child_area_df['도로폭_mean'] = (child_area_df['도로폭_min'] + child_area_df['도로폭_max']) / 2

# Rename neighbourhoods (면 -> 읍)
child_area_df['동'] = child_area_df['동'].replace({"옥포면": "옥포읍", "현풍면": "현풍읍"})

# Per-neighbourhood features: (output column, source column, groupby reduction).
# Each entry is reduced on its own and the results are joined side by side on the shared
# group index, replacing one groupby + merge per feature.
group_keys = ['도시', '구', '동']
child_area_feature_specs = [
    ('어린이보호구역_CCTV총설치대수', 'CCTV설치대수', 'sum'),
    ('어린이보호구역_CCTV평균설치대수', 'CCTV설치대수', 'mean'),
    ('어린이보호구역_CCTV최소설치대수', 'CCTV설치대수', 'min'),
    ('어린이보호구역_CCTV최대설치대수', 'CCTV설치대수', 'max'),
    ('어린이보호구역_최저위도', '위도', 'min'),
    ('어린이보호구역_평균위도', '위도', 'mean'),
    ('어린이보호구역_최고위도', '위도', 'max'),
    ('어린이보호구역_최저경도', '경도', 'min'),
    ('어린이보호구역_평균경도', '경도', 'mean'),
    ('어린이보호구역_최고경도', '경도', 'max'),
    ('어린이보호구역_평균최소도로폭', '도로폭_min', 'mean'),
    ('어린이보호구역_평균도로폭', '도로폭_mean', 'mean'),
    ('어린이보호구역_평균최대도로폭', '도로폭_max', 'mean'),
    ('어린이보호구역_총개수', 'cnt', 'sum'),
]
child_area_by_dong = child_area_df.groupby(group_keys)
child_area_df = pd.concat(
    [
        getattr(child_area_by_dong[[source]], reducer)().rename(columns={source: feature})
        for feature, source, reducer in child_area_feature_specs
    ],
    axis=1,
).reset_index()

child_area_df.head()

In [None]:
# Parking-lot data: clean it, then aggregate to one row per (도시, 구, 동)
parking_df = pd.read_csv('/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/대구 주차장 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소', '주차구획수', '위도', '경도', '급지구분']]
parking_df = parking_df.dropna(subset = ['소재지지번주소'])
# One indicator column per 급지구분 value (급지구분_1, 급지구분_2, 급지구분_3 are used below).
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])
parking_df['cnt'] = 1  # one per parking lot, summed later into a lot count

# Impute missing coordinates with the column mean.
parking_df['위도'] = parking_df['위도'].fillna(parking_df['위도'].mean())
parking_df['경도'] = parking_df['경도'].fillna(parking_df['경도'].mean())

# Split the lot-number address into city / district / neighbourhood / lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)

# Addresses the four-token pattern did not match: retry with three tokens.
location_pattern = r'(\S+) (\S+) (\S+)'

for position, column in enumerate(['도시', '구', '동']):
    missing = parking_df[column].isna()
    parking_df.loc[missing, column] = parking_df.loc[missing, '소재지지번주소'].str.extract(location_pattern).iloc[:, position]

# Remove lot numbers ("12-3") that ended up in the neighbourhood column.
has_lot_number = parking_df['동'].str.contains("-")
parking_df.loc[has_lot_number, '동'] = parking_df.loc[has_lot_number, '동'].apply(lambda x : re.sub(r'\d+-\d+', '', x))

parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

# Rename neighbourhoods (no target name is also a source name, so one exact-match replace suffices)
parking_df['동'] = parking_df['동'].replace({
    "옥포면": "옥포읍",
    "현풍면": "현풍읍",
    "내당4동": "내당동",
    "유곡리": "유가읍",
})

# Per-neighbourhood features: (output column, source column, groupby reduction).
# Each entry is reduced on its own and the results are joined side by side on the shared
# group index, replacing one groupby + merge per feature.
group_keys = ['도시', '구', '동']
parking_feature_specs = [
    ('주차장_평균주차구획수', '주차구획수', 'mean'),
    ('주차장_최소주차구획수', '주차구획수', 'min'),
    ('주차장_최대주차구획수', '주차구획수', 'max'),
    ('주차장_총주차구획수', '주차구획수', 'sum'),
    ('주차장_최저위도', '위도', 'min'),
    ('주차장_평균위도', '위도', 'mean'),
    ('주차장_최고위도', '위도', 'max'),
    ('주차장_최저경도', '경도', 'min'),
    ('주차장_평균경도', '경도', 'mean'),
    ('주차장_최고경도', '경도', 'max'),
    ('주차장_급지구분합계_1', '급지구분_1', 'sum'),
    ('주차장_급지구분합계_2', '급지구분_2', 'sum'),
    ('주차장_급지구분합계_3', '급지구분_3', 'sum'),
    ('주차장_총개수', 'cnt', 'sum'),
]
parking_by_dong = parking_df.groupby(group_keys)
parking_df = pd.concat(
    [
        getattr(parking_by_dong[[source]], reducer)().rename(columns={source: feature})
        for feature, source, reducer in parking_feature_specs
    ],
    axis=1,
).reset_index()

parking_df.head()

In [None]:
# CCTV data: clean it, then aggregate to one row per (도시, 구, 동)
cctv_df = pd.read_csv('/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/대구 CCTV 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소', '단속구분', '제한속도', '위도', '경도', '설치연도']]
cctv_df = cctv_df.dropna(subset = ['소재지지번주소'])
cctv_df['설치연도'] = cctv_df['설치연도'].fillna(cctv_df['설치연도'].mode()[0])
# One indicator column per 단속구분 value (단속구분_1, _2, _4 and _99 are used below).
cctv_df = pd.get_dummies(cctv_df, columns=['단속구분'])
cctv_df['cnt'] = 1  # one per camera, summed later into a camera count

# Split the lot-number address into city / district / neighbourhood / lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

cctv_df[['도시', '구', '동', '번지']] = cctv_df['소재지지번주소'].str.extract(location_pattern)

# Addresses the four-token pattern did not match: retry with three tokens.
location_pattern = r'(\S+) (\S+) (\S+)'

for position, column in enumerate(['도시', '구', '동']):
    missing = cctv_df[column].isna()
    cctv_df.loc[missing, column] = cctv_df.loc[missing, '소재지지번주소'].str.extract(location_pattern).iloc[:, position]

# Remove lot numbers ("12-3") that ended up in the neighbourhood column.
has_lot_number = cctv_df['동'].str.contains("-")
cctv_df.loc[has_lot_number, '동'] = cctv_df.loc[has_lot_number, '동'].apply(lambda x : re.sub(r'\d+-\d+', '', x))

# Normalise the city name; any value other than these two keys becomes NaN.
cctv_df['도시'] = cctv_df['도시'].map({'대구':'대구광역시','대구광역시':'대구광역시'})

# Rows where a 면/읍 name landed in the district column and a 리 name in the
# neighbourhood column: shift them to district 달성군 / neighbourhood 면 or 읍.
cctv_df.loc[cctv_df['구'] == "가창면", "구"] = "달성군"
cctv_df.loc[cctv_df['동'] == "삼산리", "동"] = "가창면"
cctv_df.loc[cctv_df['구'] == "다사읍", "구"] = "달성군"
cctv_df.loc[cctv_df['동'] == "세천리", "동"] = "다사읍"

cctv_df = cctv_df.drop(columns=['소재지지번주소', '번지'])

# Group neighbourhood names: map each listed value onto the name it is merged into.
# No target name is also a source name, so a single exact-match replace is equivalent
# to applying the rewrites one by one.
cctv_dong_aliases = {
    "남리": "논공읍",
    "두류1동": "두류동", "두류2동": "두류동",
    "매곡리": "다사읍",
    "북리": "논공읍",
    "비산2.3동": "비산동", "비산4동": "비산동",
    "신암1동": "신암동", "신암4동": "신암동",
    "용계리": "가창면", "정대리": "가창면",
    "유가면": "유가읍",
    "침산2동": "침산동",
    "평리2동": "평리동",
    "하리": "논공읍",
    "현풍면": "현풍읍",
}
cctv_df['동'] = cctv_df['동'].replace(cctv_dong_aliases)

# Per-neighbourhood features: (output column, source column, groupby reduction).
# Each entry is reduced on its own and the results are joined side by side on the shared
# group index, replacing one groupby + merge per feature.
group_keys = ['도시', '구', '동']
cctv_feature_specs = [
    ('cctv_총단속구분_1', '단속구분_1', 'sum'),
    ('cctv_총단속구분_2', '단속구분_2', 'sum'),
    ('cctv_총단속구분_4', '단속구분_4', 'sum'),
    ('cctv_총단속구분_99', '단속구분_99', 'sum'),
    ('cctv_최소제한속도', '제한속도', 'min'),
    ('cctv_평균제한속도', '제한속도', 'mean'),
    ('cctv_최대제한속도', '제한속도', 'max'),
    ('cctv_중간제한속도', '제한속도', 'median'),
    ('cctv_최저위도', '위도', 'min'),
    ('cctv_평균위도', '위도', 'mean'),
    ('cctv_최고위도', '위도', 'max'),
    ('cctv_최저경도', '경도', 'min'),
    ('cctv_평균경도', '경도', 'mean'),
    ('cctv_최고경도', '경도', 'max'),
    ('cctv_평균설치연도', '설치연도', 'mean'),
    ('cctv_최소설치연도', '설치연도', 'min'),
    ('cctv_최대설치연도', '설치연도', 'max'),
    ('cctv_총개수', 'cnt', 'sum'),
]
cctv_by_dong = cctv_df.groupby(group_keys)
cctv_df = pd.concat(
    [
        getattr(cctv_by_dong[[source]], reducer)().rename(columns={source: feature})
        for feature, source, reducer in cctv_feature_specs
    ],
    axis=1,
).reset_index()

cctv_df.head()

### 1-2. Train, Test, Countrywide

In [150]:
# Load the competition train / test sets and the nationwide accident data, in that order.
train_df, test_df, country_wide = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/train.csv",
        "/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/test.csv",
        "/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/data/countrywide_accident.csv",
    )
)

In [151]:
# ECLO target: weighted casualty count (deaths x10, serious x5, minor x3, reported injuries x1),
# computed the same way for both labelled frames.
for accidents in (train_df, country_wide):
    accidents['ECLO'] = (
        accidents['사망자수'] * 10
        + accidents['중상자수'] * 5
        + accidents['경상자수'] * 3
        + accidents['부상자수'] * 1
    )

***

## 2. Preprocessing

In [152]:
# Drop the columns that are not known at test / inference time.
drop_cols = [
    '사고유형 - 세부분류', '법규위반',
    '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
    '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도',
]
for frame in (train_df, country_wide):
    frame.drop(columns=drop_cols, inplace=True)

In [153]:
# Parse the accident timestamp column into datetime64 on all three frames.
for frame in (train_df, test_df, country_wide):
    frame['사고일시'] = pd.to_datetime(frame['사고일시'])

In [154]:
# "month-day" strings that set the 국가공휴일_상준 flag.
holiday_monthdays = ['1-1','3-1','5-5','6-6','8-15','10-3','10-9','12-25','12-31']

# Derive calendar features from the accident timestamp on every frame (in place).
for df in [train_df, test_df, country_wide]:
    accident_time = df['사고일시'].dt
    df['연'] = accident_time.year
    df['월'] = accident_time.month
    df['일'] = accident_time.day
    # Vectorised "M-D" key; replaces a row-wise df.apply(axis=1) that produced
    # the same strings but ran a Python callback once per row.
    df['monthday'] = df['월'].astype(str) + '-' + df['일'].astype(str)
    df['시간'] = accident_time.hour
    df['weekday'] = accident_time.weekday
    df['weekofyear'] = (accident_time.isocalendar().week).astype(int)
    df['새벽'] = df['시간'].isin([0,1,2,3,4,5,6]).astype(int)      # hours 0-6
    df['밤'] = df['시간'].isin([21,22,23]).astype(int)            # hours 21-23
    df['주말'] = df['weekday'].isin([5,6]).astype(int)            # Sat / Sun
    df['주중'] = df['weekday'].isin([0,1,2,3,4]).astype(int)      # Mon - Fri
    df['국가공휴일_상준'] = df['monthday'].isin(holiday_monthdays).astype(int)


# The timestamp and the helper key are no longer needed once the features exist.
train_df = train_df.drop(columns=['사고일시','monthday'])
test_df = test_df.drop(columns=['사고일시','monthday'])
country_wide = country_wide.drop(columns=['사고일시','monthday'])

In [155]:
# Extract location information: split the '시군구' address string into
# three whitespace-separated tokens stored as '도시', '구' and '동'.
location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_df['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_df['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

country_wide[['도시', '구', '동']] = country_wide['시군구'].str.extract(location_pattern)

# NOTE: location_pattern is rebound here to a two-token pattern; later cells that
# need the three-token form must set it again.
location_pattern = r'(\S+) (\S+)'

# Fallback for country_wide rows the three-token pattern did not match (still NaN):
# first token -> '도시', second token -> '동', and '구' is filled with the literal
# "세종특별자치시" (presumably these rows are all Sejong addresses -- TODO confirm).
country_wide.loc[country_wide['도시'].isna(), '도시'] = country_wide.loc[country_wide['도시'].isna(), '시군구'].str.extract(location_pattern).iloc[:,0]
country_wide.loc[country_wide['구'].isna(), '구'] = "세종특별자치시"
country_wide.loc[country_wide['동'].isna(), '동'] = country_wide.loc[country_wide['동'].isna(), '시군구'].str.extract(location_pattern).iloc[:,1]
country_wide = country_wide.drop(columns=['시군구'])

In [156]:
# Extract road information: split '도로형태' ("<part1> - <part2>") into two columns
# and remove the original column.
road_pattern = r'(.+) - (.+)'
road_cols = ['도로형태1', '도로형태2']

for frame in (train_df, test_df, country_wide):
    # pop() removes the source column in place and hands it back for extraction.
    frame[road_cols] = frame.pop('도로형태').str.extract(road_pattern)

In [157]:
# Interaction feature: accident type joined with the second road-type token.
for frame in (train_df, test_df, country_wide):
    frame['사고유형_도로형태2'] = frame['사고유형'] + '_' + frame['도로형태2']

In [158]:
# Preprocess the additional Daegu data: rewrite category labels via exact-match maps.
surface_map = {"결빙": "서리/결빙", "습기": "젖음/습기"}
dong_map = {
    "논공읍공단출장": "논공읍",
    "다사읍서재출장": "다사읍",
    "옥포면": "옥포읍",
    "유가면": "유가읍",
    "현풍면": "현풍읍",
}
road2_map = {"횡단보도부근": "교차로횡단보도내", "횡단보도상": "교차로횡단보도내"}

additional['노면상태'] = additional['노면상태'].replace(surface_map)
additional['동'] = additional['동'].replace(dong_map)
additional['도로형태2'] = additional['도로형태2'].replace(road2_map)

In [159]:
# Stack the additional Daegu rows on top of the competition train rows and renumber.
# NOTE: not idempotent -- re-running this cell appends `additional` again.
train_df = pd.concat([additional, train_df], axis=0, ignore_index=True)

In [160]:
# Left-join the external tables onto train and test by the (도시, 구, 동) key.
location_keys = ['도시', '구', '동']

for external_df in (light_df, child_area_df, parking_df, cctv_df):
    train_df = train_df.merge(external_df, how='left', on=location_keys)
    test_df = test_df.merge(external_df, how='left', on=location_keys)

In [161]:
# Data set that combines the competition train file with the country_wide data.
# The raw files are re-read so this branch starts from untouched copies.
train_org = pd.read_csv("/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/train.csv")
test_org = pd.read_csv("/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/test.csv")

train_org['ECLO'] = train_org['사망자수'] * 10 + train_org['중상자수'] * 5 + train_org['경상자수'] * 3 + train_org['부상자수'] * 1
train_org.drop(columns = drop_cols, inplace = True)

# Reset to the three-token pattern (the location cell rebinds it to two tokens).
location_pattern = r'(\S+) (\S+) (\S+)'


def _prepare_org_frame(df):
    """Apply the date, location and road-type preprocessing to one raw frame.

    Adds the calendar features derived from '사고일시', splits '시군구' into
    '도시'/'구'/'동' and '도로형태' into '도로형태1'/'도로형태2', adds the
    '사고유형_도로형태2' interaction, and drops the three source columns.
    Feature columns are added to ``df`` in place; the returned frame is the
    one with the source columns removed.
    """
    accident_time = pd.to_datetime(df['사고일시']).dt
    df['연'] = accident_time.year
    df['월'] = accident_time.month
    df['일'] = accident_time.day
    # Vectorised "M-D" key instead of a per-row apply; only used for the flag below,
    # so it is never stored as a column.
    monthday = df['월'].astype(str) + '-' + df['일'].astype(str)
    df['시간'] = accident_time.hour
    df['weekday'] = accident_time.weekday
    df['weekofyear'] = (accident_time.isocalendar().week).astype(int)

    df['새벽'] = df['시간'].isin([0,1,2,3,4,5,6]).astype(int)
    df['밤'] = df['시간'].isin([21,22,23]).astype(int)
    df['주말'] = df['weekday'].isin([5,6]).astype(int)
    df['주중'] = df['weekday'].isin([0,1,2,3,4]).astype(int)
    df['국가공휴일_상준'] = monthday.isin(['1-1','3-1','5-5','6-6','8-15','10-3','10-9','12-25','12-31']).astype(int)
    df = df.drop(columns=['사고일시'])

    df[['도시', '구', '동']] = df['시군구'].str.extract(location_pattern)
    df = df.drop(columns=['시군구'])

    df[['도로형태1', '도로형태2']] = df['도로형태'].str.extract(road_pattern)
    df = df.drop(columns=['도로형태'])

    df['사고유형_도로형태2'] = df['사고유형'] + '_' + df['도로형태2']
    return df


train_org = _prepare_org_frame(train_org)
test_org = _prepare_org_frame(test_org)

# country_wide was preprocessed the same way in the earlier cells.
train_large = pd.concat([train_org,country_wide], axis = 0).reset_index(drop=True)

In [162]:
# Handle 구 / 동 names that overlap between regions: build keys prefixed with the
# parent region(s), then drop the bare '구' and '동' columns.
for frame in (train_large, test_org):
    frame['new_구'] = frame['도시'] + '_' + frame['구']
    frame['new_동'] = frame['도시'] + '_' + frame['구'] + '_' + frame['동']

train_large = train_large.drop(columns=['구', '동'])
test_org = test_org.drop(columns=['구', '동'])

In [163]:
# Mean ECLO per new_구 / new_동, computed on the combined training data.
k1 = train_large.groupby('new_구', as_index=False)['ECLO'].mean()
k2 = train_large.groupby('new_동', as_index=False)['ECLO'].mean()

# Areas whose mean ECLO exceeds 5.
고속도로1 = k1.loc[k1['ECLO'] > 5, 'new_구'].tolist()
고속도로2 = k2.loc[k2['ECLO'] > 5, 'new_동'].tolist()

# 0/1 membership flags on both train and test.
for frame in (train_large, test_org):
    frame['고속도로여부1'] = frame['new_구'].isin(고속도로1).astype(int)
    frame['고속도로여부2'] = frame['new_동'].isin(고속도로2).astype(int)

In [164]:
# Same flags for the Daegu-only data, keyed by the bare '구' / '동' names.
k1 = train_df.groupby('구', as_index=False)['ECLO'].mean()
k2 = train_df.groupby('동', as_index=False)['ECLO'].mean()

# Areas whose mean ECLO exceeds 5.
고속도로1 = k1.loc[k1['ECLO'] > 5, '구'].tolist()
고속도로2 = k2.loc[k2['ECLO'] > 5, '동'].tolist()

for frame in (train_df, test_df):
    frame['고속도로여부1'] = frame['구'].isin(고속도로1).astype(int)
    frame['고속도로여부2'] = frame['동'].isin(고속도로2).astype(int)

In [165]:
# Per-동 casualty totals from the training data, attached to train and test.
dong_total_names = {
    '사망자수': '동사망자수',
    '중상자수': '동중상자수',
    '경상자수': '동경상자수',
    '부상자수': '동부상자수',
}

a1, a2, a3, a4 = [
    train_df.groupby('동')[source_col].sum().reset_index().rename(columns={source_col: total_col})
    for source_col, total_col in dong_total_names.items()
]

for dong_totals in (a1, a2, a3, a4):
    train_df = train_df.merge(dong_totals, how='left', on=['동'])
    test_df = test_df.merge(dong_totals, how='left', on=['동'])

In [166]:
# Per-new_동 casualty totals from the combined training data, attached to
# train_large and test_org.
new_dong_total_names = {
    '사망자수': '동사망자수',
    '중상자수': '동중상자수',
    '경상자수': '동경상자수',
    '부상자수': '동부상자수',
}

e1, e2, e3, e4 = [
    train_large.groupby('new_동')[source_col].sum().reset_index().rename(columns={source_col: total_col})
    for source_col, total_col in new_dong_total_names.items()
]

for new_dong_totals in (e1, e2, e3, e4):
    train_large = train_large.merge(new_dong_totals, how='left', on=['new_동'])
    test_org = test_org.merge(new_dong_totals, how='left', on=['new_동'])

In [167]:
# Model inputs for the Daegu-only branch: the training frame is restricted to the
# columns that also exist in the test frame (minus the ID).
test_x_1 = test_df.drop(columns='ID').copy()
train_x = train_df.loc[:, test_x_1.columns].copy()
train_y = train_df.loc[:, 'ECLO'].copy()

In [168]:
# Encode the categorical (string) columns as integer codes.
str_col = ['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2', '사고유형_도로형태2']

for col in str_col:
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Labels that only occur in the test data are appended to the fitted classes
    # so that transform() does not fail on them.
    unseen_labels = [label for label in np.unique(test_x_1[col]) if label not in le.classes_]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test_x_1[col] = le.transform(test_x_1[col])

In [169]:
# Cyclical (sin / cos) encodings of hour, date and month for train and test.
for frame in (train_x, test_x_1):
    ## Hour: the cycle has 24 steps (0..23). Dividing by 23 made hour 0 and
    ## hour 23 land on the same point of the circle.
    hour_angle = 2 * np.pi * frame['시간'] / 24.0
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

    ## Date: month plus day-of-month fraction, over a 12-month cycle.
    date_angle = 2 * np.pi * (frame['월'] + frame['일'] / 31) / 12
    frame['sin_date'] = -np.sin(date_angle)
    # Fixed: this used -np.sin, which made cos_date an exact copy of sin_date.
    frame['cos_date'] = -np.cos(date_angle)

    ## Month
    month_angle = 2 * np.pi * frame['월'] / 12.0
    frame['sin_month'] = -np.sin(month_angle)
    frame['cos_month'] = -np.cos(month_angle)

In [170]:
# Time feature engineering: covid flag, season and time-of-day bucket.
for frame in (train_x, test_x_1):
    # 1 for accidents in 2020 or later, else 0.
    frame['covid-19'] = (frame['연'] >= 2020).astype(int)

    # Season code: 0 = Mar-May, 1 = Jun-Aug, 2 = Sep-Nov, 3 = Dec-Feb, 4 = anything else.
    month = frame['월']
    frame['season'] = np.select(
        [month.isin([3, 4, 5]), month.isin([6, 7, 8]), month.isin([9, 10, 11]), month.isin([12, 1, 2])],
        [0, 1, 2, 3],
        default=4,
    ).astype(int)

    # Hour bucket: 0 = [0,5), 1 = [5,11), 2 = [11,18), 3 = [18,23], 4 = anything else.
    hour = frame['시간']
    frame['group_time'] = np.select(
        [hour < 5, (hour >= 5) & (hour < 11), (hour >= 11) & (hour < 18), (hour >= 18) & (hour <= 23)],
        [0, 1, 2, 3],
        default=4,
    ).astype(int)

***

## 3. Modeling : XGBOOST & CATBOOST

### 3-1. Only Train (DAEGU)

### FEATURE SELECTION

In [171]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel


X, y = train_x, train_y

# Hold out 20% of the rows; only the training part is used to fit the selector.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost regressor used purely to rank the features.
selector_params = dict(
    max_depth=8,
    learning_rate=0.01,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    min_child_weight=50,
    objective='reg:squaredlogerror',
    eval_metric='rmse',
)
model = XGBRegressor(**selector_params)
model.fit(X_train, y_train)

# Importance table, most important feature first.
feature_importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Keep every feature with non-zero importance, in importance order.
sel_features = feature_importance_df.loc[feature_importance_df['Importance'] > 0, 'Feature']

train_x = train_x[sel_features]
test_x_1 = test_x_1[sel_features]

In [172]:
# Columns CatBoost should treat as categorical in the Daegu-only models.
cat_feature_candidates = ['요일','구' ,'주말', '새벽','사고유형', '도로형태1', '도로형태2','사고유형_도로형태2','group_time', '고속도로여부1', '고속도로여부2']

# Feature selection keeps only columns with non-zero importance, so a candidate may
# no longer exist in train_x; CatBoost fails on unknown cat_features names.
cat_features = [col for col in cat_feature_candidates if col in train_x.columns]
dropped_cat_features = [col for col in cat_feature_candidates if col not in train_x.columns]
if dropped_cat_features:
    print("cat_features removed by feature selection:", dropped_cat_features)

In [173]:
# Run configuration for the Daegu-only training loop.
is_holdout = False   # True -> stop after the first fold of each CV split
iterations = 3000    # maximum boosting rounds for both XGBoost and CatBoost
patience = 100       # early-stopping rounds

In [None]:
# Train XGBoost + CatBoost over several seeds and CV splits (Daegu-only data).
# Every fitted model is kept for test-time averaging.
models_xgb_1 = []
models_lgb_1 = []
models_cat_1 = []
rmsle_scores = []
n_split_list = [10,20]

for i in [0,26,42,100,5000]:          # model seeds
    for split in n_split_list:
        fold_idx = 1
        cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=42)
        for train_index, valid_index in cv.split(train_x,train_y):
            # cv.split yields positional indices, so index both X and y with .iloc.
            X_train, X_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            Y_train, Y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
            # CatBoost is trained on log1p(target); its predictions are expm1'd below.
            log_Y_train, log_Y_valid = np.log1p(Y_train), np.log1p(Y_valid)
            print("="*50)

            # early_stopping_rounds is passed to the constructor and the GPU is
            # selected with device="cuda": fit(early_stopping_rounds=...) and
            # tree_method="gpu_hist" are deprecated in XGBoost 2.0.
            model_xgb = XGBRegressor(n_estimators = iterations, eta = 0.01, min_child_weight = 50,
                        max_depth = 10, colsample_bytree = 0.9,
                        subsample = 0.9, seed = i,
                        objective = "reg:squaredlogerror",
                        eval_metric = rmsle,
                        early_stopping_rounds = patience,
                        tree_method = "hist",
                        device = "cuda")
            model_xgb.fit(X_train, Y_train,
                    eval_set=[(X_valid, Y_valid)],
                    verbose=100)

            model_cat = CatBoostRegressor(iterations = iterations,
                                random_state = i,
                                task_type = "GPU",
                                loss_function = "RMSE",
                                eval_metric = "RMSE",
                                cat_features = cat_features,
                                one_hot_max_size = 6,
                                random_strength = 5,
                                      )
            model_cat.fit(X_train, log_Y_train,
                    eval_set=[(X_valid, log_Y_valid)],
                    early_stopping_rounds=patience,
                    verbose=100)

            pred_xgb = model_xgb.predict(X_valid)
            pred_cat = np.expm1(model_cat.predict(X_valid))
            # NOTE(review): validation blends 0.7 / 0.3 while the test-inference cells
            # blend 0.5 / 0.5 -- confirm which weighting is intended.
            pred = pred_cat*0.7 + pred_xgb*0.3
            score_xgb = np.sqrt(mean_squared_log_error(Y_valid,pred_xgb))
            score_cat = np.sqrt(mean_squared_log_error(Y_valid,pred_cat))
            score_ensemble = np.sqrt(mean_squared_log_error(Y_valid,pred))
            # Label lists exactly the three scores that are printed.
            print(fold_idx,"/",split,"Fold Validation RMSLE score(XGB/CAT/ENSEMBLE) :", score_xgb,"/", score_cat,"/", score_ensemble)
            models_xgb_1.append(model_xgb)
            models_cat_1.append(model_cat)
            rmsle_scores.append(score_ensemble)
            fold_idx += 1
            if is_holdout:
                break
    # Running summary over all folds trained so far (printed once per seed).
    print("Validation : RMSLE scores for each fold:", rmsle_scores)
    print("Validation : RMSLE:", np.mean(rmsle_scores))

In [175]:
# Average the 50/50 CatBoost + XGBoost blend over every trained Daegu-only model pair.
# Iterating over the model lists (instead of a hard-coded range(150)) keeps this
# working when the number of trained models changes, e.g. with is_holdout=True.
if not models_xgb_1 or len(models_xgb_1) != len(models_cat_1):
    raise ValueError("models_xgb_1 and models_cat_1 must be non-empty and of equal length")

preds_1 = []
for xgb_model, cat_model in zip(models_xgb_1, models_cat_1):
    pred_xgb = xgb_model.predict(test_x_1)
    pred_cat = np.expm1(cat_model.predict(test_x_1))   # CatBoost was trained on log1p(ECLO)
    pred = pred_cat*0.5 + pred_xgb*0.5
    preds_1.append(pred)

preds_1 = np.mean(preds_1 , axis = 0)

### 3-2. Train + countrywide

In [176]:
# Model inputs for the train + countrywide branch: the training frame is restricted
# to the columns that also exist in the test frame (minus the ID).
test_x_2 = test_org.drop(columns='ID').copy()
train_x = train_large.loc[:, test_x_2.columns].copy()
train_y = train_large.loc[:, 'ECLO'].copy()

In [None]:
# Encode the categorical (string) columns as integer codes.
str_col = ['요일','기상상태', '노면상태', '사고유형', '도시', 'new_구', 'new_동', '도로형태1', '도로형태2', '사고유형_도로형태2']

for col in str_col:
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Labels that only occur in the test data are appended to the fitted classes
    # so that transform() does not fail on them.
    unseen_labels = [label for label in np.unique(test_x_2[col]) if label not in le.classes_]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test_x_2[col] = le.transform(test_x_2[col])

display(train_x.head())
display(test_x_2.head())

In [178]:
# Cyclical (sin / cos) encodings of hour, date and month for train and test.
for frame in (train_x, test_x_2):
    ## Hour: the cycle has 24 steps (0..23). Dividing by 23 made hour 0 and
    ## hour 23 land on the same point of the circle.
    hour_angle = 2 * np.pi * frame['시간'] / 24.0
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

    ## Date: month plus day-of-month fraction, over a 12-month cycle.
    date_angle = 2 * np.pi * (frame['월'] + frame['일'] / 31) / 12
    frame['sin_date'] = -np.sin(date_angle)
    # Fixed: this used -np.sin, which made cos_date an exact copy of sin_date.
    frame['cos_date'] = -np.cos(date_angle)

    ## Month
    month_angle = 2 * np.pi * frame['월'] / 12.0
    frame['sin_month'] = -np.sin(month_angle)
    frame['cos_month'] = -np.cos(month_angle)

In [179]:
# Time feature engineering: covid flag, season and time-of-day bucket.
for frame in (train_x, test_x_2):
    # 1 for accidents in 2020 or later, else 0.
    frame['covid-19'] = (frame['연'] >= 2020).astype(int)

    # Season code: 0 = Mar-May, 1 = Jun-Aug, 2 = Sep-Nov, 3 = Dec-Feb, 4 = anything else.
    month = frame['월']
    frame['season'] = np.select(
        [month.isin([3, 4, 5]), month.isin([6, 7, 8]), month.isin([9, 10, 11]), month.isin([12, 1, 2])],
        [0, 1, 2, 3],
        default=4,
    ).astype(int)

    # Hour bucket: 0 = [0,5), 1 = [5,11), 2 = [11,18), 3 = [18,23], 4 = anything else.
    hour = frame['시간']
    frame['group_time'] = np.select(
        [hour < 5, (hour >= 5) & (hour < 11), (hour >= 11) & (hour < 18), (hour >= 18) & (hour <= 23)],
        [0, 1, 2, 3],
        default=4,
    ).astype(int)

### FEATURE SELECTION

In [180]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel


X, y = train_x, train_y

# Hold out 20% of the rows; only the training part is used to fit the selector.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost regressor used purely to rank the features.
selector_params = dict(
    max_depth=8,
    learning_rate=0.01,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    min_child_weight=50,
    objective='reg:squaredlogerror',
    eval_metric='rmse',
)
model = XGBRegressor(**selector_params)
model.fit(X_train, y_train)

# Importance table, most important feature first.
feature_importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Keep every feature with non-zero importance, in importance order.
sel_features = feature_importance_df.loc[feature_importance_df['Importance'] > 0, 'Feature']

train_x = train_x[sel_features]
test_x_2 = test_x_2[sel_features]

In [None]:
# Show which columns remain in train_x after feature selection.
train_x.columns

In [182]:
# Columns CatBoost should treat as categorical in the train + countrywide models.
cat_feature_candidates = ['고속도로여부2', '사고유형', '사고유형_도로형태2', '주말', 'weekday', '고속도로여부1','도로형태2', 'new_구','new_동', 'group_time','도시','도로형태1','기상상태','노면상태']

# Feature selection keeps only columns with non-zero importance, so a candidate may
# no longer exist in train_x; CatBoost fails on unknown cat_features names.
cat_features = [col for col in cat_feature_candidates if col in train_x.columns]
dropped_cat_features = [col for col in cat_feature_candidates if col not in train_x.columns]
if dropped_cat_features:
    print("cat_features removed by feature selection:", dropped_cat_features)

***

### MODELING

In [183]:
# Run configuration for the train + countrywide training loop.
is_holdout = False   # True -> stop after the first fold of each CV split
iterations = 30000   # maximum boosting rounds for both XGBoost and CatBoost
patience = 100       # early-stopping rounds

In [None]:
# Train XGBoost + CatBoost over several seeds and CV splits (train + countrywide data).
# Every fitted model is kept for test-time averaging.
models_xgb_2 = []
models_lgb_2 = []
models_cat_2 = []
rmsle_scores = []
n_split_list = [10,20]
for i in [0,26,42,100,5000]:          # model seeds
    for split in n_split_list:
        fold_idx = 1
        cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=42)
        for train_index, valid_index in cv.split(train_x,train_y):
            # cv.split yields positional indices, so index both X and y with .iloc.
            X_train, X_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            Y_train, Y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
            # CatBoost is trained on log1p(target); its predictions are expm1'd below.
            log_Y_train, log_Y_valid = np.log1p(Y_train), np.log1p(Y_valid)
            print("="*50)

            # early_stopping_rounds is passed to the constructor and the GPU is
            # selected with device="cuda": fit(early_stopping_rounds=...) and
            # tree_method="gpu_hist" are deprecated in XGBoost 2.0.
            model_xgb = XGBRegressor(n_estimators = iterations, eta = 0.01, min_child_weight = 50,
                        max_depth = 10, colsample_bytree = 0.9,
                        subsample = 0.9, seed = i,
                        objective = 'reg:squaredlogerror',
                        eval_metric = rmsle,
                        early_stopping_rounds = patience,
                        tree_method = "hist",
                        device = "cuda")

            model_xgb.fit(X_train, Y_train,
                    eval_set=[(X_valid, Y_valid)],
                    verbose=100)

            model_cat = CatBoostRegressor(iterations = iterations,
                                random_state = i,
                                task_type = "GPU",
                                loss_function = "RMSE",
                                eval_metric = "RMSE",
                                cat_features=cat_features,
                                one_hot_max_size=6,
                                random_strength = 10,
                                      )

            model_cat.fit(X_train, log_Y_train,
                    eval_set=[(X_valid, log_Y_valid)],
                    early_stopping_rounds=patience,
                    verbose=100)

            pred_xgb = model_xgb.predict(X_valid)
            pred_cat = np.expm1(model_cat.predict(X_valid))
            pred = pred_cat*0.5 + pred_xgb*0.5
            score_xgb = np.sqrt(mean_squared_log_error(Y_valid,pred_xgb))
            score_cat = np.sqrt(mean_squared_log_error(Y_valid,pred_cat))
            score_ensemble = np.sqrt(mean_squared_log_error(Y_valid,pred))
            # Label lists exactly the three scores that are printed.
            print(fold_idx,"/",split,"Fold Validation RMSLE score(XGB/CAT/ENSEMBLE) :", score_xgb,"/", score_cat,"/", score_ensemble)
            models_xgb_2.append(model_xgb)
            models_cat_2.append(model_cat)
            rmsle_scores.append(score_ensemble)
            fold_idx += 1
            if is_holdout:
                break
print("Validation : RMSLE scores for each fold:", rmsle_scores)
print("Validation : RMSLE:", np.mean(rmsle_scores))

***

## 4. Test Inference

In [185]:
def _blend_test_predictions(xgb_models, cat_models, features):
    """Return the mean 50/50 CatBoost + XGBoost prediction over all model pairs.

    CatBoost models were trained on log1p(ECLO), so their output is expm1'd before
    blending. Iterating over the model lists (instead of a hard-coded range(150))
    keeps inference working when the number of trained models changes.

    Raises:
        ValueError: if the two model lists are empty or differ in length.
    """
    if not xgb_models or len(xgb_models) != len(cat_models):
        raise ValueError("model lists must be non-empty and of equal length")
    blended = []
    for xgb_model, cat_model in zip(xgb_models, cat_models):
        pred_xgb = xgb_model.predict(features)
        pred_cat = np.expm1(cat_model.predict(features))
        blended.append(pred_cat*0.5 + pred_xgb*0.5)
    return np.mean(blended, axis = 0)


# Daegu-only models
preds_1 = _blend_test_predictions(models_xgb_1, models_cat_1, test_x_1)

# Train + countrywide models
preds_2 = _blend_test_predictions(models_xgb_2, models_cat_2, test_x_2)

In [186]:
# Final blend: 70% Daegu-only models (preds_1) + 30% train + countrywide models (preds_2).
# Only the ID and the blended ECLO are kept for the submission file.
sample_submission = (
    pd.read_csv("/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/sample_submission.csv")
    .assign(ECLO_1=preds_1, ECLO_2=preds_2)
    .assign(ECLO=lambda sub: sub["ECLO_1"] * 0.7 + sub["ECLO_2"] * 0.3)
    .loc[:, ['ID', 'ECLO']]
)

In [190]:
# Write the final submission (ID, ECLO) to Drive without the pandas index column.
sample_submission.to_csv("/content/drive/MyDrive/대구_교통사고_피해_예측_AI경진대회/open/COLAB_FINAL.csv", index=False)

***