In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import catboost as cat
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [None]:
# Load the raw accident tables.
train_df = pd.read_csv('/kaggle/input/daegunew/train.csv')
test_df = pd.read_csv('/kaggle/input/daegunew/test.csv')

# Parse the accident datetime and derive calendar parts; the seven weekday
# flag columns are created as zeros here and filled in by a later cell.
for df in (train_df, test_df):
    df['timestamp'] = pd.to_datetime(df['사고일시'])
    stamp = df['timestamp'].dt
    for part in ('year', 'month', 'day', 'weekday', 'hour'):
        df[part] = getattr(stamp, part)
    for flag in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'):
        df[flag] = 0
    

def get_season(month):
    """Map a month number to a season code.

    Returns 2 for March-May, 3 for June-August, 4 for September-November,
    and 1 for every other value (December-February and anything out of range).
    """
    season_by_month = {
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
        9: 4, 10: 4, 11: 4,
    }
    return season_by_month.get(month, 1)

def get_quarter_time(hour):
    """Map an hour of the day to a six-hour bucket.

    Returns 1 for hours 0-5, 2 for 6-11, 3 for 12-17, and 4 for anything else
    (18-23 as well as values outside 0-23).
    """
    buckets = (
        (1, (0, 1, 2, 3, 4, 5)),
        (2, (6, 7, 8, 9, 10, 11)),
        (3, (12, 13, 14, 15, 16, 17)),
    )
    for code, hours in buckets:
        if hour in hours:
            return code
    return 4

In [None]:
# Derive season, time-of-day bucket and one-hot weekday flags.
# Each frame is derived from its OWN month/hour columns: the previous version
# filled test_df['season'] and test_df['quarter_time'] from train_df, which
# aligned training rows onto test rows by index.
for df in (train_df, test_df):
    df['season'] = df['month'].apply(get_season)
    df['quarter_time'] = df['hour'].apply(get_quarter_time)

    # dt.weekday numbers Monday as 0 ... Sunday as 6. A vectorised comparison
    # replaces the row-by-row chained assignment (df['mon'][i] = 1), which is
    # slow and is not guaranteed to write through to the frame.
    for day_index, day_flag in enumerate(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']):
        df[day_flag] = (df['weekday'] == day_index).astype('int64')

In [None]:
!pip install pytimekr

from pytimekr import pytimekr

# Years for which holidays are needed: the original 2019-2021 range plus any
# other year that actually occurs in the data, so no row is left without a
# holiday calendar.
holiday_years = set(range(2019, 2022))
for frame in (train_df, test_df):
    holiday_years.update(int(year) for year in frame['timestamp'].dt.year.dropna().unique())

# Collect the public holidays of every year as 'YYYY-MM-DD' strings.
holiday_list = []
for year in sorted(holiday_years):
    holiday_list.extend(pytimekr.holidays(year = year))
holiday_list = [day.strftime('%Y-%m-%d') for day in holiday_list]
print(holiday_list)

# Compare on the calendar date only. The previous version tested full
# Timestamp objects (which carry the hour) against date strings, so the
# membership test never matched and the flag was always 0.
train_df['holiday'] = train_df['timestamp'].dt.strftime('%Y-%m-%d').isin(holiday_list).astype('int64')
test_df['holiday'] = test_df['timestamp'].dt.strftime('%Y-%m-%d').isin(holiday_list).astype('int64')

In [None]:
# Three whitespace-separated tokens of the address: city, district (gu), neighbourhood (dong).
location_pattern = r'(\S+) (\S+) (\S+)'
# Road type is written as "<major> - <minor>".
road_pattern = r'(.+) - (.+)'

for frame in (train_df, test_df):
    frame[['도시', '구', '동']] = frame['시군구'].str.extract(location_pattern)
    frame[['도로형태1', '도로형태2']] = frame['도로형태'].str.extract(road_pattern)

In [None]:
# Address layout in the auxiliary tables: city, district, neighbourhood, lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'


def _sum_by_district(frame):
    """Split the lot-number address and sum every remaining column per (city, gu, dong)."""
    located = frame.copy()
    located[['도시', '구', '동', '번지']] = located['소재지지번주소'].str.extract(location_pattern)
    trimmed = located.drop(columns=['소재지지번주소', '번지'])
    return trimmed.groupby(['도시', '구', '동']).sum().reset_index()


# CCTV: one-hot the enforcement type, then count per neighbourhood.
cctv_raw = pd.read_csv('/kaggle/input/daegunew/CCTV.csv', encoding='cp949')[['소재지지번주소', '단속구분']]
cctv_df = _sum_by_district(pd.get_dummies(cctv_raw, columns=['단속구분']))

# Security lamps: total installed count per neighbourhood.
light_raw = pd.read_csv('/kaggle/input/daegunew/security_lamp.csv', encoding='cp949')[['설치개수', '소재지지번주소']]
light_df = _sum_by_district(light_raw)

# Child protection areas: de-duplicate, then count rows per neighbourhood.
child_raw = pd.read_csv('/kaggle/input/daegunew/child_protect_area.csv', encoding='cp949').drop_duplicates()[['소재지지번주소']]
child_area_df = _sum_by_district(child_raw.assign(cnt=1))

# Parking lots: one-hot the grade, then count per neighbourhood.
parking_raw = pd.read_csv('/kaggle/input/daegunew/parking_lot.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
parking_df = _sum_by_district(pd.get_dummies(parking_raw, columns=['급지구분']))

In [None]:
# Inspect the four aggregated auxiliary tables.
print(cctv_df, light_df, child_area_df, parking_df, sep='\n')

In [None]:
# Left-join every per-neighbourhood table onto both accident frames.
district_keys = ['도시', '구', '동']
for aux_df in (cctv_df, light_df, child_area_df, parking_df):
    train_df = pd.merge(train_df, aux_df, how='left', on=district_keys)
    test_df = pd.merge(test_df, aux_df, how='left', on=district_keys)

In [None]:
# The submission ID is not a feature.
test_df.drop(columns=['ID'], inplace = True)
# Keep the complete training frame (all target columns included) for later use.
train_df_copy = train_df.copy()
# Restrict training features to the columns available at test time.
# .copy() makes this an independent frame, so adding ECLO below is a plain
# column assignment rather than a write onto a slice of the original.
train_df = train_df[test_df.columns].copy()
train_df['ECLO'] = train_df_copy['ECLO']

In [None]:
# Raw columns that have already been expanded into derived features.
raw_columns = ['도로형태', '사고일시', '요일', '시군구', 'timestamp', 'weekday']
for frame in (train_df, test_df):
    frame.drop(columns=raw_columns, inplace=True)

In [None]:
# List the feature columns that remain in the test frame.
print(test_df.columns)

In [None]:
# Replace every missing value with 0 in place (for example neighbourhoods
# that had no match in the left-joined auxiliary tables).
for frame in (train_df, test_df):
    frame.fillna(0, inplace=True)

In [None]:
# Inspect both frames after filling missing values.
print(train_df, test_df, sep='\n')

In [None]:
# Inspect the full copy of the training frame taken before its columns were trimmed.
print(train_df_copy)

In [None]:
# Remove the target from the feature frame in place; train_df_copy still holds ECLO.
train_df.drop(columns = 'ECLO', inplace = True)

In [None]:
# Columns treated as categorical by the models.
cat_list = ['기상상태', '노면상태', '사고유형', 'year', 'month', 'day', 
            'hour', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun', 'season', 
            'quarter_time', 'holiday', '도시', '구', '동', '도로형태1', '도로형태2']

for frame in (train_df, test_df):
    for col in cat_list:
        frame[col] = frame[col].astype('category')

In [None]:
# Count columns from the auxiliary tables; they are cast to int64 below.
float_list = ['단속구분_1', '단속구분_2', '단속구분_4', '단속구분_99', '설치개수',
              'cnt', '급지구분_1', '급지구분_2', '급지구분_3']

for frame in (train_df, test_df):
    for col in float_list:
        frame[col] = frame[col].astype('int64')

In [None]:
# Alias, not a copy: train_x refers to the same DataFrame object as train_df.
train_x = train_df

In [None]:
categorical_features = ['기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']
encoders = {}
for feature in tqdm(categorical_features, desc="Encoding features"):
    # Compare and encode on string values for both frames so that a filled 0
    # and the string '0' are treated as the same category.
    train_values = train_df[feature].astype(str)
    test_values = test_df[feature].astype(str)

    # Fit once on the training categories plus the '-1' sentinel for unseen
    # test categories. The previous version encoded train first and inserted
    # '-1' into classes_ afterwards, which shifted the test codes relative to
    # the train codes for the same category.
    le = LabelEncoder()
    le.fit(list(train_values.unique()) + ['-1'])
    known_classes = set(le.classes_)

    train_df[feature] = le.transform(train_values)
    test_df[feature] = le.transform(test_values.where(test_values.isin(known_classes), '-1'))
    encoders[feature] = le

In [None]:
# Display the dtype of every training column after encoding.
train_df.dtypes

In [None]:
# ECLO was already dropped from train_df by an earlier cell, so reading it
# from train_df raised KeyError on a top-to-bottom run. Take the target from
# the untouched copy instead, and tolerate its absence in the feature frame.
train_y = train_df_copy['ECLO']
train_x = train_df.drop(columns = 'ECLO', errors = 'ignore')

In [None]:
# Model each ECLO component separately, then combine the predictions.
# Targets 1-4 are the deaths, serious-injury, minor-injury and injury-report
# counts; target 5 is ECLO itself.
train_y_1, train_y_2, train_y_3, train_y_4, train_y_5 = (
    train_df_copy[target_col]
    for target_col in ('사망자수', '중상자수', '경상자수', '부상자수', 'ECLO')
)

In [None]:
# All five names are aliases of the same train_x object (no copies are made).
train_x_1 = train_x_2 = train_x_3 = train_x_4 = train_x_5 = train_x

In [None]:
# All five names are aliases of the same test_df object (no copies are made).
test_1 = test_2 = test_3 = test_4 = test_5 = test_df

In [None]:
# Inspect the five target series.
print(train_y_1, train_y_2, train_y_3, train_y_4, train_y_5, sep='\n')

In [None]:
# Persist the ECLO target to train5.csv (the index is written too, as no index argument is passed).
train_y_5.to_csv('train5.csv')

In [None]:
from category_encoders.target_encoder import TargetEncoder
categorical_features = ['기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']


def _target_encode(train_frame, test_frame, target, columns):
    """Target-encode `columns` against `target` and return new (train, test) frames.

    Both inputs are copied first. The incoming train_x_k / test_k names all
    refer to one shared DataFrame each, so encoding in place made every later
    target re-encode the output of the previous one and also overwrote
    train_x / test_df. Working on copies keeps the five feature sets independent.

    NOTE(review): the encoder is fitted on all training rows before any
    validation split, so hold-out scores computed later see target information.
    """
    encoded_train = train_frame.copy()
    encoded_test = test_frame.copy()
    for column in columns:
        encoder = TargetEncoder(cols=[column])
        encoded_train[column] = encoder.fit_transform(encoded_train[column], target)
        encoded_test[column] = encoder.transform(encoded_test[column])
    return encoded_train, encoded_test


train_x_1, test_1 = _target_encode(train_x_1, test_1, train_y_1, categorical_features)
train_x_2, test_2 = _target_encode(train_x_2, test_2, train_y_2, categorical_features)
train_x_3, test_3 = _target_encode(train_x_3, test_3, train_y_3, categorical_features)
train_x_4, test_4 = _target_encode(train_x_4, test_4, train_y_4, categorical_features)
train_x_5, test_5 = _target_encode(train_x_5, test_5, train_y_5, categorical_features)

In [None]:
# Inspect the first encoded train/test pair.
print(train_x_1, test_1, sep='\n')

In [None]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [None]:
import warnings
# Suppress every warning category from here on (no category filter is given).
warnings.filterwarnings(action='ignore')

In [None]:
def objective_1(trial: Trial) -> float:
    """Optuna objective for the deaths-count target (train_y_1).

    Samples CatBoost hyper-parameters, fits on a fixed 80/20 split of
    (train_x_1, train_y_1) and returns the hold-out RMSLE, with negative
    predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "task_type" : "GPU",
        "border_count" : trial.suggest_int("border_count", 128, 254),
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x_1, train_y_1, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study_1 = optuna.create_study(study_name="train_y_1(사망자수)", direction="minimize", sampler=sampler)
study_1.optimize(objective_1, n_trials=50)
print("Best Score:", study_1.best_value)
print("Best trial:", study_1.best_trial.params)

In [None]:
def objective_2(trial: Trial) -> float:
    """Optuna objective for the serious-injury count target (train_y_2).

    Samples CatBoost hyper-parameters, fits on a fixed 80/20 split of
    (train_x_2, train_y_2) and returns the hold-out RMSLE, with negative
    predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "task_type" : "GPU",
        "border_count" : trial.suggest_int("border_count", 128, 254),
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x_2, train_y_2, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study_2 = optuna.create_study(study_name="train_y_2(중상자수)", direction="minimize", sampler=sampler)
study_2.optimize(objective_2, n_trials=50)
print("Best Score:", study_2.best_value)
print("Best trial:", study_2.best_trial.params)

In [None]:
def objective_3(trial: Trial) -> float:
    """Optuna objective for the minor-injury count target (train_y_3).

    Samples CatBoost hyper-parameters, fits on a fixed 80/20 split of
    (train_x_3, train_y_3) and returns the hold-out RMSLE, with negative
    predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "task_type" : "GPU",
        "border_count" : trial.suggest_int("border_count", 128, 254),
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x_3, train_y_3, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study_3 = optuna.create_study(study_name="train_y_3(경상자수)", direction="minimize", sampler=sampler)
study_3.optimize(objective_3, n_trials=50)
print("Best Score:", study_3.best_value)
print("Best trial:", study_3.best_trial.params)

In [None]:
def objective_4(trial: Trial) -> float:
    """Optuna objective for the injury-report count target (train_y_4).

    Samples CatBoost hyper-parameters, fits on a fixed 80/20 split of
    (train_x_4, train_y_4) and returns the hold-out RMSLE, with negative
    predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "task_type" : "GPU",
        "border_count" : trial.suggest_int("border_count", 128, 254),
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x_4, train_y_4, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study_4 = optuna.create_study(study_name="train_y_4(부상자수)", direction="minimize", sampler=sampler)
study_4.optimize(objective_4, n_trials=50)
print("Best Score:", study_4.best_value)
print("Best trial:", study_4.best_trial.params)

In [None]:
def objective_5(trial: Trial) -> float:
    """Optuna objective for the ECLO target itself (train_y_5).

    Samples CatBoost hyper-parameters, fits on a fixed 80/20 split of
    (train_x_5, train_y_5) and returns the hold-out RMSLE, with negative
    predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "task_type" : "GPU",
        "border_count" : trial.suggest_int("border_count", 128, 254),
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x_5, train_y_5, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study_5 = optuna.create_study(study_name="train_y_5(ECLO)", direction="minimize", sampler=sampler)
study_5.optimize(objective_5, n_trials=50)
print("Best Score:", study_5.best_value)
print("Best trial:", study_5.best_trial.params)

In [None]:
def _fit_and_score(features, target, params, seed=777):
    """Fit a CatBoost regressor on an 80/20 split and print its hold-out RMSLE.

    Args:
        features: feature frame for one target.
        target: matching target series.
        params: keyword arguments for cat.CatBoostRegressor.
        seed: random_state of the train/validation split.

    Returns:
        The fitted model. It is trained on the 80% split only, not refitted on all rows.
    """
    X_tr, X_test, Y_tr, Y_test = train_test_split(features, target, test_size=0.2, random_state=seed)
    fitted = cat.CatBoostRegressor(**params)
    fitted.fit(X_tr, Y_tr)
    # RMSLE is undefined for negative predictions, so clip at 0 before scoring.
    val = np.clip(fitted.predict(X_test), 0, None)
    score = np.sqrt(mean_squared_log_error(Y_test, val))
    print('score : ',score)
    return fitted


seed = 777
# Settings shared by all five models; the rest are the hard-coded tuned values per target.
common_params = {"task_type" : "GPU", "verbose" : 1, "objective": "RMSE"}

model_1 = _fit_and_score(train_x_1, train_y_1, {
    **common_params,
    "n_estimators" : 205,
    "border_count" : 155,
    "max_depth": 3,
    'learning_rate': 0.14960400849197492,
    "reg_lambda" : 36.439918761886915,
    "bagging_temperature" :5.8515626924780175,
}, seed)

model_2 = _fit_and_score(train_x_2, train_y_2, {
    **common_params,
    "n_estimators" : 779,
    "border_count" : 249,
    "max_depth": 8,
    'learning_rate': 0.04184889794706989,
    "reg_lambda" : 71.21396329756737,
    "bagging_temperature" :7.742247886903844,
}, seed)

model_3 = _fit_and_score(train_x_3, train_y_3, {
    **common_params,
    "n_estimators" : 509,
    "border_count" : 162,
    "max_depth": 9,
    'learning_rate': 0.045409691925147315,
    "reg_lambda" : 98.56534103829885,
    "bagging_temperature" :2.0232604251079365,
}, seed)

model_4 = _fit_and_score(train_x_4, train_y_4, {
    **common_params,
    "n_estimators" : 130,
    "border_count" : 147,
    "max_depth": 4,
    'learning_rate': 0.07717177979888168,
    "reg_lambda" : 41.638725460142986,
    "bagging_temperature" :6.455965696389759,
}, seed)

model_5 = _fit_and_score(train_x_5, train_y_5, {
    **common_params,
    "n_estimators" : 476,
    "border_count" : 210,
    "max_depth": 2,
    'learning_rate': 0.1725754225910719,
    "reg_lambda" : 82.48558827109139,
    "bagging_temperature" :5.834113119465238,
}, seed)

In [None]:
# Predict every target with its own model on its own encoded test frame.
pred_1, pred_2, pred_3, pred_4, pred_5 = (
    fitted.predict(features)
    for fitted, features in zip(
        (model_1, model_2, model_3, model_4, model_5),
        (test_1, test_2, test_3, test_4, test_5),
    )
)

In [None]:
answer = pd.DataFrame()
# Clip negative predictions to 0, matching the post-processing used when the
# models were scored on validation data; counts and ECLO cannot be negative
# and the RMSLE metric is undefined for negative values.
pred_1 = pd.DataFrame(pred_1).clip(lower=0)
pred_2 = pd.DataFrame(pred_2).clip(lower=0)
pred_3 = pd.DataFrame(pred_3).clip(lower=0)
pred_4 = pd.DataFrame(pred_4).clip(lower=0)
pred_5 = pd.DataFrame(pred_5).clip(lower=0)
# ECLO = 10 * deaths + 5 * serious + 3 * minor + 1 * reported injuries.
# Each wrapper frame has a single column labelled 0; select it so that a
# Series, not a DataFrame, is assigned to the column.
answer['ECLO'] = (pred_1 * 10 + pred_2 * 5 + pred_3 * 3 + pred_4 * 1)[0]

In [None]:
# Submission built from the separately predicted components.
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['ECLO'])
print(sample_submission)
sample_submission.to_csv('catboost_each_optuna.csv', index=False)

In [None]:
# Submission from the model that predicts ECLO directly.
answer_2 = pd.DataFrame()
answer_2['ECLO'] = pred_5
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer_2['ECLO'])
print(sample_submission)
sample_submission.to_csv('catboost_ECLO_optuna.csv', index=False)

In [None]:
# Equal-weight blend of the component-based and the direct ECLO predictions.
blended = answer['ECLO'] * 0.5 + answer_2['ECLO'] * 0.5
answer_3 = blended.to_frame('ECLO')
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer_3['ECLO'])
print(sample_submission)
sample_submission.to_csv('catboost_mean_optuna.csv', index=False)

-----------------------------------------------------------------------------------

In [None]:
def objective(trial: Trial) -> float:
    """Optuna objective for a CatBoost model on ECLO using native categorical features.

    Samples hyper-parameters (including the boosting type), fits on a fixed
    80/20 split of (train_x, train_y) and returns the hold-out RMSLE, with
    negative predictions clipped to 0 first because the log error is undefined for them.
    """
    params_cat = {
        'learning_rate' : trial.suggest_float('learning_rate', 1e-7, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "reg_lambda" : trial.suggest_float("reg_lambda", 0, 100),
        "bagging_temperature" : trial.suggest_float("bagging_temperature", 0, 10),
        "boosting_type" : trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "cat_features" : ['기상상태', '노면상태', '사고유형', 'year', 'month', 'day', 
                          'hour', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun', 'season', 
                          'quarter_time', 'holiday', '도시', '구', '동', '도로형태1', '도로형태2'],
        "task_type" : "GPU",
        "border_count" : 254,
        "verbose" : 0,
        "objective": "RMSE",
    }

    seed = 777
    X_tr, X_test, Y_tr, Y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=seed)

    model = cat.CatBoostRegressor(**params_cat)
    model.fit(X_tr, Y_tr)

    val_pred = np.clip(model.predict(X_test), 0, None)
    return np.sqrt(mean_squared_log_error(Y_test, val_pred))

# The seed must be passed by keyword: TPESampler's first positional parameter
# is consider_prior, so TPESampler(777) left the sampler unseeded.
sampler = TPESampler(seed=777)
study = optuna.create_study(study_name="train_y", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=100)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

In [None]:
seed = 777
X_tr, X_test, Y_tr, Y_test = train_test_split(train_x,train_y, test_size=0.2, random_state=seed)

# Hard-coded hyper-parameters for the native-categorical CatBoost model.
params = {"n_estimators" : 1000,
          'learning_rate' : 0.49103397753448996,
          "task_type" : "GPU",
          'max_depth' : 15,
          'reg_lambda' : 45.82681861634982,
          'bagging_temperature' : 1.3666714592614848,
          "border_count" : 254,
          "cat_features" : ['기상상태', '노면상태', '사고유형', 'year', 'month', 'day', 
                          'hour', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun', 'season', 
                          'quarter_time', 'holiday', '도시', '구', '동', '도로형태1', '도로형태2'],
          "verbose" : 1,
          "objective": "RMSE",
          'boosting_type': 'Plain'
         }

model = cat.CatBoostRegressor(**params)
model.fit(X_tr, Y_tr)
# Clip negatives before scoring: mean_squared_log_error rejects negative
# values, and the tuning objective scored clipped predictions as well.
val = np.clip(model.predict(X_test), 0, None)
score = mean_squared_log_error(Y_test, val)
rmse = np.sqrt(score)
print('score : ',rmse)

In [None]:
# Clip at 0 so the submitted values get the same post-processing that the
# tuning objective applied before scoring (RMSLE is undefined for negatives).
pred = np.clip(model.predict(test_df), 0, None)

In [None]:
# Wrap the predictions in a one-column frame.
answer = pd.DataFrame({'ECLO': pred})

In [None]:
# Inspect the prediction frame.
print(answer)

In [None]:
# Round predictions to whole numbers.
answer = answer.round(decimals=0)
print(answer)

In [None]:
# Fill the submission template with the rounded predictions.
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['ECLO'])
print(sample_submission)

In [None]:
# Write the submission without the index column.
sample_submission.to_csv('catboost_ECLO_round_optuna.csv', index = False)

-------------------------------------------------------------------

In [None]:
# Use AutoML; check the train_y distribution, then apply MinMax scaling and label encoding
!pip install mljar-supervised

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from supervised.automl import AutoML

In [None]:
import warnings
# Hide FutureWarning messages only; other warning categories are not covered by this filter.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def _minmax_scale(train_frame, test_frame):
    """Fit a MinMaxScaler on the training frame and transform both frames.

    Returns (scaler, scaled_train_array, scaled_test_array).
    """
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train_frame)
    scaled_test = scaler.transform(test_frame)
    return scaler, scaled_train, scaled_test


# Scale every feature set with its own scaler, then rebuild DataFrames with the original column names.
scaler_1, train_scaled_1, test_scaled_1 = _minmax_scale(train_x_1, test_1)
train_x_1 = pd.DataFrame(train_scaled_1, columns=train_x_1.columns)
test_1 = pd.DataFrame(test_scaled_1, columns=test_1.columns)

scaler_2, train_scaled_2, test_scaled_2 = _minmax_scale(train_x_2, test_2)
train_x_2 = pd.DataFrame(train_scaled_2, columns=train_x_2.columns)
test_2 = pd.DataFrame(test_scaled_2, columns=test_2.columns)

scaler_3, train_scaled_3, test_scaled_3 = _minmax_scale(train_x_3, test_3)
train_x_3 = pd.DataFrame(train_scaled_3, columns=train_x_3.columns)
test_3 = pd.DataFrame(test_scaled_3, columns=test_3.columns)

scaler_4, train_scaled_4, test_scaled_4 = _minmax_scale(train_x_4, test_4)
train_x_4 = pd.DataFrame(train_scaled_4, columns=train_x_4.columns)
test_4 = pd.DataFrame(test_scaled_4, columns=test_4.columns)

scaler_5, train_scaled_5, test_scaled_5 = _minmax_scale(train_x_5, test_5)
train_x_5 = pd.DataFrame(train_scaled_5, columns=train_x_5.columns)
test_5 = pd.DataFrame(test_scaled_5, columns=test_5.columns)

In [None]:
# Inspect the first scaled train/test pair.
print(train_x_1, test_1, sep='\n')

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Five shuffled folds; the fixed random_state keeps fold membership the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def _new_automl():
    """Create one unfitted AutoML regressor with the settings shared by every fold."""
    return AutoML(
        algorithms=["CatBoost", "Xgboost", "LightGBM"],
        mode="Compete",
        ml_task="regression",
        random_state=42,
        total_time_limit=None,
    )


# One independent AutoML instance per fold; the individual names are kept
# because later cells refer to them.
automl_1, automl_2, automl_3, automl_4, automl_5 = (_new_automl() for _ in range(5))
automl_list = [automl_1, automl_2, automl_3, automl_4, automl_5]

ensemble_predictions = []
scores = []

# Pair each fold with its own AutoML instance instead of keeping a manual counter.
fold_jobs = zip(automl_list, kf.split(train_x_5))
for automl, (train_idx, val_idx) in tqdm(fold_jobs, total=len(automl_list), desc="Processing folds"):
    # KFold yields positional indices, so select rows with .iloc on both the
    # features and the target (plain [] on a Series looks up by label).
    X_t, X_val = train_x_5.iloc[train_idx], train_x_5.iloc[val_idx]
    y_t, y_val = train_y_5.iloc[train_idx], train_y_5.iloc[val_idx]

    automl.fit(X_t, y_t)
    # Clip negatives: RMSLE is undefined for them.
    val_pred = np.clip(automl.predict(X_val), 0, None)
    score = np.sqrt(mean_squared_log_error(y_val, val_pred))
    scores.append(score)

    automl_pred = np.clip(automl.predict(test_5), 0, None)
    ensemble_predictions.append(automl_pred)

# Average the test predictions of the fold models.
final_predictions = np.mean(ensemble_predictions, axis=0)

# Print the validation metric of each fold and its mean over all folds.
print("Validation : RMSLE scores for each fold:", scores)
print("Validation : RMSLE:", np.mean(scores))

In [None]:
# Submission from the fold-averaged AutoML predictions of ECLO.
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=final_predictions)
print(sample_submission)
sample_submission.to_csv('AutoML_ECLO_Kfold_5_ECLO.csv', index=False)

In [None]:
# Store the raw fold-averaged predictions for later blending.
answer = pd.DataFrame({'answer': final_predictions})
answer.to_csv('5_Kfold.csv', index=False)

In [None]:
# AutoML with the wider algorithm set, fitted on all rows for the deaths-count target.
automl_11_settings = {
    "algorithms": ["CatBoost", "Xgboost", "LightGBM", "Random Forest", "Neural Network"],
    "mode": "Compete",
    "ml_task": "regression",
    "random_state": 42,
    "total_time_limit": None,
}
automl_11 = AutoML(**automl_11_settings)

automl_11.fit(train_x_1, train_y_1)

In [None]:
# Predict with automl_11, the model fitted on the deaths-count target
# (train_x_1, train_y_1). The previous code called automl_1, which is the
# first K-fold model and was trained on ECLO, so 1.csv did not contain
# deaths-count predictions.
pred_1 = automl_11.predict(test_1)
answer_1 = pd.DataFrame()
answer_1['answer'] = pred_1
answer_1.to_csv('1.csv', index = False)

In [None]:
# Full-data AutoML model for the serious-injury count; predictions go to 2.csv.
automl_2_settings = {
    "algorithms": ["CatBoost", "Xgboost", "LightGBM", "Random Forest", "Neural Network"],
    "mode": "Compete",
    "ml_task": "regression",
    "random_state": 42,
    "total_time_limit": None,
}
automl_2 = AutoML(**automl_2_settings)
automl_2.fit(train_x_2, train_y_2)

pred_2 = automl_2.predict(test_2)
answer_2 = pd.DataFrame({'answer': pred_2})
answer_2.to_csv('2.csv', index=False)

In [None]:
# Full-data AutoML model for the minor-injury count; predictions go to 3.csv.
automl_3_settings = {
    "algorithms": ["CatBoost", "Xgboost", "LightGBM", "Random Forest", "Neural Network"],
    "mode": "Compete",
    "ml_task": "regression",
    "random_state": 42,
    "total_time_limit": None,
}
automl_3 = AutoML(**automl_3_settings)
automl_3.fit(train_x_3, train_y_3)

pred_3 = automl_3.predict(test_3)
answer_3 = pd.DataFrame({'answer': pred_3})
answer_3.to_csv('3.csv', index=False)

In [None]:
# Full-data AutoML model for the injury-report count; predictions go to 4.csv.
automl_4_settings = {
    "algorithms": ["CatBoost", "Xgboost", "LightGBM", "Random Forest", "Neural Network"],
    "mode": "Compete",
    "ml_task": "regression",
    "random_state": 42,
    "total_time_limit": None,
}
automl_4 = AutoML(**automl_4_settings)
automl_4.fit(train_x_4, train_y_4)

pred_4 = automl_4.predict(test_4)
answer_4 = pd.DataFrame({'answer': pred_4})
answer_4.to_csv('4.csv', index=False)

In [None]:
# Full-data AutoML model for ECLO itself; predictions go to 5.csv.
automl_5_settings = {
    "algorithms": ["CatBoost", "Xgboost", "LightGBM", "Random Forest", "Neural Network"],
    "mode": "Compete",
    "ml_task": "regression",
    "random_state": 42,
    "total_time_limit": None,
}
automl_5 = AutoML(**automl_5_settings)
automl_5.fit(train_x_5, train_y_5)

pred_5 = automl_5.predict(test_5)
answer_5 = pd.DataFrame({'answer': pred_5})
answer_5.to_csv('5.csv', index=False)

In [None]:
# Load the stored K-fold predictions of the four ECLO components.
csv_1, csv_2, csv_3, csv_4 = (
    pd.read_csv(path)
    for path in (
        '/kaggle/input/kfold5/1_Kfold.csv',
        '/kaggle/input/kfold5/2_Kfold.csv',
        '/kaggle/input/kfold5/3_Kfold.csv',
        '/kaggle/input/kfold5/4_Kfold.csv',
    )
)

# Weighted sum with the ECLO coefficients 10 / 5 / 3 / 1.
weighted_components = csv_1['answer'] * 10 + csv_2['answer'] * 5 + csv_3['answer'] * 3 + csv_4['answer'] * 1
answer = weighted_components.to_frame('answer')

sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('AutoML_Seperated_Kfold_5.csv', index=False)

In [None]:
# Submission straight from the stored ECLO K-fold predictions.
csv_5 = pd.read_csv('/kaggle/input/kfoldnew/final_predictions_5.csv')
answer = csv_5['answer'].to_frame('answer')
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('AutoML_ECLO_NEW_Kfold_5.csv', index=False)

In [None]:
# Load the stored K-fold predictions of all five targets.
csv_1, csv_2, csv_3, csv_4, csv_5 = (
    pd.read_csv(path)
    for path in (
        '/kaggle/input/kfold5/1_Kfold.csv',
        '/kaggle/input/kfold5/2_Kfold.csv',
        '/kaggle/input/kfold5/3_Kfold.csv',
        '/kaggle/input/kfold5/4_Kfold.csv',
        '/kaggle/input/kfold5/5_Kfold.csv',
    )
)
# Blend: 0.4 * component-based ECLO + 0.6 * directly predicted ECLO.
blended = (csv_1['answer'] * 10 + csv_2['answer'] * 5 + csv_3['answer'] * 3 + csv_4['answer'] * 1) * 0.4 + csv_5['answer'] * 0.6
answer = blended.to_frame('answer')
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('AutoML_Mean_Kfold_5_0.6.csv', index=False)

In [None]:
# Round the blended predictions to one decimal place.
rounded = answer['answer'].round(1)
answer['answer'] = rounded
print(answer)

In [None]:
# Submission from the rounded blend.
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('AutoML_Mean_0.4_round_1.csv', index=False)

In [None]:
# Load the "_bo" prediction files (the file names contain a space before ".csv").
csv_1, csv_2, csv_3, csv_4, csv_5 = (
    pd.read_csv(path)
    for path in (
        '/kaggle/input/kfoldlast/final_predictions_1_bo .csv',
        '/kaggle/input/kfoldlast/final_predictions_2_bo .csv',
        '/kaggle/input/kfoldlast/final_predictions_3_bo .csv',
        '/kaggle/input/kfoldlast/final_predictions_4_bo .csv',
        '/kaggle/input/kfoldlast/final_predictions_5_bo .csv',
    )
)

In [None]:
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [None]:
# Search range of the two blend weights.
parameter_bounds = {'a': (0, 1), 'b': (0, 1)}


def bo(a,b):
    """Score a blend of component-based and direct ECLO predictions.

    `a` weights the 10/5/3/1 combination of csv_1..csv_4 and `b` weights csv_5.
    Returns the reciprocal of the RMSLE against train_y_5, so a larger value
    means a better blend (the optimiser maximises).

    NOTE(review): this assumes the loaded csv_* files have one row per training
    row; confirm before relying on the score.
    """
    component_sum = csv_1['answer'] * 10 + csv_2['answer'] * 5 + csv_3['answer'] * 3 + csv_4['answer'] * 1
    answer = pd.DataFrame()
    answer['answer'] = component_sum * a + csv_5['answer'] * b
    rmsle = np.sqrt(mean_squared_log_error(answer, train_y_5))
    return 1 / rmsle


BO = BayesianOptimization(f=bo, pbounds=parameter_bounds, random_state=0)
BO.maximize(init_points=500, n_iter=500)
    

In [None]:
# Load the stored predictions of all five targets.
csv_1, csv_2, csv_3, csv_4, csv_5 = (
    pd.read_csv(path)
    for path in (
        '/kaggle/input/kfoldlast/final_predictions_1.csv',
        '/kaggle/input/kfoldlast/final_predictions_2.csv',
        '/kaggle/input/kfoldlast/final_predictions_3.csv',
        '/kaggle/input/kfoldlast/final_predictions_4.csv',
        '/kaggle/input/kfoldlast/final_predictions_5.csv',
    )
)
# Hard-coded blend weights: a-d weight the four components, e weights their
# sum and f weights the direct ECLO prediction.
a, b, c, d, e, f = 4.044, 4.279, 1.779, 0.4202, 0.6793, 0.5412
blended = (csv_1['answer'] * a + csv_2['answer'] * b + csv_3['answer'] * c + csv_4['answer'] * d) * e + csv_5['answer'] * f
answer = blended.to_frame('answer')

In [None]:
# Submission from the weight-tuned blend.
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('AutoML_Bayesian_Not_ECLO.csv', index=False)

In [None]:
# Rebuild and preview the submission frame (nothing is written here).
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)

In [None]:
# Submission from the stored minmax_ECLO predictions.
minmax = pd.read_csv('/kaggle/input/minmax/minmax_ECLO.csv')
answer = minmax['answer'].to_frame('answer')
sample_submission = pd.read_csv('/kaggle/input/daegunew/sample_submission.csv').assign(ECLO=answer['answer'])
print(sample_submission)
sample_submission.to_csv('Minmax_ECLO.csv', index=False)