In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from supervised.automl import AutoML

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value passed unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    # Environment variables only hold strings, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Single seed shared by the generators above and by the AutoML model's random_state.
SEED = 909
seed_everything(seed=SEED)

In [None]:
# Directory holding the competition CSV files. Defaults to the original
# location; set the DAEGU_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
PATH = os.environ.get('DAEGU_DATA_DIR', '/workspace/daegu/data')

In [None]:
# Load the train / test tables; both files are read with the CP949 encoding.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (f'{PATH}/train_new.csv', f'{PATH}/test_new.csv')
)

# Rich display of both frames (pandas truncates long frames on its own).
display(train_df, test_df)

In [None]:
# Submission template; it is copied and filled with predictions at the end of the notebook.
sample_submission = pd.read_csv(filepath_or_buffer=f'{PATH}/sample_submission.csv')
sample_submission.head(n=5)

### preprocess

In [None]:
# List the columns available in the training frame before choosing features.
train_df.columns

In [None]:
# Input columns used for modelling. '사고일시' is parsed with pd.to_datetime in a
# later cell and replaced by calendar features there.
features = [
    '시군구', '사고일시', 'year', 'month', 'day', 'hour', '요일', '기상상태', '구',  '도로형태_대', '도로형태_중',
    '노면상태', '사고유형', 'old_count', 'jay_count', 'ice_count',  'truck_count', 'walker_count',
    '주차장_수']

# Regression target column.
labels = ['ECLO']

# Re-select the feature columns.
# Take explicit copies: later cells assign new columns to X / X_test, and
# writing into a selection of train_df / test_df raises pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
X = train_df[features].copy()
X_test = test_df[features].copy()

# Kept as a single-column DataFrame (list selector), not a Series.
y = train_df[labels]

print(X.shape, X_test.shape)

In [None]:
# Public holidays (YYYY-MM-DD), one row per year. Weekends are detected
# separately from the day of week, so entries that fall on a weekend are
# redundant but harmless.
# Fix: the 2022 row previously contained '2020-10-10' (a typo); it is now
# '2022-10-10', the Monday following Hangeul Day 2022.
# NOTE(review): the list looks hand-maintained and may be missing some
# substitute holidays / election days — verify against an official calendar.
holi_weekday = [
    '2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01', '2019-05-05', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12', '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25',
    '2020-01-01' ,'2020-01-24' ,'2020-01-25', '2020-01-26', '2020-03-01', '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09', '2020-12-25',
    '2021-01-01' ,'2021-02-11' ,'2021-02-12', '2021-02-13', '2021-03-01', '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-09-20', '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
    '2022-01-01' ,'2022-01-31' ,'2022-02-01', '2022-02-02', '2022-03-01', '2022-05-05', '2022-05-08', '2022-06-06', '2022-08-15', '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25',
    '2023-01-01' ,'2023-01-21' ,'2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01']


def add_calendar_features(df, holidays):
    """Return a copy of ``df`` with calendar features derived from '사고일시'.

    The '사고일시' column is parsed with ``pd.to_datetime``. The existing
    'year', 'month', 'day' and 'hour' columns are overwritten with the parsed
    values, and 'weekday', 'day_of_year', 'day_of_week' and 'holiday' are
    added. The raw '사고일시' column is dropped from the result.

    Args:
        df: Frame containing a '사고일시' column that ``pd.to_datetime`` can parse.
        holidays: Collection of 'YYYY-MM-DD' strings treated as holidays.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    out = df.copy()
    timestamp = pd.to_datetime(out['사고일시'])

    out['year'] = timestamp.dt.year
    out['month'] = timestamp.dt.month
    out['day'] = timestamp.dt.day
    out['hour'] = timestamp.dt.hour
    # 'weekday' and 'day_of_week' carry the same value (Monday=0); both are
    # kept so the set of model input columns stays unchanged.
    out['weekday'] = timestamp.dt.weekday
    out['day_of_year'] = timestamp.dt.dayofyear
    out['day_of_week'] = timestamp.dt.dayofweek

    # holiday = 1 on Saturdays/Sundays or on any listed public holiday.
    is_weekend = out['day_of_week'] >= 5
    is_listed_holiday = timestamp.dt.strftime('%Y-%m-%d').isin(holidays)
    out['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)

    return out.drop(columns=['사고일시'])


# Apply the identical transformation to both splits.
X = add_calendar_features(X, holi_weekday)
X_test = add_calendar_features(X_test, holi_weekday)

In [None]:
def group_season(df):
    """Label each row with its season based on the 'month' column.

    Months 3-5 map to '봄', 6-8 to '여름', 9-11 to '가을' and 12, 1, 2 to '겨울'.
    The labels are written into ``df['season']`` in place (the column is
    created if missing), and that column is returned.

    Args:
        df: DataFrame with a numeric 'month' column.

    Returns:
        The 'season' column of ``df``.
    """
    months_by_season = {
        '봄': [3, 4, 5],
        '여름': [6, 7, 8],
        '가을': [9, 10, 11],
        '겨울': [12, 1, 2],
    }
    for season_label, season_months in months_by_season.items():
        in_season = df['month'].isin(season_months)
        df.loc[in_season, 'season'] = season_label
    return df['season']

# Add a cyclical hour feature and a season label to both splits.
# NOTE(review): only the cosine component is used, so hours that mirror each
# other around midnight/noon (e.g. 6 and 18) receive the same value —
# consider whether a sine component is wanted as well.
for frame in (X, X_test):
    frame['cosine_time'] = np.cos(2 * np.pi * frame['hour'] / 24)
    frame['season'] = group_season(frame)

In [None]:
# Independent working copies, so the encoding and scaling applied to
# train_x / test_x do not modify X, y or X_test.
train_x, train_y, test_x = X.copy(), y.copy(), X_test.copy()

#### Categorical columns: Target Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder

# Collect the string-valued (object dtype) columns.
categorical_features = [
    column for column, dtype in train_x.dtypes.items() if dtype == "object"
]
# Show which string columns were picked up.
display(categorical_features)

# Fit one target encoder per column on the training split only, then reuse
# the fitted mapping for the test split.
# NOTE(review): the training rows are encoded with statistics fitted on those
# same rows — check whether that is acceptable for validation.
for column in categorical_features:
    encoder = TargetEncoder(cols=[column])
    train_x[column] = encoder.fit_transform(train_x[column], train_y)
    test_x[column] = encoder.transform(test_x[column])

display(train_x.head(), test_x.head())

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only and then applied to the
# test split. Its array output is wrapped back into DataFrames, with the
# column labels taken from X / X_test.
scaler = MinMaxScaler()
train_x = pd.DataFrame(scaler.fit_transform(train_x), columns=X.columns)
test_x = pd.DataFrame(scaler.transform(test_x), columns=X_test.columns)

In [None]:
# Correlation matrix of the scaled features together with the ECLO target.
train_corr = train_x.assign(ECLO=train_y)

train_corr.corr()

In [None]:
# Names of the columns to remove.
columns_to_drop = ['year']

# Remove those columns from both train_x and test_x.
train_x, test_x = (
    frame.drop(columns=columns_to_drop) for frame in (train_x, test_x)
)

In [None]:
# Final set of feature columns that will be passed to the model.
train_x.columns

### TRAIN

In [None]:
# AutoML configuration: a regression task in 'Compete' mode, restricted to the
# three listed gradient-boosting libraries, seeded with the notebook-wide SEED.
model = AutoML(
    ml_task='regression',
    mode='Compete',
    algorithms=['LightGBM', 'CatBoost', 'Xgboost'],
    random_state=SEED,
    n_jobs=-1,
)

In [17]:
# Run the AutoML search on the preprocessed training features and the ECLO target.
model.fit(train_x, train_y)

[1]	train's rmse: 0.997559	validation's rmse: 0.989264
[2]	train's rmse: 0.995287	validation's rmse: 0.987689
[3]	train's rmse: 0.993173	validation's rmse: 0.986038
[4]	train's rmse: 0.991793	validation's rmse: 0.985511
[5]	train's rmse: 0.989921	validation's rmse: 0.984183
[6]	train's rmse: 0.98869	validation's rmse: 0.983721
[7]	train's rmse: 0.987072	validation's rmse: 0.982587
[8]	train's rmse: 0.98554	validation's rmse: 0.981461
[9]	train's rmse: 0.984115	validation's rmse: 0.980573
[10]	train's rmse: 0.982761	validation's rmse: 0.979818
[11]	train's rmse: 0.981776	validation's rmse: 0.979545
[12]	train's rmse: 0.980554	validation's rmse: 0.978884
[13]	train's rmse: 0.979413	validation's rmse: 0.978253
[14]	train's rmse: 0.978504	validation's rmse: 0.978074
[15]	train's rmse: 0.977455	validation's rmse: 0.977657
[16]	train's rmse: 0.976485	validation's rmse: 0.977139
[17]	train's rmse: 0.975501	validation's rmse: 0.976687
[18]	train's rmse: 0.974599	validation's rmse: 0.976383
[19

In [None]:
# Predict the target for the preprocessed test features with the fitted model.
pred = model.predict(test_x)

In [None]:
# Fill a copy of the submission template with the predictions; the template
# itself is left unchanged.
baseline_submission = sample_submission.assign(ECLO=pred)
baseline_submission

In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the submission file.
os.makedirs(f'{PATH}/submits', exist_ok=True)
baseline_submission.to_csv(f'{PATH}/submits/automl_compete_3.219286.csv', index=False)