# 필요 package Import

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import optuna
from optuna.samplers import TPESampler
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

## Version

### 개발 OS
- window 11

In [2]:
# python version
import sys
sys.version

'3.8.17 (default, Jul  5 2023, 20:44:21) [MSC v.1916 64 bit (AMD64)]'

In [6]:
# 사용 모델 version
import xgboost
import catboost
import sklearn
print("xgboost version :",xgboost.__version__)
print("Catboost version :", catboost.__version__)
print("sklearn version :", sklearn.__version__)

xgboost version : 2.0.1
Catboost version : 1.2
sklearn version : 1.3.0


## XGBRegressor

### 데이터셋 불러오기

In [2]:
train = pd.read_csv('./data/train.csv').drop(columns='SAMPLE_ID')
test = pd.read_csv('./data/test.csv').drop(columns='SAMPLE_ID')

### 데이터셋 전처리

In [3]:
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

 # datetime 컬럼 제거
#train = train.drop(columns='ATA')
#test = test.drop(columns='ATA')

train['quarter'] = train.month.apply(lambda x: 0 if x in range(1, 4) else 1 if x in range(4, 7) else 2 if x in range(7, 10) else 3)
test['quarter'] = test.month.apply(lambda x: 0 if x in range(1, 4) else 1 if x in range(4, 7) else 2 if x in range(7, 10) else 3)

In [4]:
categorical_features = ['FLAG','ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'SHIPMANAGER','year', 'month', 'day', 'hour','minute', 'weekday']
numeric_features = list(set(train.columns)-set(categorical_features))

In [5]:
# encoding
for i in tqdm(categorical_features):
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

  0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 11/11 [00:02<00:00,  3.77it/s]


In [6]:
train = train[train.LENGTH.notnull()] # 주로 사용되는 feature에 결측값이 존재해 제거

#### Feature 엔지니어링

In [7]:
train['BTA'] = train.ATA + pd.to_timedelta(train.CI_HOUR,'h')
train = pd.merge(train,train.groupby(['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE']).BTA.mean().reset_index().rename(columns={'BTA':'BTA_mean'}),how = 'left', on = ['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE'])
test = pd.merge(test,train.groupby(['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE']).BTA.mean().reset_index().rename(columns={'BTA':'BTA_mean'}),how = 'left', on = ['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE'])

In [8]:
train['weather_feature'] = (train.BTA_mean-train.ATA)/pd.to_timedelta(1,'h')
test['weather_feature'] = (test.BTA_mean-test.ATA)/pd.to_timedelta(1,'h')

In [9]:
# weather_feature는 음수가 나올 수 없는 데이터이므로 0 미만 값들을 0으로 변경
train.weather_feature = train.weather_feature.apply(lambda x:0 if x<0 else x)
test.weather_feature = test.weather_feature.apply(lambda x:0 if x<0 else x)

In [10]:
# XGB X_train, y_train, X_test 설정
y_train_XGB = train.CI_HOUR
X_train_XGB = train.drop(columns = ['ID','CI_HOUR', 'BTA', 'BTA_mean','ATA','FLAG','minute','hour','day'])
X_test_XGB = test.drop(columns = ['ID','BTA_mean','ATA','FLAG','minute','hour','day'])

### Modeling
- weather_feature가 있는 행과 없는 행을 구분하여 modeling 진행

#### With weather_feature

In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
# 파라미터 튜닝을 위한 코드
'''
def objective(trial):
    params = {
        "n_estimators": 10000,
        "verbosity" : 0,
        "seed" : 42,
        "n_jobs" : -1,
        "eta": 0.1,
        "max_depth": trial.suggest_int("max_depth", 9, 11),
        "objective" : 'reg:absoluteerror',
        "early_stopping_rounds": 50,
        "eval_metric" : 'mae',
        "gamma": 0,
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 20),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 20),
        "colsample_bytree" : 1,
        "lambda": trial.suggest_int("lambda", 0, 10),
        "alpha": trial.suggest_int("alpha", 0, 10),
        "max_bin": 2048,
    }

    model = XGBRegressor(**params)
    X_train_1 = X_train[X_train.ta.notnull()]
    y_train_1 = y_train[X_train_1.index]
    X_test_1 = X_test[X_test.ta.notnull()]
    XGB_pred_01 = np.zeros((X_test_1.shape[0]))
    mae_list = []
    for tr_idx, val_idx in kf.split(X_train_1, y_train_1):
        tr_x, tr_y = X_train_1.iloc[tr_idx], y_train_1.iloc[tr_idx]
        val_x, val_y = X_train_1.iloc[val_idx], y_train_1.iloc[val_idx]
        model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], verbose = 0)
        pred = model.predict(val_x)

        mae = mean_absolute_error(val_y, pred)
        mae_list.append(mae)
    
        print('Mae :', mae)

    return np.mean(mae_list)
'''

'\ndef objective(trial):\n    params = {\n        "n_estimators": 10000,\n        "verbosity" : 0,\n        "seed" : 42,\n        "n_jobs" : -1,\n        "eta": 0.1,\n        "max_depth": trial.suggest_int("max_depth", 9, 11),\n        "objective" : \'reg:absoluteerror\',\n        "early_stopping_rounds": 50,\n        "eval_metric" : \'mae\',\n        "gamma": 0,\n        "min_child_weight": trial.suggest_int("min_child_weight", 0, 20),\n        "max_delta_step": trial.suggest_int("max_delta_step", 0, 20),\n        "colsample_bytree" : 1,\n        "lambda": trial.suggest_int("lambda", 0, 10),\n        "alpha": trial.suggest_int("alpha", 0, 10),\n        "max_bin": 2048,\n    }\n\n    model = XGBRegressor(**params)\n    X_train_1 = X_train[X_train.ta.notnull()]\n    y_train_1 = y_train[X_train_1.index]\n    X_test_1 = X_test[X_test.ta.notnull()]\n    XGB_pred_01 = np.zeros((X_test_1.shape[0]))\n    mae_list = []\n    for tr_idx, val_idx in kf.split(X_train_1, y_train_1):\n        tr_x, 

In [13]:
# optuna를 통한 파라미터 튜닝
'''
pruner2 = optuna.pruners.MedianPruner(n_warmup_steps=5)
study2 = optuna.create_study(pruner=pruner2, direction="minimize",sampler=TPESampler(seed=42))
study2.optimize(objective, n_trials=30)

print("Number of finished trials: {}".format(len(study2.trials)))

print("Best trial:")
trial = study2.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
'''

'\npruner2 = optuna.pruners.MedianPruner(n_warmup_steps=5)\nstudy2 = optuna.create_study(pruner=pruner2, direction="minimize",sampler=TPESampler(seed=42))\nstudy2.optimize(objective, n_trials=30)\n\nprint("Number of finished trials: {}".format(len(study2.trials)))\n\nprint("Best trial:")\ntrial = study2.best_trial\n\nprint("  Value: {}".format(trial.value))\n\nprint("  Params: ")\nfor key, value in trial.params.items():\n    print("    {}: {}".format(key, value))\n'

In [14]:
# 가장 잘 나온 파라미터로 학습 결과 추론
X_train_1 = X_train_XGB[X_train_XGB.weather_feature.notnull()].drop(columns = 'year')
y_train_1 = y_train_XGB[X_train_1.index]
X_test_1 = X_test_XGB[X_test_XGB.weather_feature.notnull()].drop(columns = 'year')
model = XGBRegressor(n_estimators = 1000000, verbosity = 1, eta=0.01, max_depth = 11, gamma =0, min_child_weight = 15,
                     max_delta_step = 15, colsample_bytree = 1, reg_lambda = 1, alpha = 0, max_bin = 2048,
                        objective = 'reg:absoluteerror', seed=42, eval_metric= 'mae',early_stopping_rounds=1000, n_jobs = -1)
XGB_pred_01 = np.zeros((X_test_1.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X_train_1, y_train_1):
    tr_x, tr_y = X_train_1.iloc[tr_idx], y_train_1.iloc[tr_idx]
    val_x, val_y = X_train_1.iloc[val_idx], y_train_1.iloc[val_idx]
    model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], verbose=100)
    pred = model.predict(val_x)

    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    
    print('Mae :', mae)
    
    sub_pred_01 = model.predict(X_test_1)
    sub_pred_01 = np.clip(sub_pred_01, 0, np.inf)
    sub_pred_01 = np.array(sub_pred_01)

    XGB_pred_01 += sub_pred_01
print(f'{model.__class__.__name__}의 5fold 평균 MAE는 {np.mean(mae_list)}')

[0]	validation_0-mae:65.35732
[100]	validation_0-mae:42.64851
[200]	validation_0-mae:27.52804
[300]	validation_0-mae:17.13797
[400]	validation_0-mae:9.77826
[500]	validation_0-mae:5.68531
[600]	validation_0-mae:3.95062
[700]	validation_0-mae:3.25165
[800]	validation_0-mae:2.94217
[900]	validation_0-mae:2.82519
[1000]	validation_0-mae:2.78978
[1100]	validation_0-mae:2.77634
[1200]	validation_0-mae:2.76833
[1300]	validation_0-mae:2.76449
[1400]	validation_0-mae:2.76122
[1500]	validation_0-mae:2.75837
[1600]	validation_0-mae:2.75802
[1700]	validation_0-mae:2.75782
[1800]	validation_0-mae:2.75515
[1900]	validation_0-mae:2.75280
[2000]	validation_0-mae:2.74260
[2100]	validation_0-mae:2.74098
[2200]	validation_0-mae:2.74104
[2300]	validation_0-mae:2.73874
[2400]	validation_0-mae:2.73727
[2500]	validation_0-mae:2.73640
[2600]	validation_0-mae:2.73600
[2700]	validation_0-mae:2.73559
[2800]	validation_0-mae:2.73555
[2900]	validation_0-mae:2.73539
[3000]	validation_0-mae:2.73525
[3100]	validatio

#### Without weather_feature

In [15]:
# 파라미터 튜닝을 위한 코드
'''
def objective(trial):
    params = {
        "n_estimators": 10000,
        "verbosity" : 0,
        "seed" : 42,
        "n_jobs" : -1,
        "eta": 0.1,
        "max_depth": 7,
        "objective" : 'reg:absoluteerror',
        "early_stopping_rounds": 100,
        "eval_metric" : 'mae',
        "gamma": trial.suggest_int("gamma", 0, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10),
        "max_delta_step": trial.suggest_float("max_delta_step", 0, 10),
        "colsample_bytree" : trial.suggest_categorical("colsample_bytree", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]),
        "lambda": trial.suggest_int("lambda", 0, 10),
        "alpha": trial.suggest_int("alpha", 0, 10),
        "max_bin": trial.suggest_categorical("max_bin",[32, 64, 128, 256, 512, 1024, 2048]),
    }

    model = XGBRegressor(**params)
    X_train_2 = X_train[X_train.weather_feature.isnull()].drop(columns=['month','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT','BN','weather_feature'])
    y_train_2 = y_train[X_train_2.index]
    X_test_2 = X_test[X_test.weather_feature.isnull()].drop(columns=['month','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT','BN','weather_feature'])
    XGB_pred_02 = np.zeros((X_test_2.shape[0]))
    mae_list = []
    for tr_idx, val_idx in kf.split(X_train_2, y_train_2):
        tr_x, tr_y = X_train_2.iloc[tr_idx], y_train_2.iloc[tr_idx]
        val_x, val_y = X_train_2.iloc[val_idx], y_train_2.iloc[val_idx]
        model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], verbose = 0)
        pred = model.predict(val_x)

        mae = mean_absolute_error(val_y, pred)
        mae_list.append(mae)
    
        print('Mae :', mae)

    return np.mean(mae_list)
'''

'\ndef objective(trial):\n    params = {\n        "n_estimators": 10000,\n        "verbosity" : 0,\n        "seed" : 42,\n        "n_jobs" : -1,\n        "eta": 0.1,\n        "max_depth": 7,\n        "objective" : \'reg:absoluteerror\',\n        "early_stopping_rounds": 100,\n        "eval_metric" : \'mae\',\n        "gamma": trial.suggest_int("gamma", 0, 10),\n        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10),\n        "max_delta_step": trial.suggest_float("max_delta_step", 0, 10),\n        "colsample_bytree" : trial.suggest_categorical("colsample_bytree", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]),\n        "lambda": trial.suggest_int("lambda", 0, 10),\n        "alpha": trial.suggest_int("alpha", 0, 10),\n        "max_bin": trial.suggest_categorical("max_bin",[32, 64, 128, 256, 512, 1024, 2048]),\n    }\n\n    model = XGBRegressor(**params)\n    X_train_2 = X_train[X_train.weather_feature.isnull()].drop(columns=[\'month\',\'BREADTH\', \'BUILT\', \'DEADWE

In [16]:
# optuna를 통한 파라미터 튜닝
'''
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(pruner=pruner, direction="minimize",sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=200)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
'''

'\npruner = optuna.pruners.MedianPruner(n_warmup_steps=5)\nstudy = optuna.create_study(pruner=pruner, direction="minimize",sampler=TPESampler(seed=42))\nstudy.optimize(objective, n_trials=200)\n\nprint("Number of finished trials: {}".format(len(study.trials)))\n\nprint("Best trial:")\ntrial = study.best_trial\n\nprint("  Value: {}".format(trial.value))\n\nprint("  Params: ")\nfor key, value in trial.params.items():\n    print("    {}: {}".format(key, value))\n'

In [17]:
# 가장 잘 나온 파라미터로 학습 결과 추론
X_train_2 = X_train_XGB[X_train_XGB.weather_feature.isnull()].drop(columns=['month','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT','BN','weather_feature'])
y_train_2 = y_train_XGB[X_train_2.index]
X_test_2 = X_test_XGB[X_test_XGB.weather_feature.isnull()].drop(columns=['month','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT','BN','weather_feature'])

model =  XGBRegressor(n_estimators = 100000, verbosity = 1, eta=0.01, max_depth = 7, objective = 'reg:absoluteerror',
                      gamma = 0, min_child_weight = 10, max_delta_step = 2.2152489372607116, colsample_bytree = 0.8,
                      reg_lambda = 7, alpha =4, max_bin = 64, 
                        seed=42, eval_metric= 'mae',early_stopping_rounds=1000, n_jobs = -1)
XGB_pred_02 = np.zeros((X_test_2.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X_train_2, y_train_2):
    tr_x, tr_y = X_train_2.iloc[tr_idx], y_train_2.iloc[tr_idx]
    val_x, val_y = X_train_2.iloc[val_idx], y_train_2.iloc[val_idx]
    model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], verbose=100)
    pred = model.predict(val_x)

    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    
    print('Mae :', mae)
    
    sub_pred_02 = model.predict(X_test_2)
    sub_pred_02 = np.clip(sub_pred_02, 0, np.inf)
    sub_pred_02 = np.array(sub_pred_02)

    XGB_pred_02 += sub_pred_02
print(f'{model.__class__.__name__}의 5fold 평균 MAE는 {np.mean(mae_list)}')

[0]	validation_0-mae:55.87099
[100]	validation_0-mae:46.30168
[200]	validation_0-mae:43.50172
[300]	validation_0-mae:42.30831
[400]	validation_0-mae:41.63684
[500]	validation_0-mae:41.25333
[600]	validation_0-mae:40.99982
[700]	validation_0-mae:40.84612
[800]	validation_0-mae:40.70657
[900]	validation_0-mae:40.60808
[1000]	validation_0-mae:40.51565
[1100]	validation_0-mae:40.47589
[1200]	validation_0-mae:40.42567
[1300]	validation_0-mae:40.37527
[1400]	validation_0-mae:40.33260
[1500]	validation_0-mae:40.26999
[1600]	validation_0-mae:40.20339
[1700]	validation_0-mae:40.13236
[1800]	validation_0-mae:40.08907
[1900]	validation_0-mae:40.06640
[2000]	validation_0-mae:40.02901
[2100]	validation_0-mae:40.00658
[2200]	validation_0-mae:39.97715
[2300]	validation_0-mae:39.95462
[2400]	validation_0-mae:39.93467
[2500]	validation_0-mae:39.91066
[2600]	validation_0-mae:39.88796
[2700]	validation_0-mae:39.86213
[2800]	validation_0-mae:39.83631
[2900]	validation_0-mae:39.81013
[3000]	validation_0-ma

#### submission

In [18]:
submit = pd.read_csv('./data/sample_submission.csv')

In [19]:
submit.CI_HOUR.iloc[X_test_1.index] = XGB_pred_01/5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit.CI_HOUR.iloc[X_test_1.index] = XGB_pred_01/5


In [20]:
submit.CI_HOUR.iloc[X_test_2.index] = XGB_pred_02/5

In [21]:
# CI_HOUR 는 0보다 작을 수 없기 때문에 0 미만 값을 0으로 처리
submit.CI_HOUR = submit.CI_HOUR.apply(lambda x:0 if x<0 else x)

In [22]:
submit.to_csv('./submission/XGB_submission_final.csv',index=False)

## CatboostRegressor

### 데이터셋 불러오기

In [23]:
train = pd.read_csv('./data/train.csv').drop(columns='SAMPLE_ID')
test = pd.read_csv('./data/test.csv').drop(columns='SAMPLE_ID')

### 데이터셋 전처리

In [24]:
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

train['ARI_CO_PO'] = train.ARI_CO + '-' + train.ARI_PO
test['ARI_CO_PO'] = test.ARI_CO + '-' + test.ARI_PO

In [25]:
# 풍속 관련 feature들이 모두 0인데 기온 feature가 null인 경우 풍속 관련 feature도 null값으로 판단해 전처리
train.loc[(train['U_WIND'] == 0) & (train['V_WIND'] == 0) & (train['AIR_TEMPERATURE'].isnull()) & (train['BN'] == 0), ['U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN']] = np.nan

In [26]:
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'FLAG','SHIPMANAGER','year', 'month', 'day', 'hour','minute', 'weekday','ARI_CO_PO']
numeric_features = list(set(train.columns)-set(categorical_features))

In [27]:
# encoding
for i in tqdm(categorical_features):
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


In [28]:
train = train[train.LENGTH.notnull()]

### Feature 엔지니어링

In [29]:
train = pd.merge(train,train.groupby(['ARI_CO','ARI_PO']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_mean'}),how = 'left',on = ['ARI_CO','ARI_PO'])
test = pd.merge(test,train.groupby(['ARI_CO','ARI_PO']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_mean'}),how = 'left',on = ['ARI_CO','ARI_PO'])
test.CO_PO_mean = test.CO_PO_mean.fillna(train.CO_PO_mean.mean())

In [30]:
train = pd.merge(train,train.groupby(['ARI_PO','ARI_CO','month','year']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_ym_mean'}),how = 'left',on = ['ARI_PO','ARI_CO','month','year'])
test = pd.merge(test,train.groupby(['ARI_PO','ARI_CO','month','year']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_ym_mean'}),how = 'left',on = ['ARI_PO','ARI_CO','month','year'])
test.CO_PO_ym_mean = test.CO_PO_ym_mean.fillna(train.CO_PO_ym_mean.mean())

In [31]:
train = pd.merge(train,train.groupby(['ARI_PO','ARI_CO','SHIP_TYPE_CATEGORY','year']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_SH_y_mean'}),how = 'left',on = ['ARI_PO','ARI_CO','SHIP_TYPE_CATEGORY','year'])
test = pd.merge(test,train.groupby(['ARI_PO','ARI_CO','SHIP_TYPE_CATEGORY','year']).CI_HOUR.mean().reset_index().rename(columns={'CI_HOUR':'CO_PO_SH_y_mean'}),how = 'left',on = ['ARI_PO','ARI_CO','SHIP_TYPE_CATEGORY','year'])
test.CO_PO_SH_y_mean = test.CO_PO_SH_y_mean.fillna(train.CO_PO_SH_y_mean.mean())

In [32]:
train['BTA'] = train.ATA + pd.to_timedelta(train.CI_HOUR,'h')
train = pd.merge(train,train.groupby(['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE']).BTA.mean().reset_index().rename(columns={'BTA':'BTA_mean'}),how = 'left', on = ['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE'])
test = pd.merge(test,train.groupby(['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE']).BTA.mean().reset_index().rename(columns={'BTA':'BTA_mean'}),how = 'left', on = ['ARI_CO','U_WIND','V_WIND','BN','AIR_TEMPERATURE'])

In [33]:
train['weather_feature'] = (train.BTA_mean-train.ATA)/pd.to_timedelta(1,'h')
test['weather_feature'] = (test.BTA_mean-test.ATA)/pd.to_timedelta(1,'h')

In [34]:
train.weather_feature = train.weather_feature.apply(lambda x:0 if x<0 else x)
test.weather_feature = test.weather_feature.apply(lambda x:0 if x<0 else x)

In [35]:
y_train_cat = train.CI_HOUR
X_train_cat = train.drop(columns = ['ID','CI_HOUR', 'BTA', 'BTA_mean','day','ATA', 'hour', 'minute','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT'])
X_test_cat = test.drop(columns = ['ID','BTA_mean','day','ATA', 'hour', 'minute','BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT','SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE','ATA_LT'])

In [36]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

kf = KFold(n_splits=5, shuffle=True, random_state=42)

X_train_1 = X_train_cat[X_train_cat.weather_feature.notnull()].query('DIST>0').drop(columns=['ARI_CO','ARI_PO'])
y_train_1 = y_train_cat[X_train_1.index]
X_test_1 = X_test_cat[X_test_cat.weather_feature.notnull()].query('DIST>0').drop(columns=['ARI_CO','ARI_PO'])

model = CatBoostRegressor(random_state = 42,n_estimators=200000, eval_metric='MAE',loss_function='MAE')
cat_pred_01 = np.zeros((X_test_1.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X_train_1, y_train_1):
    tr_x, tr_y = X_train_1.iloc[tr_idx], y_train_1.iloc[tr_idx]
    val_x, val_y = X_train_1.iloc[val_idx], y_train_1.iloc[val_idx]
    model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)],early_stopping_rounds = 50, verbose = 1000)
    pred = model.predict(val_x)

    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    
    sub_pred = np.array(model.predict(X_test_1))/5
    cat_pred_01 += sub_pred
print(f'{model.__class__.__name__}의 5fold 평균 MAE는 {np.mean(mae_list)}')


0:	learn: 89.6129601	test: 90.1657563	best: 90.1657563 (0)	total: 177ms	remaining: 9h 49m 51s
1000:	learn: 6.3667891	test: 6.6609151	best: 6.6609151 (1000)	total: 15.2s	remaining: 50m 15s
2000:	learn: 5.9035141	test: 6.3105401	best: 6.3105401 (2000)	total: 30.9s	remaining: 50m 57s
3000:	learn: 5.6640783	test: 6.1486531	best: 6.1486531 (3000)	total: 48.7s	remaining: 53m 15s
4000:	learn: 5.5359354	test: 6.0949021	best: 6.0948367 (3988)	total: 1m 2s	remaining: 51m 13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 6.094516909
bestIteration = 4015

Shrink model to first 4016 iterations.
0:	learn: 89.5153045	test: 90.5425305	best: 90.5425305 (0)	total: 14.4ms	remaining: 48m
1000:	learn: 6.3765155	test: 6.4719819	best: 6.4719819 (1000)	total: 14.2s	remaining: 47m 7s
2000:	learn: 5.9229582	test: 6.1445356	best: 6.1445356 (2000)	total: 27.4s	remaining: 45m 7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 6.025036336
bestIteration = 2870

Shrink model to f

In [37]:
X_train_2 = X_train_cat[X_train_cat.weather_feature.isnull()].query('DIST>0').drop(columns=['BN','weather_feature'])
y_train_2 = y_train_cat[X_train_2.index]
X_test_2 = X_test_cat[X_test_cat.weather_feature.isnull()].query('DIST>0').drop(columns=['BN','weather_feature'])

model = CatBoostRegressor(random_state = 42,n_estimators=200000, eval_metric='MAE',loss_function='MAE')
cat_pred_2 = np.zeros((X_test_2.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X_train_2, y_train_2):
    tr_x, tr_y = X_train_2.iloc[tr_idx], y_train_2.iloc[tr_idx]
    val_x, val_y = X_train_2.iloc[val_idx], y_train_2.iloc[val_idx]
    model.fit(tr_x, tr_y, eval_set=[(val_x, val_y)],early_stopping_rounds = 50, verbose = 1000)
    pred = model.predict(val_x)

    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    
    sub_pred = np.array(model.predict(X_test_2))/5
    cat_pred_2 += sub_pred
print(f'{model.__class__.__name__}의 5fold 평균 MAE는 {np.mean(mae_list)}') #68.92333692944034

0:	learn: 85.1582307	test: 88.0229281	best: 88.0229281 (0)	total: 10.3ms	remaining: 34m 28s
1000:	learn: 67.8022239	test: 71.7638719	best: 71.7638719 (1000)	total: 8.86s	remaining: 29m 21s
2000:	learn: 66.0612134	test: 71.1329283	best: 71.1324559 (1997)	total: 17.5s	remaining: 28m 49s
3000:	learn: 64.8795047	test: 70.8034693	best: 70.8021660 (2994)	total: 26.1s	remaining: 28m 34s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 70.70512195
bestIteration = 3463

Shrink model to first 3464 iterations.
0:	learn: 85.8839433	test: 84.9494293	best: 84.9494293 (0)	total: 9.23ms	remaining: 30m 46s
1000:	learn: 68.3557791	test: 69.6226405	best: 69.6226405 (1000)	total: 8.56s	remaining: 28m 22s
2000:	learn: 66.5377847	test: 69.0098711	best: 69.0098711 (2000)	total: 17.2s	remaining: 28m 18s
3000:	learn: 65.3739278	test: 68.7155004	best: 68.7155004 (3000)	total: 26s	remaining: 28m 25s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 68.57714661
bestIteration = 376

#### submission

In [38]:
submit = pd.read_csv('./data/sample_submission.csv')

In [39]:
submit.CI_HOUR.iloc[X_test_1.index] = cat_pred_01

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit.CI_HOUR.iloc[X_test_1.index] = cat_pred_01


In [40]:
submit.CI_HOUR.iloc[X_test_2.index] = cat_pred_2

In [41]:
# DIST가 0인 경우 대부분의 CI_HOUR 값이 0을 보이고 0이 아닌 경우 human error로 판단해 모두 0으로 처리
submit.CI_HOUR.iloc[test[test.DIST==0].index] = 0

In [42]:
submit.CI_HOUR = submit.CI_HOUR.apply(lambda x:0 if x<0 else x)

In [43]:
submit.to_csv('./submission/Cat_submission_final.csv',index=False)

#### submission ensemble
- XGB, Cat 두 모델로 나온 결과물을 1대1로 앙상블 진행

In [44]:
a = pd.read_csv('./submission/Cat_submission_final.csv')
b = pd.read_csv('./submission/XGB_submission_final.csv')
submit = pd.read_csv('./data/sample_submission.csv')

In [45]:
submit.CI_HOUR = (a.CI_HOUR+b.CI_HOUR)/2

In [46]:
submit.to_csv('./submission/merge_final.csv',index=False)