In [1]:
import numpy as np
import pandas as pd

In [2]:
X_train_val = pd.read_csv('train_features.csv', header=0)
y_train_val = pd.read_csv('train_targets.csv', header=0)
X_test = pd.read_csv('test_features.csv', header=0)
y_test = pd.read_csv('sample_submission.csv')
X_train_val.shape, y_train_val.shape, X_test.shape, y_test.shape

((5808, 11), (5808, 5), (3984, 11), (3984, 5))

In [3]:
for data in [X_train_val, X_test, y_train_val, y_test]:
    data.timestamp = pd.to_datetime(data.timestamp, format='%Y-%m-%d %H:%M:%S')

In [4]:
t_train_start, t_train_end, t_test_start, t_test_end = X_train_val.timestamp.min(), X_train_val.timestamp.max(), \
                                                       X_test.timestamp.min(), X_test.timestamp.max()
t_train_start, t_train_end, t_test_start, t_test_end

(Timestamp('2020-01-01 00:00:00'),
 Timestamp('2020-04-30 23:30:00'),
 Timestamp('2020-05-01 00:00:00'),
 Timestamp('2020-07-22 23:30:00'))

In [5]:
Xy_train_val = pd.merge(X_train_val, y_train_val, on='timestamp')
Xy_train_val.set_index('timestamp', inplace=True)

In [6]:
Xy_test = pd.merge(X_test, y_test, on='timestamp')
Xy_test.set_index('timestamp', inplace=True)

In [7]:
Xy_train_val.shape, Xy_test.shape

((5808, 14), (3984, 14))

In [8]:
Xy_train_val = Xy_train_val.mask(Xy_train_val.sub(Xy_train_val.mean()).div(Xy_train_val.std()).abs().gt(2))

In [9]:
Xy_train_val['A_C2H6'].fillna(Xy_train_val['B_C2H6'].shift(-192), inplace=True)
Xy_train_val['A_C3H8'].fillna(Xy_train_val['B_C3H8'].shift(-192), inplace=True)
Xy_train_val['A_iC4H10'].fillna(Xy_train_val['B_iC4H10'].shift(-192), inplace=True)
Xy_train_val['A_nC4H10'].fillna(Xy_train_val['B_nC4H10'].shift(-192), inplace=True)

In [10]:
Xy = pd.concat([Xy_train_val, Xy_test], axis=0)

for name in ['A_C2H6', 'A_C3H8', 'A_iC4H10', 'A_nC4H10']:
    for i in range(1, 4, 1):
        Xy.loc[:, f'{name}_{i}'] = Xy.loc[:, name].shift(i)
        Xy.loc[:, f'{name}_diff_{i}'] = Xy.loc[:, name].diff(i)
    for i in range(176, 209, 1):
        Xy.loc[:, f'{name}_{i}'] = Xy.loc[:, name].shift(i)
        Xy.loc[:, f'{name}_diff_{i}'] = Xy.loc[:, name].diff(i)

for name in ['A_rate', 'A_CH4', 'A_iC5H12', 'A_nC5H12', 'A_C6H14', 'B_rate']:
    for i in range(1, 4, 1):
        Xy.loc[:, f'{name}_1'] = Xy.loc[:, name].shift(1)
        Xy.loc[:, f'{name}_diff_1'] = Xy.loc[:, name].diff(1)
    for i in range(190, 195, 1):
        Xy.loc[:, f'{name}_{i}'] = Xy.loc[:, name].shift(i)
        Xy.loc[:, f'{name}_diff_{i}'] = Xy.loc[:, name].diff(i)

Xy = Xy.iloc[212:]

Xy['day_of_month'] = Xy.index.day.to_numpy()

Xy.shape, len(Xy.columns), Xy.columns

((9580, 375),
 375,
 Index(['A_rate', 'A_CH4', 'A_C2H6', 'A_C3H8', 'A_iC4H10', 'A_nC4H10',
        'A_iC5H12', 'A_nC5H12', 'A_C6H14', 'B_rate',
        ...
        'B_rate_diff_190', 'B_rate_191', 'B_rate_diff_191', 'B_rate_192',
        'B_rate_diff_192', 'B_rate_193', 'B_rate_diff_193', 'B_rate_194',
        'B_rate_diff_194', 'day_of_month'],
       dtype='object', length=375))

In [11]:
Xy_exp_train_val, Xy_exp_test = Xy.loc[t_train_start:t_train_end], Xy.loc[t_test_start:t_test_end]
Xy_exp_train_val.shape, Xy_exp_test.shape

((5596, 375), (3984, 375))

In [12]:
values_for_filling_na = Xy_exp_train_val.mean(axis=0).values
for i, col in enumerate(Xy_exp_train_val.columns):
    Xy_exp_train_val[col].fillna(values_for_filling_na[i], inplace=True)
    Xy_exp_test[col].fillna(values_for_filling_na[i], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(Xy_exp_train_val.drop(columns=['B_C2H6', 'B_C3H8', 
                                                                                     'B_iC4H10', 'B_nC4H10']), 
                                                      Xy_exp_train_val.loc[:, ['B_C2H6', 'B_C3H8', 
                                                                               'B_iC4H10', 'B_nC4H10']], 
                                                      test_size=0.3, shuffle=False)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((3917, 371), (1679, 371), (3917, 4), (1679, 4))

In [14]:
from catboost import CatBoostRegressor

In [15]:
mape = lambda y_true, y_pred: np.mean(((y_true - y_pred) / y_true).abs().mean().values) * 100

In [16]:
X = Xy_exp_train_val.drop(columns=['B_C2H6', 'B_C3H8', 'B_iC4H10', 'B_nC4H10'])
y = Xy_exp_train_val.loc[:, ['B_C2H6', 'B_C3H8', 'B_iC4H10', 'B_nC4H10']]

In [17]:
from hyperopt import hp, fmin, tpe

In [18]:
def return_best_cb_model(X_tr, y_tr, X_test, y_test, X, y, num_of_evals = 10):
    
    def hyperopt_opt_score(params):
        reg = CatBoostRegressor(allow_writing_files=False, verbose=False, **params, random_seed=42, objective='MAPE',
                                cat_features=['day_of_month']).fit(X_tr, y_tr)
        
        y_pred = reg.predict(X_test)
        
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
    cb_space = {
        'eta': hp.uniform('eta', 0.01, 0.10),
        'n_estimators': hp.choice('n_estimators', np.arange(200, 501, 50)),
        'depth': hp.choice('depth', np.arange(2, 6, 1)),
        'subsample': hp.uniform('subsample', 0.4, 1.0),
        'rsm': hp.uniform('rsm', 0.2, 1.0),
        'l2_leaf_reg': hp.choice('l2_leaf_reg', np.arange(5, 14, 2))
    }
    
    best_params = fmin(fn = hyperopt_opt_score, space = cb_space, algo = tpe.suggest, max_evals = num_of_evals)
    
    best_params['n_estimators'] = np.arange(200, 501, 50)[best_params['n_estimators']]
    best_params['l2_leaf_reg'] = np.arange(5, 14, 2)[best_params['l2_leaf_reg']]
    best_params['depth'] = np.arange(2, 6, 1)[best_params['depth']]
    
    print(best_params)
    
    reg = CatBoostRegressor(allow_writing_files=False, verbose=200, **best_params, random_seed=42, objective='MAPE',
                            cat_features=['day_of_month']).fit(X, y)
    return reg

In [None]:
models = {}
for col in y.columns:
    models[col] = return_best_cb_model(X_train, y_train[col], X_valid, y_valid[col], X, y[col], num_of_evals = 150)

In [None]:
X_test = Xy_exp_test.drop(columns=['B_C2H6', 'B_C3H8', 'B_iC4H10', 'B_nC4H10'])
y_pred = Xy_exp_test.loc[:, ['B_C2H6', 'B_C3H8', 'B_iC4H10', 'B_nC4H10']].copy()
for col in y_pred.columns:
    y_pred[col] = models[col].predict(X_test)

In [None]:
y_pred.to_csv('submission.csv')