In [272]:
from feature.base_dataset import *
from feature.after_dataset import *
from feature.make_dataset import *
from feature.utils import *
from model.model_train import model_train
import pandas as pd
import numpy as np
from glob import glob

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [273]:
def moving_average(input_list):
    """Build trailing moving-average features from a list of input CSV files.

    Each file is read, passed through ``limit_range``, forward-filled, stripped
    of its ``DAT`` and ``obs_time`` columns, and smoothed with a 6-row rolling
    mean (``min_periods=1``, so the first rows average over fewer rows).

    Args:
        input_list (list): paths of the input CSV files (e.g. train_input_list)

    Returns:
        DataFrame: the per-file moving averages stacked vertically in the order
        of ``input_list``. Every column name gets the suffix ``'roling'``
        (spelling kept as-is so existing column names do not change). Each
        file keeps its own row index, so the index restarts for every file.
        An empty DataFrame is returned when ``input_list`` is empty.
    """
    frames = []
    for path in input_list:
        df = pd.read_csv(path)
        df = limit_range(df)
        # .ffill() replaces the deprecated fillna(method='ffill') spelling.
        df = df.ffill()
        df = df.drop(['DAT', 'obs_time'], axis=1)

        ma = df.rolling(6, min_periods=1).mean()
        ma.columns = [f'{col}roling' for col in ma.columns]
        frames.append(ma)

    if not frames:
        # pd.concat raises on an empty list; keep the old "empty in, empty out".
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames)

def accumulate(train, test, col, period=672):
    """Add a running total of a daily cumulative column, restarted per block.

    For both ``train`` and ``test`` the column ``'일간' + col`` is read and a new
    column ``'월간' + col`` is written in place. The new column is the cumulative
    sum of the source column within consecutive positional blocks of ``period``
    rows (rows 0..period-1, period..2*period-1, ...).

    Unlike the previous version, every block present in the frame is processed
    (not only the first 28 train / 5 test blocks), and the result is written
    with a single column assignment rather than chained indexing, which does
    not write through under pandas copy-on-write.

    Args:
        train (DataFrame): training frame, modified in place.
        test (DataFrame): test frame, modified in place.
        col (str): column suffix; ``'일간' + col`` must exist in both frames.
        period (int): number of rows per block. Defaults to 672, the block
            size this function has always used.

    Returns:
        None

    Raises:
        ValueError: if ``period`` is not positive.
    """
    if period <= 0:
        raise ValueError("period must be a positive integer")

    for frame in (train, test):
        # Group by row position, not index label: the index may repeat.
        block_id = np.arange(len(frame)) // period
        running_total = frame['일간' + col].groupby(block_id).cumsum()
        # Assign raw values so no index alignment is attempted.
        frame['월간' + col] = running_total.to_numpy()



In [274]:
# Collect the CSV paths in sorted order so inputs and targets pair up by position.
train_input_list, train_target_list = (
    sorted(glob('./data/train_input/*.csv')),
    sorted(glob('./data/train_target/*.csv')),
)
test_input_list, test_target_list = (
    sorted(glob('./data/test_input/*.csv')),
    sorted(glob('./data/test_target/*.csv')),
)

# Base frames built by the project's make_dataset helper (train first, then test).
train, test = (
    make_dataset(train_input_list, train_target_list),
    make_dataset(test_input_list, test_target_list),
)

# Rolling-mean feature tables computed from the raw input files.
train_rol, test_rol = (
    moving_average(train_input_list),
    moving_average(test_input_list),
)

In [292]:
# Display the training frame.
# NOTE(review): the saved output below already contains the '...roling' columns,
# which are only concatenated onto `train` in a later cell of this file, so this
# cell was executed out of order; a fresh Run All will show a different frame.
train

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,...,시간당분무량roling,일간누적분무량roling,시간당백색광량roling,일간누적백색광량roling,시간당적색광량roling,일간누적적색광량roling,시간당청색광량roling,일간누적청색광량roling,시간당총광량roling,일간누적총광량roling
0,0,0,25.300000,81.835000,536.016667,1.407439,0.0,0.00,0.0000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000
1,0,1,25.680357,81.264286,528.696429,1.409003,126.0,126.00,0.0000,0.000,...,63.000000,63.000000,0.000000,0.000000,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000
2,0,2,25.273333,81.471666,532.833333,1.406913,0.0,126.00,0.0000,0.000,...,42.000000,84.000000,0.000000,0.000000,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000
3,0,3,25.355000,81.398334,545.566667,1.406689,126.0,252.00,0.0000,0.000,...,63.000000,126.000000,0.000000,0.000000,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000
4,0,4,25.391667,81.483333,558.583333,1.411070,0.0,252.00,0.0000,0.000,...,50.400000,151.200000,0.000000,0.000000,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,27,19,26.030000,58.736667,448.500000,1.195415,0.0,2543.12,12.3764,146722.222,...,139.498333,2180.123333,9654.107683,129401.450200,1464.7738,19649.0952,688.737833,9248.093833,11807.619317,158298.639233
668,27,20,27.341666,58.373334,449.183333,1.190780,126.0,2669.12,0.0000,146722.222,...,160.498333,2340.621667,7524.335517,136925.785717,1139.3298,20788.4250,534.698667,9782.792500,9198.363983,167497.003217
669,27,21,27.785000,58.711667,441.933333,1.185593,0.0,2669.12,0.0000,146722.222,...,118.498333,2459.120000,5394.563350,142320.349067,813.8858,21602.3108,380.659500,10163.452000,6589.108650,174086.111867
670,27,22,28.480000,58.121667,437.600000,1.179664,0.0,2669.12,0.0000,146722.222,...,105.000000,2564.120000,3264.791183,145585.140250,488.4418,22090.7526,226.620333,10390.072333,3979.853317,178065.965183


In [275]:
# Apply the project's time_value step to both frames, then limit_range,
# in the same call order as before (train before test for each step).
train, test = time_value(train), time_value(test)
train, test = limit_range(train), limit_range(test)


In [276]:
# (per-hour column, daily cumulative column) pairs handed to the project's
# col_cumsum / col_cumsum_test helpers.
# NOTE(review): presumably these rebuild the daily column from the hourly one —
# confirm in the feature module.
_cumsum_pairs = [
    ("시간당분무량", "일간누적분무량"),
    ("시간당백색광량", "일간누적백색광량"),
    ("시간당적색광량", "일간누적적색광량"),
    ("시간당청색광량", "일간누적청색광량"),
    ("시간당총광량", "일간누적총광량"),
]

# All train columns first, then all test columns, as in the original cell.
for _hourly_col, _daily_col in _cumsum_pairs:
    train = col_cumsum(train, _hourly_col, _daily_col)

for _hourly_col, _daily_col in _cumsum_pairs:
    test = col_cumsum_test(test, _hourly_col, _daily_col)

# Add the '월간…' running-total columns to both frames in place.
for _suffix in ('누적분무량', '누적백색광량', '누적청색광량', '누적적색광량', '누적총광량'):
    accumulate(train, test, _suffix)

In [277]:
# Fill remaining gaps in both frames with pandas' default interpolation.
train, test = train.interpolate(), test.interpolate()

In [295]:
# Write the current train/test frames to CSV (without the index column).
for _frame, _csv_name in ((train, 'lstm_train.csv'), (test, 'lstm_test.csv')):
    _frame.to_csv(_csv_name, index=False)

In [278]:
# Append the rolling-mean columns side by side with the base features.
# NOTE(review): re-running this cell appends the rolling columns a second time.
train, test = (
    pd.concat([train, train_rol], axis='columns'),
    pd.concat([test, test_rol], axis='columns'),
)

In [279]:
# Target series, and the feature matrix with the target and 'Case' columns removed.
y = train['predicted_weight_g']
X = train.drop(columns=['predicted_weight_g', 'Case'])

In [280]:
# Positional hold-out split: the first 17472 rows (26 blocks of 672 rows) are
# used for fitting and the remaining rows for validation.
x_train, x_val = X.iloc[:17472], X.iloc[17472:]
y_train, y_val = y.iloc[:17472], y.iloc[17472:]

In [285]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# X = train.drop(['predicted_weight_g', 'Case'], axis=1)
# y = train['predicted_weight_g']
# x_test = test.drop(['predicted_weight_g', 'Case'], axis=1)
# model = XGBRegressor()

# model.fit(X, y)
# y_pred = model.predict(x_test)

In [286]:
# Fit CatBoost on the training split with a fixed seed, then score the hold-out split.
model = CatBoostRegressor(random_seed=1103, verbose=False)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

# RMSE is the square root of the mean squared error on the validation rows.
val_mse = mean_squared_error(y_val, y_pred)
rmse = val_mse ** 0.5
print(f"validation rmse: {rmse}")

validation rmse: 18.562222657914223


In [290]:
# Mean of the first 24 validation predictions, to compare with the matching
# mean of the true values in the next cell.
y_pred[:24].mean()

1.3122164512537495

In [291]:
# Mean of the first 24 true validation targets (counterpart of the cell above).
y_val[:24].mean()

0.6569833466895963

In [293]:
# Feature importances of the fitted model, keeping only features with a
# positive score, sorted from least to most important.
df_imp = (
    pd.DataFrame({'imp': model.feature_importances_}, index=model.feature_names_)
    .loc[lambda frame: frame.imp > 0]
    .sort_values('imp')
    .copy()
)
df_imp

Unnamed: 0,imp
시간당백색광량,0.000457
시간당백색광량roling,0.001819
일간누적백색광량,0.003517
일간누적청색광량,0.004897
시간당청색광량,0.005715
일간누적총광량,0.007882
시간당총광량,0.014182
시간당적색광량,0.019174
일간누적분무량roling,0.023589
시간당총광량roling,0.03422


In [225]:
# Predict on the TEST features. In file order `y_pred` holds the validation
# predictions (model.predict(x_val)), which are too short for the daily chunks
# taken below, so reusing it filled the submission with NaN. Selecting the
# columns by model.feature_names_ gives the model the features it was fitted on.
x_test = test[model.feature_names_]
y_pred_test = np.asarray(model.predict(x_test))

# Collapse the row-level predictions into one mean per 24-row block
# (one block per day; obs_time runs 0-23 in the displayed data).
n_days = len(y_pred_test) // 24
result = y_pred_test[:n_days * 24].reshape(n_days, 24).mean(axis=1).tolist()

# One row per day from the project's group_median helper; only DAT and the
# target column are kept. Copy the slice so the assignments below are applied
# to an independent frame (no SettingWithCopyWarning).
test1 = group_median(test)
submit = test1[['DAT', 'predicted_weight_g']].copy()
submit['DAT'] = submit['DAT'] + 1
submit['predicted_weight_g'] = result

# Write 28 daily predictions into each test target file, in sorted file order.
# NOTE: this overwrites the files under ./data/test_target/ in place.
all_target_list = sorted(glob('./data/test_target/*.csv'))
for idx, test_path in enumerate(all_target_list):
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = (
        submit['predicted_weight_g'].iloc[idx * 28:(idx + 1) * 28].to_numpy()
    )
    submit_df.to_csv(test_path, index=False)