In [12]:
from feature.make_dataset import make_data
from feature.after_dataset import accumulate
from feature.moving_average import moving_average
import pandas as pd
from glob import glob
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')


In [13]:
# Glob patterns locating the per-case CSV files; each split has an
# "input" directory and a matching "target" directory.
train_input_list, train_target_list = (
    './data/train_input/*.csv',
    './data/train_target/*.csv',
)
test_input_list, test_target_list = (
    './data/test_input/*.csv',
    './data/test_target/*.csv',
)

In [14]:
# Build the modelling tables from the four glob patterns. make_data hands
# back four objects; only `train` and `test` are used by the cells shown here.
train, test, p_train, p_test = make_data(
    train_input_list,
    train_target_list,
    test_input_list,
    test_target_list,
)

# Extra features from the project's moving_average helper.
# NOTE(review): the second argument (6) is presumably the window length -- confirm.
train_rol, test_rol = (
    moving_average(pattern, 6)
    for pattern in (train_input_list, test_input_list)
)

In [15]:
# Give all four frames a fresh 0..n-1 index. Without drop=True the previous
# index is kept as a regular column, which a later cell removes again.
train, test = train.reset_index(), test.reset_index()
train_rol, test_rol = train_rol.reset_index(), test_rol.reset_index()

In [16]:
# Run the project's accumulate helper once per cumulative column, in the
# same order as before. The return value is not used here.
# NOTE(review): presumably accumulate mutates train/test in place -- confirm.
for cumulative_col in ('누적분무량', '누적백색광량', '누적청색광량', '누적적색광량', '누적총광량'):
    accumulate(train, test, cumulative_col)

In [17]:
# Attach the moving-average columns side by side with the base features.
# Column-wise concat aligns on the index, which both frames had reset earlier.
train_parts = [train, train_rol]
test_parts = [test, test_rol]
train = pd.concat(train_parts, axis='columns')
test = pd.concat(test_parts, axis='columns')

In [18]:
# Columns removed before modelling: the daily-cumulative measurements plus the
# 'index' column(s) left behind by reset_index().
drop_feature = [
    '일간누적분무량',
    '일간누적백색광량',
    '일간누적적색광량',
    '일간누적청색광량',
    '일간누적총광량',
    'index',
]
# Alternative column set kept for reference (hourly columns instead of daily):
# drop_feature = ['시간당분무량', '시간당백색광량', '시간당적색광량', '시간당청색광량', '시간당총광량', 'index']
train = train.drop(columns=drop_feature)
test = test.drop(columns=drop_feature)


In [19]:
# 'Case' and 'obs_time' are dropped from both splits so they are not fed to the models.
identifier_cols = ['Case', 'obs_time']
train = train.drop(columns=identifier_cols)
test = test.drop(columns=identifier_cols)

In [20]:
# Separate the target column from the feature matrix.
y = train['predicted_weight_g']
X = train.drop(columns=['predicted_weight_g'])

# Ordered (unshuffled) holdout: the first 728 rows train, the remainder validates.
split_at = 728
x_train, x_val = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_val = y.iloc[:split_at], y.iloc[split_at:]

# Baseline CatBoost model; also used by the next cell for feature importances.
cat = CatBoostRegressor(n_estimators=300, verbose=False)
cat.fit(x_train, y_train)

# RMSE on the holdout rows, shown as the cell's output.
y_pred = cat.predict(x_val)
rmse = mean_squared_error(y_val, y_pred) ** 0.5
rmse

19.26514105219166

In [21]:
# Tabulate the fitted CatBoost model's feature importances, keep only the
# features with a positive score, sort ascending and save them to 'zz.csv'.
importance = pd.Series(cat.feature_importances_, index=cat.feature_names_, name='imp')
df_imp = importance.to_frame()
df_imp = df_imp.loc[df_imp['imp'] > 0].sort_values(by='imp').copy()
df_imp.to_csv('zz.csv')

In [21]:
from sklearn.model_selection import StratifiedKFold, KFold

# Number of cross-validation folds; also the number of models averaged at test time.
N_SPLITS = 5
# Number of consecutive prediction rows written into each test-target file.
# NOTE(review): assumes every test-target CSV has exactly 28 rows -- confirm.
ROWS_PER_CASE = 28

X = train.drop(['predicted_weight_g'], axis=1)
y = train['predicted_weight_g']

# Plain shuffled K-fold (the name `skf` is kept for any later cell that uses it).
skf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=404)

# Materialise the (train_idx, val_idx) pairs once so they can be reused.
folds = list(skf.split(X, y))

# Fitted model per fold, keyed by the 0-based fold number.
XGB_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(f'===================================={f+1}============================================')

    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb = XGBRegressor(n_estimators=5000, learning_rate=0.05, subsample=0.5,
                       max_depth=6, gamma=500, reg_lambda=500, colsample_bytree=0.5, random_state=42)
    xgb.fit(x_train, y_train)

    y_pred = xgb.predict(x_val)
    rmse = mean_squared_error(y_val, y_pred)**0.5
    print(f"{f + 1} Fold RMSE = {rmse}")
    XGB_model[f] = xgb
    print(f'================================================================================\n\n')


# Work on an explicit copy so the assignment below neither touches `test` nor
# relies on chained assignment into a slice.
submit = test[['DAT', 'predicted_weight_g']].copy()
x_test = test.drop(['predicted_weight_g'], axis=1)

# Overwrite (rather than add to) the target column with the mean of the fold
# models' predictions. The files written below are the same ones the test
# targets are read from, so accumulating with `+=` would stack new predictions
# on top of the previous run's output whenever the notebook is re-executed.
submit['predicted_weight_g'] = sum(
    XGB_model[fold].predict(x_test) for fold in range(N_SPLITS)
) / N_SPLITS

# `glob` is imported as a function (`from glob import glob`), so it is called
# directly; `glob.glob(...)` would raise AttributeError.
all_target_list = sorted(glob('./data/test_target/*.csv'))
for idx, test_path in enumerate(all_target_list):
    start = idx * ROWS_PER_CASE
    submit_df = pd.read_csv(test_path)
    # Each file receives its own consecutive block of predictions, in sorted path order.
    submit_df['predicted_weight_g'] = submit['predicted_weight_g'].iloc[start:start + ROWS_PER_CASE].values
    submit_df.to_csv(test_path, index=False)

1 Fold RMSE = 13.111832882967468


2 Fold RMSE = 13.541611509450577


3 Fold RMSE = 10.897795076565597


4 Fold RMSE = 13.059043829494


5 Fold RMSE = 14.043638278961364


