In [1]:
from feature.make_dataset import make_data
from feature.after_dataset import accumulate
from feature.moving_average import moving_average
import pandas as pd
from glob import glob
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import glob
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')


In [2]:
train_input_list = './data/train_input/*.csv'
train_target_list = './data/train_target/*.csv'

test_input_list = './data/test_input/*.csv'
test_target_list = './data/test_target/*.csv'

In [3]:
train, test, p_train, p_test = make_data(train_input_list, train_target_list, test_input_list, test_target_list)
train_rol = moving_average(train_input_list)
test_rol = moving_average(test_input_list)

In [4]:
train = train.reset_index()
test = test.reset_index()
train_rol = train_rol.reset_index()
test_rol = test_rol.reset_index()

In [5]:
accumulate(train, test, '누적분무량')
accumulate(train, test, '누적백색광량')
accumulate(train, test, '누적청색광량')
accumulate(train, test, '누적적색광량')
accumulate(train, test, '누적총광량')

In [6]:
train = pd.concat([train, train_rol], axis=1)
test = pd.concat([test, test_rol], axis=1)

In [7]:
drop_feature = ['시간당분무량', '시간당백색광량', '시간당적색광량', '시간당청색광량', '시간당총광량', 'index']
train = train.drop(drop_feature, axis=1)
test = test.drop(drop_feature, axis=1)


In [12]:
X = train.drop(['predicted_weight_g', 'Case', 'obs_time'], axis=1)
y = train['predicted_weight_g']
from sklearn.model_selection import StratifiedKFold, KFold

skf = KFold(n_splits=5, shuffle=True, random_state=404)

folds = []

for train_idx, val_idx in skf.split(X, y):
    folds.append((train_idx, val_idx))

CAT_model = {}

for f in range(5):
    print(f'===================================={f+1}============================================')
    train_idx, val_idx = folds[f]

    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

    CAT = CatBoostRegressor(verbose=500, early_stopping_rounds=50, task_type="GPU", random_seed=113, n_estimators=2000)
    CAT.fit(x_train, y_train)

    y_pred = CAT.predict(x_val)
    rmse = mean_squared_error(y_val, y_pred)**0.5
    print(f"{f + 1} Fold RMSE = {rmse}")
    CAT_model[f] = CAT
    print(f'================================================================================\n\n')


submit = test[['DAT', 'predicted_weight_g']]
x_test = test.drop(['predicted_weight_g', 'Case', 'obs_time'], axis=1)
for fold in range(5):
    submit['predicted_weight_g'] += CAT_model[fold].predict(x_test)/5
all_target_list = sorted(glob.glob('./data/test_target/*.csv'))
for idx, test_path in enumerate(all_target_list):
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = submit['predicted_weight_g'][idx*28:idx*28+28].values
    submit_df.to_csv(test_path, index=False)

Learning rate set to 0.025121
0:	learn: 42.0175521	total: 10.6ms	remaining: 21.2s
500:	learn: 4.6567534	total: 4.89s	remaining: 14.6s
1000:	learn: 3.3889351	total: 9.85s	remaining: 9.83s
1500:	learn: 2.9994725	total: 14.9s	remaining: 4.94s
1999:	learn: 2.8028132	total: 19.9s	remaining: 0us
1 Fold RMSE = 7.577207463172634


Learning rate set to 0.025121
0:	learn: 40.4152822	total: 10.7ms	remaining: 21.4s
500:	learn: 5.5543378	total: 5.01s	remaining: 15s
1000:	learn: 4.3395235	total: 9.98s	remaining: 9.96s
1500:	learn: 4.1372324	total: 14.4s	remaining: 4.78s
1999:	learn: 3.7719270	total: 19s	remaining: 0us
2 Fold RMSE = 8.048566247752781


Learning rate set to 0.025121
0:	learn: 41.5895922	total: 10.6ms	remaining: 21.3s
500:	learn: 4.7756357	total: 4.94s	remaining: 14.8s
1000:	learn: 4.1493631	total: 9.77s	remaining: 9.74s
1500:	learn: 3.5345180	total: 14.7s	remaining: 4.9s
1999:	learn: 3.3261667	total: 19.5s	remaining: 0us
3 Fold RMSE = 6.536041654471992


Learning rate set to 0.025121


In [13]:
df_imp = pd.DataFrame({'imp':CAT.feature_importances_}, index = CAT.feature_names_)
df_imp = df_imp[df_imp.imp > 0].sort_values('imp').copy()
df_imp

Unnamed: 0,imp
"('일간누적분무량', 11)",0.000133
"('ec관측치', 2)",0.000420
"('ec관측치', 16)",0.000580
"('일간누적분무량', 10)",0.001471
"('일간누적총광량', 4)",0.002754
...,...
"('내부온도관측치', 17)",2.091089
월간누적백색광량,10.673972
월간누적적색광량,10.858394
월간누적분무량,19.234657


In [11]:
df_imp.to_csv('zz.csv')