In [1]:
import xgboost as xgb
import pickle
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import timeit

In [11]:
from load_data import *
# from train_xgb import *
from generate_set import *
# from xgb_predict import *

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [4]:
# Cached raw training / validation frames, read from the `dataframes` directory.
t_raw, v_raw = (
    pd.read_pickle(os.path.join('dataframes', pickle_name))
    for pickle_name in ('train_raw.pkl', 'validation_raw.pkl')
)
# Raw test-B1 netCDF file, read through the project loader.
test_b1 = load_raw_file(os.path.join('data', 'ai_challenger_wf2018_testb1_20180829-20181028.nc'))
# All three frames stacked row-wise; train_eval_xgb_model reads this as the global `a_raw`.
a_raw = pd.concat([t_raw, v_raw, test_b1])

Loading netCDF4 file data/ai_challenger_wf2018_testb1_20180829-20181028.nc
Loading station 90001
Loading station 90002
Loading station 90003
Loading station 90004
Loading station 90005
Loading station 90006
Loading station 90007
Loading station 90008
Loading station 90009
Loading station 90010


In [9]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.

# Training split: the pickled object is indexed by the keys 'X', 'Y' and 'dates'.
with open(os.path.join('train_data', '1015_1115_2.pkl'), 'rb') as pickle_file:
    restore = pickle.load(pickle_file)
X, Y, dates = (restore[key] for key in ('X', 'Y', 'dates'))

# Validation split: same layout as the training split.
with open(os.path.join('validation_data', '0928_1029_2.pkl'), 'rb') as pickle_file:
    restore_va = pickle.load(pickle_file)
X_va, Y_va, dates_va = (restore_va[key] for key in ('X', 'Y', 'dates'))

In [23]:
# Search the best (max_depth, min_child_weight) pair for every hour / target column.
# NOTE(review): gen_best_parameters and the helpers it calls are defined in cells
# further down this notebook, so those cells must be executed first; a fresh
# top-to-bottom run reaches this line before the definitions exist.
best_parameters = gen_best_parameters(X, Y, X_va, Y_va)

Testing 1th day best parameters


KeyboardInterrupt: 

In [None]:
# Persist the selected parameters so the grid search does not have to be repeated.
parameter_path = os.path.join('parameters', '1015_1115_3.param')
with open(parameter_path, 'wb') as parameter_file:
    pickle.dump(best_parameters, parameter_file)

In [13]:
def add_previous_to_X(X, prediction_list):
    """
    Append earlier predictions to the feature frame as extra columns.

    Every entry of `prediction_list` is re-labelled with `X.index`, i.e. its
    values are matched to the rows of `X` by position, not by label. The
    combined prediction columns are named `prediction_0`, `prediction_1`, ...

    The entries are copied before being re-labelled, so the objects held by
    the caller keep their own index.

    :param X: feature DataFrame.
    :param prediction_list: list of pandas Series / DataFrames, each with as
        many rows as `X`.
    :return: `X` itself when `prediction_list` is empty, otherwise a new
        DataFrame holding the columns of `X` followed by the prediction columns.
    """
    if len(prediction_list) == 0:
        return X

    aligned = []
    for prediction in prediction_list:
        # Work on a copy: assigning `.index` on the original would silently
        # change the caller's object.
        relabelled = prediction.copy()
        relabelled.index = X.index
        aligned.append(relabelled)

    predictions = pd.concat(aligned, axis=1)
    predictions.columns = ['prediction_{}'.format(i) for i in range(predictions.shape[1])]
    return pd.concat([X, predictions], axis=1)

In [21]:
def add_cycle_to_X(X, raw, column, predict_hour, add_days, interval=1):
    """
    Append lagged values of `column` to the feature frame.

    For every row of `X` and every lag of `interval * k` days (k = 1..add_days)
    the value `raw.loc[(station_id, date - lag, predict_hour + 4)][column]` is
    looked up, min-max scaled with the two `scope` bounds listed for `column`
    in `data/explain.csv`, and stored in a column named
    `'{column}_{interval}*{k}days'`.

    The new columns are built on `X.index`, so they stay attached to the right
    rows whatever index `X` carries.

    :param X: feature DataFrame with a `timestamp` column (epoch seconds) and
        the one-hot columns `station_0` .. `station_9`.
    :param raw: DataFrame containing `column`, addressed with
        (station id, date, hour) keys.
    :param column: name of the variable to fetch and scale.
    :param predict_hour: hour offset; `predict_hour + 4` is used as the hour key.
    :param add_days: number of lag columns to add (0 adds none).
    :param interval: spacing between consecutive lags, in days.
    :return: new DataFrame holding the columns of `X` followed by the lag columns.
    :raises KeyError: when `column` is missing from `explain.csv` or a requested
        (station id, date, hour) key is missing from `raw`.
    """
    explain = pd.read_csv(os.path.join('data', 'explain.csv'), index_col=0).dropna(how='any', axis=1)

    # `scope` is stored as text: drop its first and last character, split on ','
    # and read the two pieces as the lower / upper bound.
    scope = explain.loc[column]['scope'][1:-1].split(',')
    min_value = float(scope[0].strip())
    max_value = float(scope[1].strip())

    # Decode the one-hot station columns in one shot: station_i set -> 900<i+1>.
    station_columns = ['station_{}'.format(i) for i in range(10)]
    station_numbers = X[station_columns].values.astype(float).dot(np.arange(1, 11))
    station_list = [int('900%02d' % int(number)) for number in station_numbers]

    # NOTE(review): fromtimestamp() converts with the machine's local time zone,
    # so the 8-hour shift presumably assumes a UTC+8 host - confirm before
    # running this on a machine configured differently.
    origin_dates = pd.Series([pd.Timestamp.fromtimestamp(timestamp) - pd.Timedelta('8 hours')
                              for timestamp in X['timestamp']])

    lag_names = []
    lag_values = {}
    for step in range(1, add_days + 1):
        fetch_dates = origin_dates - pd.Timedelta('{} days'.format(step * interval))
        lag_name = '{}_{}*{}days'.format(column, interval, step)
        lag_names.append(lag_name)
        lag_values[lag_name] = [raw.loc[(station_id, date, predict_hour + 4)][column]
                                for (station_id, date) in zip(station_list, fetch_dates)]

    # Build on X.index so the axis=1 concat below cannot misalign rows.
    result = pd.DataFrame(lag_values, index=X.index, columns=lag_names)
    result = (result - min_value) / (max_value - min_value)
    return pd.concat([X, result], axis=1)

In [22]:
def train_eval_xgb_model(X, Y, X_va, Y_va, column, predict_hour, output=False,
                         max_depths=range(3, 18), min_child_weights=range(0, 12)):
    """
    Grid-search `max_depth` / `min_child_weight` for one (column, hour) XGBoost model.

    Labels are taken with `fetch_labels(predict_hour, column, ...)`; both feature
    frames are extended with 7 daily lag columns of `column` read from the global
    `a_raw` (see `add_cycle_to_X`). For every parameter pair a regressor is fitted
    with early stopping on the validation set, and the RMSE between the
    denormalized validation labels and the denormalized predictions is recorded.

    :param X: training features.
    :param Y: training labels, in the form `fetch_labels` expects.
    :param X_va: validation features.
    :param Y_va: validation labels, in the form `fetch_labels` expects.
    :param column: target column name.
    :param predict_hour: hour offset of the model being tuned.
    :param output: when true, print `max_depth-min_child_weight-rmse` per fit.
    :param max_depths: iterable of `max_depth` values to try.
    :param min_child_weights: iterable of `min_child_weight` values to try.
    :return: DataFrame with the columns `max_depth`, `min_child_weight`, `rmse`,
        one row per fitted parameter pair, in search order.
    """
    y_train = fetch_labels(predict_hour, column, Y)
    y_eval = fetch_labels(predict_hour, column, Y_va)

    X_added = add_cycle_to_X(X, a_raw, column, predict_hour, 7)
    X_va_added = add_cycle_to_X(X_va, a_raw, column, predict_hour, 7)

    # The validation targets do not depend on the hyper-parameters, so they are
    # denormalized once. `denormalize` receives a single-column frame named
    # `column`; values are compared by position with the predictions.
    eval_values = denormalize(pd.DataFrame(y_eval, columns=[column])).iloc[:, 0].values

    rmse_list = []
    for max_depth in max_depths:
        for min_child_weight in min_child_weights:
            # NOTE(review): xgboost documents `alpha` as an alias of `reg_alpha`,
            # so `alpha=10` and `reg_alpha=0` below look contradictory - confirm
            # which of the two the installed version actually applies.
            xg_reg = xgb.XGBRegressor(max_depth=max_depth, learning_rate=0.15, n_estimators=150, silent=True, 
                                      objective='reg:linear', booster='gbtree', n_jobs=47, alpha=10,
                                      gamma=0, min_child_weight=min_child_weight, max_delta_step=0, subsample=0.9, 
                                      colsample_bytree=0.9, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, 
                                      scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None)
            xg_reg.fit(X_added, y_train, 
                       eval_set=[(X_va_added, y_eval)], verbose=False, 
                       eval_metric='rmse', early_stopping_rounds=15)

            prediction = xg_reg.predict(X_va_added)
            predicted_values = denormalize(pd.DataFrame(prediction, columns=[column])).iloc[:, 0].values

            rmse = sqrt(mean_squared_error(eval_values, predicted_values))
            rmse_list.append([max_depth, min_child_weight, rmse])
            if output:
                print('{}-{}-{}'.format(max_depth, min_child_weight, rmse))
    return pd.DataFrame(rmse_list, columns=['max_depth', 'min_child_weight', 'rmse'])

In [16]:
def gen_best_parameters(X, Y, X_va, Y_va, hours=range(0, 33)):
    """
    Pick the lowest-RMSE parameter pair for every (hour, target column).

    For each hour in `hours` and each column of `Y[0]`, the grid returned by
    `train_eval_xgb_model` is sorted by `rmse` and its first row is kept.

    :param X: training features.
    :param Y: training labels; `Y[0].columns` supplies the target column names.
    :param X_va: validation features.
    :param Y_va: validation labels.
    :param hours: iterable of hour offsets to tune; defaults to 0..32. Pass a
        shorter iterable to run only part of the (long) search.
    :return: DataFrame indexed by (hour, column) with the columns of the
        winning grid row (`max_depth`, `min_child_weight`, `rmse`).
    """
    index_list = []
    result_list = []
    for hour in hours:
        print('Testing {}th day best parameters'.format(hour+1))
        for name in Y[0].columns:
            grid = train_eval_xgb_model(X, Y, X_va, Y_va, name, hour, False)
            index_list.append((hour, name))
            result_list.append(grid.sort_values(by='rmse').iloc[0])
    return pd.DataFrame(result_list, index=index_list)