In [1]:
import xgboost as xgb
import pickle
import pandas as pd
import numpy as np
import os

In [2]:
from load_data import *
# from train_xgb import *
from generate_set import *
# from xgb_predict import *

In [15]:
# Training set: unpickle the bundle and pull out its 'X', 'Y' and 'dates' entries.
with open(os.path.join('train_data', '1015_1115_2.pkl'), 'rb') as pickle_file:
    restore = pickle.load(pickle_file)
X, Y, dates = restore['X'], restore['Y'], restore['dates']

# Validation set: same bundle layout, read from the validation directory.
with open(os.path.join('validation_data', '1001_1030_2.pkl'), 'rb') as pickle_file:
    restore_va = pickle.load(pickle_file)
X_va, Y_va, dates_va = restore_va['X'], restore_va['Y'], restore_va['dates']

# Raw frames: two cached pickles plus the test file parsed by load_raw_file,
# stacked row-wise into a single lookup frame (a_raw).
t_raw, v_raw = (
    pd.read_pickle(os.path.join('dataframes', raw_name))
    for raw_name in ('train_raw.pkl', 'validation_raw.pkl')
)
test_b1 = load_raw_file(os.path.join('data', 'ai_challenger_wf2018_testb4_20180829-20181031.nc'))
a_raw = pd.concat([t_raw, v_raw, test_b1])

Loading netCDF4 file data/ai_challenger_wf2018_testb4_20180829-20181031.nc
Loading station 90001
Loading station 90002
Loading station 90003
Loading station 90004
Loading station 90005
Loading station 90006
Loading station 90007
Loading station 90008
Loading station 90009
Loading station 90010


In [22]:
# Current Best: 0915_1015_1
# Unpickle the per-model hyper-parameter table that train_all_xgb reads via `params`.
with open(os.path.join('parameters', '1015_1115_3.param'), mode='rb') as file:
    parameters = pickle.load(file)

In [23]:
# Alternative output prefix kept for reference:
# train_all_xgb(X, Y, X_va, Y_va, os.path.join('models', '1001_1101_2', '1013'), params=parameters)
# NOTE: train_all_xgb and add_cycle_to_X are defined in cells further down this
# notebook; those cells must have been executed before this one.
train_all_xgb(
    X, Y, X_va, Y_va,
    os.path.join('models', '1015_1115_2', '4'),
    params=parameters,
)

Training 1th day models
Training 2th day models
Training 3th day models
Training 4th day models
Training 5th day models
Training 6th day models
Training 7th day models
Training 8th day models
Training 9th day models
Training 10th day models
Training 11th day models
Training 12th day models
Training 13th day models
Training 14th day models
Training 15th day models
Training 16th day models
Training 17th day models
Training 18th day models
Training 19th day models
Training 20th day models
Training 21th day models
Training 22th day models
Training 23th day models
Training 24th day models
Training 25th day models
Training 26th day models
Training 27th day models
Training 28th day models
Training 29th day models
Training 30th day models
Training 31th day models
Training 32th day models
Training 33th day models


In [13]:
def add_cycle_to_X(X, raw, column, predict_hour, add_days, interval=1):
    """
    Append lagged, min-max normalized values of `column` to the feature frame `X`.

    For every row of `X` and every lag ``k = 1 .. add_days`` the value of
    `column` is looked up in `raw` at
    ``(station_id, row_date - k * interval days, predict_hour + 4)``, scaled
    with the bounds stored for `column` in ``data/explain.csv`` and appended
    as a new column named ``'{column}_{interval}*{k}days'``.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain the one-hot columns ``station_0`` .. ``station_9`` and a
        ``timestamp`` column of POSIX timestamps. Any index is accepted; the
        new columns are aligned to it.
    raw : pd.DataFrame
        Frame queried with ``raw.loc[(station_id, date, hour)][column]``.
    column : str
        Column of `raw` to fetch; must also be a row label of ``explain.csv``.
    predict_hour : int
        Offset of the predicted step; the lookup uses ``predict_hour + 4``.
    add_days : int
        Number of lagged columns to add. ``0`` returns `X` with no additions.
    interval : int, optional
        Spacing, in days, between consecutive lags.

    Returns
    -------
    pd.DataFrame
        A new frame holding the columns of `X` followed by the lagged columns.
        `X` itself is not modified.

    Raises
    ------
    KeyError
        If a requested ``(station_id, date, hour)`` key is missing from `raw`,
        or `column` has no row in ``explain.csv``.
    """
    # The 'scope' cell is a bracketed "min, max" string; strip the brackets and parse both bounds.
    explain = pd.read_csv(os.path.join('data', 'explain.csv'), index_col=0).dropna(how='any', axis=1)
    scope = explain.loc[column]['scope'][1:-1].split(',')
    min_value = float(scope[0].strip())
    max_value = float(scope[1].strip())

    # Decode the one-hot station columns for all rows at once: station_i -> number i + 1,
    # which is then formatted into the id '900NN'.
    station_columns = ['station_{}'.format(i) for i in range(10)]
    station_numbers = (X[station_columns].values * np.arange(1, 11)).sum(axis=1)
    station_list = [int('900%02d' % int(number)) for number in station_numbers]

    # NOTE(review): fromtimestamp() converts using the machine's local timezone, so the
    # fixed 8-hour correction presumably assumes a UTC+8 host -- confirm before running elsewhere.
    origin_dates = pd.Series([pd.Timestamp.fromtimestamp(timestamp) - pd.Timedelta('8 hours')
                              for timestamp in X['timestamp']])

    hour = predict_hour + 4
    lag_names = []
    lag_values = {}
    for step in range(1, add_days + 1):
        fetch_dates = origin_dates - pd.Timedelta('{} days'.format(step * interval))
        lag_name = '{}_{}*{}days'.format(column, interval, step)
        lag_names.append(lag_name)
        lag_values[lag_name] = [raw.loc[(station_id, date, hour)][column]
                                for (station_id, date) in zip(station_list, fetch_dates)]

    # Build the lagged frame on X's own index so the column-wise concat below lines up
    # row for row even when X does not carry a default RangeIndex.
    lagged = pd.DataFrame(lag_values, index=X.index, columns=lag_names)
    result = (lagged - min_value) / (max_value - min_value)
    return pd.concat([X, result], axis=1)

In [14]:
def train_all_xgb(X, Y, X_va, Y_va, dir_prefix, params, start=0, end=None, raw=None):
    """
    Train one XGBoost regressor per (step, target column) and pickle each model.

    For every step ``i`` in ``range(start, end)`` and every column of ``Y[0]``,
    the training and validation features are extended with 7 lagged values of
    that column (see ``add_cycle_to_X``), a regressor is fitted with early
    stopping on the validation set, and the fitted model is written to
    ``'{dir_prefix}_{column}_{i}.model'``.

    Parameters
    ----------
    X, X_va : pd.DataFrame
        Training / validation features, passed to ``add_cycle_to_X``.
    Y, Y_va : sequence
        Training / validation labels, passed to ``fetch_labels``. ``Y[0]`` must
        expose ``.columns`` (the targets) and ``.shape[0]`` (default step count).
    dir_prefix : str
        Path prefix for the pickled model files.
    params : pd.DataFrame
        Hyper-parameter table read positionally; row ``i * 3 + j`` supplies
        ``max_depth`` and ``min_child_weight`` for step ``i`` and target ``j``.
    start : int, optional
        First step to train (inclusive).
    end : int or None, optional
        Last step to train (exclusive); defaults to ``Y[0].shape[0]``.
    raw : pd.DataFrame or None, optional
        Frame the lagged features are read from. Defaults to the notebook-level
        ``a_raw`` built in the data-loading cell.

    Returns
    -------
    None
        Models are written to disk; nothing is returned.
    """
    if end is None:
        end = Y[0].shape[0]
    if raw is None:
        # Keep the original behavior: fall back to the notebook-level raw frame.
        raw = a_raw

    target_columns = Y[0].columns
    for i in range(start, end):
        print('Training {}th day models'.format(i + 1))
        for j, column in enumerate(target_columns):
            X_added = add_cycle_to_X(X, raw, column, i, 7)
            X_va_added = add_cycle_to_X(X_va, raw, column, i, 7)

            y_train = fetch_labels(i, column, Y)
            y_validation = fetch_labels(i, column, Y_va)

            # NOTE(review): the stride of 3 presumably equals the number of target
            # columns -- confirm against how the parameter table was generated.
            model_params = params.iloc[i*3+j]
            max_depth = int(model_params['max_depth'])
            min_child_weight = int(model_params['min_child_weight'])

            # NOTE(review): both `alpha=10` and `reg_alpha=0` are passed; XGBoost documents
            # `alpha` as an alias of `reg_alpha` -- confirm which value is intended.
            xg_reg = xgb.XGBRegressor(max_depth=max_depth, learning_rate=0.15, n_estimators=150, silent=True,
                                      objective='reg:linear', booster='gbtree', n_jobs=47, alpha=10,
                                      gamma=0, min_child_weight=min_child_weight, max_delta_step=0, subsample=0.9,
                                      colsample_bytree=0.9, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                                      scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None)
            xg_reg.fit(X_added, y_train,
                       eval_set=[(X_va_added, y_validation)], verbose=False,
                       eval_metric='rmse', early_stopping_rounds=15)

            # Save model to file; the context manager closes the handle even if dumping fails.
            with open('{}_{}_{}.model'.format(dir_prefix, column, i), 'wb') as model_file:
                pickle.dump(xg_reg, model_file)