In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import date, timedelta
from tqdm import tqdm
%matplotlib inline
sns.set_style('dark')

In [2]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
train.columns = ['timestamp', 'year', 'month', 'day', 'hour', 'minute', 'second', 'temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in', 'temp_in']
test.columns = ['timestamp', 'year', 'month', 'day', 'hour', 'minute', 'second', 'temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
train['time'] = pd.to_datetime(train[['year', 'month', 'day', 'hour', 'minute']])
test['time'] = pd.to_datetime(test[['year', 'month', 'day', 'hour', 'minute']])

前10天：3.14 01:00 ~ 3.24 00:59  
中间10天：3.24 1:00 ~ 4.3 00:59  
后10天：4.3 00:59 ~ 4.13 00:59

# Data Cleaning

In [3]:
train = train.loc[train.temp_in.notnull()]

## Interpolation

In [None]:
date = {'year': [], 'month': [], 'day': [], 'hour': [], 'minute': []}
for day in range(14, 32):
    for hour in range(0, 24):
        for minute in range(0, 60):
            date['year'].append(2019)
            date['month'].append(3)
            date['day'].append(day)
            date['hour'].append(hour)
            date['minute'].append(minute)
            
for day in range(1, 14):
    for hour in range(0, 24):
        for minute in range(0, 60):
            date['year'].append(2019)
            date['month'].append(4)
            date['day'].append(day)
            date['hour'].append(hour)
            date['minute'].append(minute)

full_date = pd.DataFrame(date)
full_date['time'] = pd.to_datetime(full_date[['year', 'month', 'day', 'hour', 'minute']])
full_train = full_date.loc[(full_date.time >= '2019-3-14 01') & (full_date.time < '2019-4-3 01'), :]

train = pd.merge(full_train, train, on=['year', 'month', 'day', 'hour', 'minute', 'time'], how='left')
train.drop_duplicates(['month', 'day', 'hour', 'minute'], inplace=True)
train.reset_index(inplace=True, drop=True)

## Fill NaN

In [4]:
# train
# 顺序：前（后）一分钟，前（后）一小时，前（后）一天，上个非空值
def fill_na(row, col):
    if pd.isna(row[col]):
        # 先看看前一分钟有没有
        pre_min = row.time - pd.Timedelta('1 minute')
        next_min = row.time + pd.Timedelta('1 minute')
        pre_hour = row.time - pd.Timedelta('1 hour')
        next_hour = row.time + pd.Timedelta('1 hour')
        pre_day = row.time - pd.Timedelta('1 day')
        next_day = row.time + pd.Timedelta('1 day')
        times = [pre_min, next_min, pre_hour, next_hour, pre_day, next_day]
        for time in times:
            tmp = train.loc[train.time == time]
            if len(tmp) != 0 and pd.notna(tmp[col].values[0]):
                return tmp[col].values[0]
        tmp = train.loc[(train.time < row.time) & (pd.notna(train[col]))]
        return pre[col].values[0]
    return row[col]

train.drop_duplicates(['month', 'day', 'hour', 'minute'], inplace=True)

for feat in tqdm(['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in', 'temp_in']):
    train[feat] = train.apply(fill_na, axis=1, args=(feat,))

train['target'] = train['temp_in'] - train['temp_out']

100%|██████████| 6/6 [00:08<00:00,  1.48s/it]


In [5]:
# test
def avg_pre_next(row, col):
    if pd.isna(row[col]):
        pre_val = test.loc[test.time < row.time].iloc[-1][col]
        next_val = test.loc[test.time > row.time].iloc[0][col]
        return (pre_val + next_val) / 2
    return row[col]

for feat in tqdm(['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']):
    test[feat] = test.apply(avg_pre_next, axis=1, args=(feat,))

100%|██████████| 5/5 [00:00<00:00, 34.94it/s]


## Correct Outliers

### air

In [None]:
fig = plt.figure(figsize=(18,10))
plt.subplot(2,2,1)
sns.boxplot(x=train.air_in)
plt.title('train air_in')
plt.subplot(2,2,2)
sns.boxplot(x=train.air_out)
plt.title('train air_out')
plt.subplot(2,2,3)
sns.boxplot(x=test.air_in)
plt.title('test air_in')
plt.subplot(2,2,4)
sns.boxplot(x=test.air_out)
plt.title('test air_out')

In [None]:
def exponential_smoothing(series, alpha):
    """
        series - dataset with timestamps
        alpha - float [0.0, 1.0], smoothing parameter
    """
    result = [series[0]] # first value is same as series
    for n in range(1, len(series)):
        result.append(alpha * series[n] + (1 - alpha) * result[n-1])
    return result

In [None]:
alpha = 0.1
train['air_out'] = exponential_smoothing(train.air_out.reset_index(drop=True), alpha)
train['air_in'] = exponential_smoothing(train.air_in.reset_index(drop=True), alpha)

In [None]:
# 用上一分钟和下一分钟的平均值代替异常值
def correct_outlier(row, col, low, high, df):
    if row[col] < low or row[col] > high:
        time = row.time
        pre_val = df.loc[(df.time < time) & (df[col] >= low) & (df[col] <= high)]
        pre_val = pre_val.iloc[-1][col]
        next_val = df.loc[(df.time > time) & (df[col] >= low) & (df[col] <= high)]
        next_val = next_val.iloc[0][col]
        return (pre_val + next_val) / 2
    return row[col]

train.air_in = train.apply(correct_outlier, axis=1, args=('air_in', 965, 1000, train))
test.air_in = test.apply(correct_outlier, axis=1, args=('air_in', 500, 1000, test))
train.air_out = train.apply(correct_outlier, axis=1, args=('air_out', 965, 1000, train))
test.air_out = test.apply(correct_outlier, axis=1, args=('air_out', 960, 1000, test))

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].air_in)
plt.title('3.14–3.24 air_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].air_out)
plt.title('3.14–3.24 air_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].air_in)
plt.title('3.24-4.3 air_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].air_out)
plt.title('3.24–4.3 air_out')
plt.subplot(3, 2, 5)
sns.lineplot(data=test.air_in)
plt.title('4.3-4.13 air_in')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.air_out)
plt.title('4.3–4.13 air_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].air_in, shade=True, label='3.14–3.24 air_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].air_in, shade=True, label='3.24–4.3 air_in')
sns.kdeplot(data=test.air_in, shade=True, label='4.3-4.13 air_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].air_out, shade=True, label='3.14–3.24 air_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].air_out, shade=True, label='3.24–4.3 air_out')
sns.kdeplot(data=test.air_out, shade=True, label='4.3–4.13 air_out')

### hum

In [None]:
fig = plt.figure(figsize=(18,10))
plt.subplot(2,2,1)
sns.boxplot(x=train.hum_in)
plt.title('train hum_in')
plt.subplot(2,2,2)
sns.boxplot(x=train.hum_out)
plt.title('train hum_out')
plt.subplot(2,2,3)
sns.boxplot(x=test.hum_in)
plt.title('test hum_in')
plt.subplot(2,2,4)
sns.boxplot(x=test.hum_out)
plt.title('test hum_out')

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].hum_in)
plt.title('3.14–3.24 hum_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].hum_out)
plt.title('3.14–3.24 hum_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].hum_in)
plt.title('3.24-4.3 hum_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].hum_out)
plt.title('3.24–4.3 hum_out')
plt.subplot(3, 2, 5)
sns.lineplot(data=test.hum_in)
plt.title('4.3-4.13 hum_in')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.hum_out)
plt.title('4.3–4.13 hum_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].hum_in, shade=True, label='3.14–3.24 hum_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].hum_in, shade=True, label='3.24–4.3 hum_in')
sns.kdeplot(data=test.hum_in, shade=True, label='4.3-4.13 hum_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].hum_out, shade=True, label='3.14–3.24 hum_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].hum_out, shade=True, label='3.24–4.3 hum_out')
sns.kdeplot(data=test.hum_out, shade=True, label='4.3–4.13 hum_out')

### temp

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].temp_in)
plt.title('3.14–3.24 temp_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].temp_out)
plt.title('3.14–3.24 temp_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].temp_in)
plt.title('3.24-4.3 temp_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].temp_out)
plt.title('3.24–4.3 temp_out')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.temp_out)
plt.title('4.3–4.13 temp_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].temp_in, shade=True, label='3.14–3.24 temp_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].temp_in, shade=True, label='3.24–4.3 temp_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].temp_out, shade=True, label='3.14–3.24 temp_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].temp_out, shade=True, label='3.24–4.3 temp_out')
sns.kdeplot(data=test.temp_out, shade=True, label='4.3–4.13 temp_out')

# Feature Engineering

In [10]:
matrix = pd.concat([train, test], axis=0, ignore_index=True)

## Lag features

In [11]:
# 基本聚合特征
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
group_feats = []
for f in tqdm(features):
    matrix['MDH_{}_medi'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('median')
    matrix['MDH_{}_mean'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('mean')
    matrix['MDH_{}_max'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('max')
    matrix['MDH_{}_min'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('min')
    matrix['MDH_{}_std'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('std')
#     matrix['DH_{}_medi'.format(f)] = matrix.groupby(['day','hour'])[f].transform('median')
#     matrix['DH_{}_mean'.format(f)] = matrix.groupby(['day','hour'])[f].transform('mean')
#     matrix['DH_{}_max'.format(f)] = matrix.groupby(['day','hour'])[f].transform('max')
#     matrix['DH_{}_min'.format(f)] = matrix.groupby(['day','hour'])[f].transform('min')
#     matrix['DH_{}_std'.format(f)] = matrix.groupby(['day','hour'])[f].transform('std')

    group_feats.append('MDH_{}_medi'.format(f))
    group_feats.append('MDH_{}_mean'.format(f))
#     group_feats.append('DH_{}_medi'.format(f))
#     group_feats.append('DH_{}_mean'.format(f))

100%|██████████| 5/5 [00:00<00:00, 13.41it/s]


In [12]:
# 基本交叉特征
for f1 in tqdm(features + group_feats):
    for f2 in features + group_feats:
        if f1 != f2:
            colname = '{}_{}_ratio'.format(f1, f2)
            matrix[colname] = matrix[f1].values / matrix[f2].values

matrix = matrix.fillna(method='bfill')

100%|██████████| 15/15 [00:00<00:00, 50.29it/s]


In [13]:
# 历史信息提取
matrix['dt'] = matrix['day'].values + (matrix['month'].values - 3) * 31

features = features + ['temp_in']
for f in features:
    tmp_df = pd.DataFrame()
    for t in tqdm(range(15, 45)):
        tmp = matrix[matrix['dt'] < t].groupby(['hour'])[f].agg({'mean'}).reset_index()
        tmp.columns = ['hour', 'hit_{}_mean'.format(f)]
        tmp['dt'] = t
        tmp_df = tmp_df.append(tmp)
    
    matrix = matrix.merge(tmp_df, on=['dt', 'hour'], how='left')
    
matrix = matrix.fillna(method='bfill')

100%|██████████| 30/30 [00:01<00:00, 28.47it/s]
100%|██████████| 30/30 [00:01<00:00, 26.64it/s]
100%|██████████| 30/30 [00:01<00:00, 28.09it/s]
100%|██████████| 30/30 [00:01<00:00, 27.35it/s]
100%|██████████| 30/30 [00:01<00:00, 26.10it/s]
100%|██████████| 30/30 [00:01<00:00, 28.30it/s]


In [None]:
# lag features
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
lags = [1, 24]

for l in tqdm(lags):
    tmp = matrix[features+['time']].copy()
    tmp.time += pd.Timedelta(str(l) + ' hour')
    col_names = ['lag_'+str(l)+'_'+f for f in features]
    tmp.columns = col_names + ['time']
    matrix = pd.merge(matrix, tmp, on='time', how='left')
    del tmp

matrix.fillna(method='ffill', inplace=True)

In [None]:
# Trend
for feat in features:
    for n in lags:
        matrix['trend_'+str(n)+'_'+feat] = matrix[feat] - matrix['lag_'+str(n)+'_'+feat]

matrix.fillna(method='ffill', inplace=True)

## Bin features

In [14]:
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
for f in features:
    matrix[f+'_20_bin'] = pd.cut(matrix[f], 20, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_50_bin'] = pd.cut(matrix[f], 50, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_100_bin'] = pd.cut(matrix[f], 100, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_200_bin'] = pd.cut(matrix[f], 200, duplicates='drop').apply(lambda x:x.left).astype(int)
    
features_20_bin = [f + '_20_bin' for f in features]
for f1 in tqdm(features_20_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_50_bin = [f + '_50_bin' for f in features]
for f1 in tqdm(features_50_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_100_bin = [f + '_100_bin' for f in features]
for f1 in tqdm(features_100_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_200_bin = [f + '_200_bin' for f in features]
for f1 in tqdm(features_200_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

100%|██████████| 5/5 [00:08<00:00,  1.78s/it]
100%|██████████| 5/5 [00:12<00:00,  2.46s/it]
100%|██████████| 5/5 [00:14<00:00,  2.89s/it]
100%|██████████| 5/5 [00:17<00:00,  3.50s/it]


## Rolling features

In [15]:
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
lags = [1,3,6,12,24]
matrix.set_index('time', inplace=True)

for feat in tqdm(features):
    for l in lags:
        lag_hour = str(l) + 'h'
        matrix['mean_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).mean()
        matrix['median_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).median()
        matrix['max_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).max()
        matrix['min_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).min()
        matrix['std_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).std()
        matrix['skew_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).skew()
        matrix['q1_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).quantile(quantile=0.25)
        matrix['q3_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).median(quantile=0.75)
        matrix['var_'+str(l)+'_hours_'+feat] = matrix['std_'+str(l)+'_hours_'+feat] / matrix['mean_'+str(l)+'_hours_'+feat]
        
matrix.reset_index(inplace=True)

100%|██████████| 5/5 [00:02<00:00,  1.94it/s]


## Save data

In [16]:
matrix.to_pickle('data.pkl')

# Predict trend with linear model

In [17]:
# from sklearn.linear_model import LinearRegression

# data = pd.read_pickle('data.pkl')
# features = ['month', 'day', 'hour', 'minute', 'temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
# num = len(train)
# X_train = data[:num][features]
# y_train = data[:num]['temp_in']
# X_test = data[num:][features]
# reg = LinearRegression().fit(X_train, y_train)

In [18]:
# pred_train_lr = reg.predict(X_train)
# pred_test_lr = reg.predict(X_test)

# Modeling

In [19]:
data = pd.read_pickle('data.pkl')

In [20]:
features_to_drop = ['timestamp', 'year', 'second', 'time', 'target', 'temp_in']

num = int(len(train)*0.8)
X_train = data.iloc[:num].drop(features_to_drop, axis=1)
y_train = data.iloc[:num]['target']
# y_train = data.iloc[:num]['temp_in'] - pred_train_lr[:num]
X_val = data.iloc[num:len(train)].drop(features_to_drop, axis=1)
y_val = data.iloc[num:len(train)]['target']
# y_val = data.iloc[num:len(train)]['temp_in'] - pred_train_lr[num:len(train)]
X_test = data.loc[data.time >= '2019-4-3 01'].drop(features_to_drop, axis=1)

## XGBoost

In [21]:
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance

In [22]:
# # grid search

# params = {
#     'eta': [0.05, 0.1, 0.2],
#     'max_depth': [7,8,9],
#     'colsample_bytree': [0.6,0.7],
#     'subsample': [0.6,0.7],
#     'min_child_weight': [4,5,6]
# }

# best_score, best_param = 100, None

# for i, p in enumerate(ParameterGrid(params)):
#     model = XGBRegressor(max_depth=9,
#                          n_estimators=100,
#                          min_child_weight=0.5, 
#                          colsample_bytree=0.6, 
#                          subsample=0.6, 
#                          eta=0.1,
#                          seed=10)
#     model.set_params(**p)
#     model.fit(X_train, 
#               y_train, 
#               eval_metric='mae', 
#               eval_set=[(X_train, y_train), (X_val, y_val)], 
#               verbose=False, 
#               early_stopping_rounds=20)
#     pre_val = model.predict(X_val)
#     score = mean_absolute_error(y_val, pre_val)
#     print('round {}: {:.4f}'.format(i+1, score))
#     print('params: {}'.format(p))
#     print('\n')
#     if score < best_score:
#         best_score = score
#         best_param = p

# print(best_score)
# print(best_param)

In [23]:
%%time

model = XGBRegressor(max_depth=8,
                     n_estimators=50000,
                     min_child_weight=5, 
                     colsample_bytree=0.5, 
                     subsample=0.5, 
                     eta=0.001,
                     seed=2020)
model.fit(X_train, 
          y_train, 
          eval_metric='mae', 
          eval_set=[(X_train, y_train), (X_val, y_val)], 
          verbose=500, 
          early_stopping_rounds=1000)

[0]	validation_0-mae:0.64519	validation_1-mae:0.34407
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 1000 rounds.
[500]	validation_0-mae:0.40224	validation_1-mae:0.24909
[1000]	validation_0-mae:0.25516	validation_1-mae:0.20090
[1500]	validation_0-mae:0.16727	validation_1-mae:0.17865
[2000]	validation_0-mae:0.11589	validation_1-mae:0.16849
[2500]	validation_0-mae:0.08647	validation_1-mae:0.16374
[3000]	validation_0-mae:0.06976	validation_1-mae:0.16140
[3500]	validation_0-mae:0.06009	validation_1-mae:0.15989
[4000]	validation_0-mae:0.05439	validation_1-mae:0.15884
[4500]	validation_0-mae:0.05073	validation_1-mae:0.15809
[5000]	validation_0-mae:0.04827	validation_1-mae:0.15748
[5500]	validation_0-mae:0.04655	validation_1-mae:0.15714
[6000]	validation_0-mae:0.04523	validation_1-mae:0.15689
[6500]	validation_0-mae:0.04421	validation_1-mae:0.15673
[7000]	validation_0-mae:0.04335	validation_1-ma

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eta=0.001, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.00100000005, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=50000, n_jobs=2, num_parallel_tree=1,
             random_state=2020, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=2020, subsample=0.5, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [24]:
pred_test_xgb = model.predict(X_test, ntree_limit=model.best_ntree_limit)
submission = pd.DataFrame({'time': test.timestamp, 
                           'temperature': test.temp_out + pred_test_xgb})
submission.to_csv('submissions/xgb_submission.csv', index=False)

In [51]:
features_to_drop = ['timestamp', 'year', 'second', 'time', 'target', 'temp_in']

X_train = data.loc[data.time < '2019-4-3 01'].drop(features_to_drop, axis=1)
y_train = data.loc[data.time < '2019-4-3 01']['target']
X_test = data.loc[data.time >= '2019-4-3 01'].drop(features_to_drop, axis=1)

In [52]:
%%time

model = XGBRegressor(max_depth=8,
                     n_estimators=3000,
                     min_child_weight=5, 
                     colsample_bytree=0.5, 
                     subsample=0.5, 
                     eta=0.001,
                     seed=2020)
model.fit(X_train, 
          y_train, 
          eval_metric='mae', 
          eval_set=[(X_train, y_train), (X_train, y_train)], 
          verbose=500, 
          early_stopping_rounds=1000)

[0]	validation_0-mae:0.58496	validation_1-mae:0.58496
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 1000 rounds.
[500]	validation_0-mae:0.36758	validation_1-mae:0.36758
[1000]	validation_0-mae:0.23616	validation_1-mae:0.23616
[1500]	validation_0-mae:0.15802	validation_1-mae:0.15802
[2000]	validation_0-mae:0.11232	validation_1-mae:0.11232
[2500]	validation_0-mae:0.08608	validation_1-mae:0.08608
[2999]	validation_0-mae:0.07106	validation_1-mae:0.07106
CPU times: user 1h 5min 25s, sys: 13.2 s, total: 1h 5min 38s
Wall time: 33min 34s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eta=0.001, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.00100000005, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=3000, n_jobs=0, num_parallel_tree=1,
             random_state=2020, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=2020, subsample=0.5, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [53]:
pred_test_xgb = model.predict(X_test, ntree_limit=model.best_ntree_limit)
submission = pd.DataFrame({'time': test.timestamp, 
                           'temperature': test.temp_out + pred_test_xgb})
submission.to_csv('submissions/xgb_submission.csv', index=False)