In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import date, timedelta
from tqdm import tqdm
%matplotlib inline
sns.set_style('dark')

In [2]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
train.columns = ['timestamp', 'year', 'month', 'day', 'hour', 'minute', 'second', 'temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in', 'temp_in']
test.columns = ['timestamp', 'year', 'month', 'day', 'hour', 'minute', 'second', 'temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
train['time'] = pd.to_datetime(train[['year', 'month', 'day', 'hour', 'minute']])
test['time'] = pd.to_datetime(test[['year', 'month', 'day', 'hour', 'minute']])

前10天：3.14 01:00 ~ 3.24 00:59  
中间10天：3.24 1:00 ~ 4.3 00:59  
后10天：4.3 00:59 ~ 4.13 00:59

# Data Cleaning

In [3]:
train = train.loc[train.temp_in.notnull()]

## Interpolation

In [None]:
date = {'year': [], 'month': [], 'day': [], 'hour': [], 'minute': []}
for day in range(14, 32):
    for hour in range(0, 24):
        for minute in range(0, 60):
            date['year'].append(2019)
            date['month'].append(3)
            date['day'].append(day)
            date['hour'].append(hour)
            date['minute'].append(minute)
            
for day in range(1, 14):
    for hour in range(0, 24):
        for minute in range(0, 60):
            date['year'].append(2019)
            date['month'].append(4)
            date['day'].append(day)
            date['hour'].append(hour)
            date['minute'].append(minute)

full_date = pd.DataFrame(date)
full_date['time'] = pd.to_datetime(full_date[['year', 'month', 'day', 'hour', 'minute']])
full_train = full_date.loc[(full_date.time >= '2019-3-14 01') & (full_date.time < '2019-4-3 01'), :]

train = pd.merge(full_train, train, on=['year', 'month', 'day', 'hour', 'minute', 'time'], how='left')
train.drop_duplicates(['month', 'day', 'hour', 'minute'], inplace=True)
train.reset_index(inplace=True, drop=True)

## Fill NaN

In [4]:
# train
def fill_na(row, col):
    if pd.isna(row[col]):
        # 先看看前一分钟有没有
        time = row.time - pd.Timedelta('1 minute')
        pre = train.loc[train.time == time]
        if len(pre) != 0 and pd.notna(pre[col].values[0]):
            return pre[col].values[0]
        else:
            # 再看昨天有没有
            time = row.time - pd.Timedelta('1 day')
            pre = train.loc[train.time == time]
            if len(pre) != 0 and pd.notna(pre[col].values[0]):
                return pre[col].values[0]
            else:
                # 再看看前天有没有
                time = row.time - pd.Timedelta('2 day')
                pre = train.loc[train.time == time]
                if len(pre) != 0 and pd.notna(pre[col].values[0]):
                    return pre[col].values[0]
                else:
                    # 否则直接找上一个非空值
                    pre = train.loc[(train.time < row.time) & (pd.notna(train[col]))].iloc[-1][col]
                    return pre[col].values[0]
            
    return row[col]

train.drop_duplicates(['month', 'day', 'hour', 'minute'], inplace=True)

for feat in tqdm(['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in', 'temp_in']):
    train[feat] = train.apply(fill_na, axis=1, args=(feat,))

train['target'] = train['temp_in'] - train['temp_out']

100%|██████████| 6/6 [00:08<00:00,  1.40s/it]


In [5]:
# test
def avg_pre_next(row, col):
    if pd.isna(row[col]):
        pre_val = test.loc[test.time < row.time].iloc[-1][col]
        next_val = test.loc[test.time > row.time].iloc[0][col]
        return (pre_val + next_val) / 2
    return row[col]

for feat in tqdm(['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']):
    test[feat] = test.apply(avg_pre_next, axis=1, args=(feat,))

100%|██████████| 5/5 [00:00<00:00, 21.69it/s]


## Correct Outliers

### air

In [None]:
fig = plt.figure(figsize=(18,10))
plt.subplot(2,2,1)
sns.boxplot(x=train.air_in)
plt.title('train air_in')
plt.subplot(2,2,2)
sns.boxplot(x=train.air_out)
plt.title('train air_out')
plt.subplot(2,2,3)
sns.boxplot(x=test.air_in)
plt.title('test air_in')
plt.subplot(2,2,4)
sns.boxplot(x=test.air_out)
plt.title('test air_out')

In [None]:
# 用上一分钟和下一分钟的平均值代替异常值
def correct_outlier(row, col, low, high, df):
    if row[col] < low or row[col] > high:
        time = row.time
        pre_val = df.loc[(df.time < time) & (df[col] >= low) & (df[col] <= high)]
        pre_val = pre_val.iloc[-1][col]
        next_val = df.loc[(df.time > time) & (df[col] >= low) & (df[col] <= high)]
        next_val = next_val.iloc[0][col]
        return (pre_val + next_val) / 2
    return row[col]

train.air_in = train.apply(correct_outlier, axis=1, args=('air_in', 965, 1000, train))
test.air_in = test.apply(correct_outlier, axis=1, args=('air_in', 500, 1000, test))
train.air_out = train.apply(correct_outlier, axis=1, args=('air_out', 965, 1000, train))
test.air_out = test.apply(correct_outlier, axis=1, args=('air_out', 960, 1000, test))

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].air_in)
plt.title('3.14–3.24 air_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].air_out)
plt.title('3.14–3.24 air_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].air_in)
plt.title('3.24-4.3 air_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].air_out)
plt.title('3.24–4.3 air_out')
plt.subplot(3, 2, 5)
sns.lineplot(data=test.air_in)
plt.title('4.3-4.13 air_in')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.air_out)
plt.title('4.3–4.13 air_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].air_in, shade=True, label='3.14–3.24 air_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].air_in, shade=True, label='3.24–4.3 air_in')
sns.kdeplot(data=test.air_in, shade=True, label='4.3-4.13 air_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].air_out, shade=True, label='3.14–3.24 air_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].air_out, shade=True, label='3.24–4.3 air_out')
sns.kdeplot(data=test.air_out, shade=True, label='4.3–4.13 air_out')

### hum

In [None]:
fig = plt.figure(figsize=(18,10))
plt.subplot(2,2,1)
sns.boxplot(x=train.hum_in)
plt.title('train hum_in')
plt.subplot(2,2,2)
sns.boxplot(x=train.hum_out)
plt.title('train hum_out')
plt.subplot(2,2,3)
sns.boxplot(x=test.hum_in)
plt.title('test hum_in')
plt.subplot(2,2,4)
sns.boxplot(x=test.hum_out)
plt.title('test hum_out')

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].hum_in)
plt.title('3.14–3.24 hum_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].hum_out)
plt.title('3.14–3.24 hum_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].hum_in)
plt.title('3.24-4.3 hum_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].hum_out)
plt.title('3.24–4.3 hum_out')
plt.subplot(3, 2, 5)
sns.lineplot(data=test.hum_in)
plt.title('4.3-4.13 hum_in')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.hum_out)
plt.title('4.3–4.13 hum_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].hum_in, shade=True, label='3.14–3.24 hum_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].hum_in, shade=True, label='3.24–4.3 hum_in')
sns.kdeplot(data=test.hum_in, shade=True, label='4.3-4.13 hum_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].hum_out, shade=True, label='3.14–3.24 hum_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].hum_out, shade=True, label='3.24–4.3 hum_out')
sns.kdeplot(data=test.hum_out, shade=True, label='4.3–4.13 hum_out')

### temp

In [None]:
fig = plt.figure(figsize=(18, 12))
plt.subplot(3, 2, 1)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].temp_in)
plt.title('3.14–3.24 temp_in')
plt.subplot(3, 2, 2)
sns.lineplot(data=train.loc[train.time < '2019-3-24 01'].temp_out)
plt.title('3.14–3.24 temp_out')
plt.subplot(3, 2, 3)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].temp_in)
plt.title('3.24-4.3 temp_in')
plt.subplot(3, 2, 4)
sns.lineplot(data=train.loc[train.time >= '2019-3-24 01'].temp_out)
plt.title('3.24–4.3 temp_out')
plt.subplot(3, 2, 6)
sns.lineplot(data=test.temp_out)
plt.title('4.3–4.13 temp_out')

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].temp_in, shade=True, label='3.14–3.24 temp_in')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].temp_in, shade=True, label='3.24–4.3 temp_in')
plt.subplot(1,2,2)
sns.kdeplot(data=train.loc[train.time < '2019-3-24 01'].temp_out, shade=True, label='3.14–3.24 temp_out')
sns.kdeplot(data=train.loc[train.time >= '2019-3-24 01'].temp_out, shade=True, label='3.24–4.3 temp_out')
sns.kdeplot(data=test.temp_out, shade=True, label='4.3–4.13 temp_out')

# Feature Engineering

In [None]:
def convert_minute(minute):
    if min(abs(minute - 60), abs(minute - 0)) < abs(minute - 30):
        return 0
    else:
        return 30

test['minute_origin'] = test['minute']
test['minute'] = test['minute_origin'].map(convert_minute)
test['time2'] = pd.to_datetime(test[['year', 'month', 'day', 'hour', 'minute']])

In [6]:
matrix = pd.concat([train, test], axis=0, ignore_index=True)

## Lag features

In [7]:
# 基本聚合特征
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
group_feats = []
for f in tqdm(features):
    matrix['MDH_{}_medi'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('median')
    matrix['MDH_{}_mean'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('mean')
    matrix['MDH_{}_max'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('max')
    matrix['MDH_{}_min'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('min')
    matrix['MDH_{}_std'.format(f)] = matrix.groupby(['month','day','hour'])[f].transform('std')

    group_feats.append('MDH_{}_medi'.format(f))
    group_feats.append('MDH_{}_mean'.format(f))

100%|██████████| 5/5 [00:00<00:00, 12.02it/s]


In [8]:
# 基本交叉特征
for f1 in tqdm(features + group_feats):
    for f2 in features + group_feats:
        if f1 != f2:
            colname = '{}_{}_ratio'.format(f1, f2)
            matrix[colname] = matrix[f1].values / matrix[f2].values

matrix = matrix.fillna(method='bfill')

100%|██████████| 15/15 [00:00<00:00, 43.44it/s]


In [9]:
# 历史信息提取
matrix['dt'] = matrix['day'].values + (matrix['month'].values - 3) * 31

features = features + ['temp_in']
for f in features:
    tmp_df = pd.DataFrame()
    for t in tqdm(range(15, 45)):
        tmp = matrix[matrix['dt'] < t].groupby(['hour'])[f].agg({'mean'}).reset_index()
        tmp.columns = ['hour', 'hit_{}_mean'.format(f)]
        tmp['dt'] = t
        tmp_df = tmp_df.append(tmp)
    
    matrix = matrix.merge(tmp_df, on=['dt', 'hour'], how='left')
    
matrix = matrix.fillna(method='bfill')

100%|██████████| 30/30 [00:01<00:00, 27.79it/s]
100%|██████████| 30/30 [00:01<00:00, 22.14it/s]
100%|██████████| 30/30 [00:01<00:00, 26.71it/s]
100%|██████████| 30/30 [00:01<00:00, 28.41it/s]
100%|██████████| 30/30 [00:01<00:00, 25.89it/s]
100%|██████████| 30/30 [00:01<00:00, 19.48it/s]


## Shift features

In [None]:
features = ['temp_out', 'air_out', 'air_in', 'hum_out', 'hum_in']
lags = [1, 6, 12, 24]

In [None]:
for feat in features:
    for n in lags:
        train['lag_'+str(n)+'_hours_'+feat] = train[feat].shift(n*60)
        test['lag_'+str(n)+'_hours_'+feat] = test[feat].shift(n*2)

In [None]:
# 填充test开头NaN

def shift_features(row, col, feat, lag):
    if pd.isna(row[col]):
        cur_time = row.time2
        pre_time = cur_time - pd.Timedelta(str(lag)+' hours')
        train_val = train.loc[train.time == pre_time]
        test_val = test.loc[test.time2 == pre_time]
        try:
            return train_val[feat].values[0]
        except:
            return test_val[feat].values[0] if len(test_val) !=0 else np.NaN
    return row[col]

        
for feat in features:
    for n in lags:
        col = 'lag_'+str(n)+'_hours_'+feat
        test[col] = test.apply(shift_features, axis=1, args=(col, feat, n))
        
test['minute'] = test['minute_origin']
test.drop(['minute_origin', 'time2'], axis=1, inplace=True)

In [None]:
# Trend
for feat in features:
    for n in lags:
        train['trend_'+str(n)+'_hours_'+feat] = train[feat] - train['lag_'+str(n)+'_hours_'+feat]
        test['trend_'+str(n)+'_hours_'+feat] = test[feat] - test['lag_'+str(n)+'_hours_'+feat]

## Rolling features

In [11]:
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
lags = [1,3,6,12,24]
matrix.set_index('time', inplace=True)

for feat in tqdm(features):
    for l in lags:
        lag_hour = str(l) + 'h'
        matrix['mean_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).mean()
        matrix['median_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).median()
        matrix['max_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).max()
        matrix['min_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).min()
        matrix['std_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).std()
        matrix['skew_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).skew()
        matrix['q1_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).quantile(quantile=0.25)
        matrix['q3_'+str(l)+'_hours_'+feat] = matrix[feat].rolling(lag_hour).median(quantile=0.75)
        matrix['var_'+str(l)+'_hours_'+feat] = matrix['std_'+str(l)+'_hours_'+feat] / matrix['mean_'+str(l)+'_hours_'+feat]
        
matrix.reset_index(inplace=True)

100%|██████████| 5/5 [00:02<00:00,  1.95it/s]


## Bin features

In [10]:
features = ['temp_out', 'hum_out', 'air_out', 'hum_in', 'air_in']
for f in features:
    matrix[f+'_20_bin'] = pd.cut(matrix[f], 20, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_50_bin'] = pd.cut(matrix[f], 50, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_100_bin'] = pd.cut(matrix[f], 100, duplicates='drop').apply(lambda x:x.left).astype(int)
    matrix[f+'_200_bin'] = pd.cut(matrix[f], 200, duplicates='drop').apply(lambda x:x.left).astype(int)
    
features_20_bin = [f + '_20_bin' for f in features]
for f1 in tqdm(features_20_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_50_bin = [f + '_50_bin' for f in features]
for f1 in tqdm(features_50_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_100_bin = [f + '_100_bin' for f in features]
for f1 in tqdm(features_100_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

features_200_bin = [f + '_200_bin' for f in features]
for f1 in tqdm(features_200_bin):
    for f2 in features:
        matrix['{}_{}_medi'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('median')
        matrix['{}_{}_mean'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('mean')
        matrix['{}_{}_max'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('max')
        matrix['{}_{}_min'.format(f1,f2)] = matrix.groupby([f1])[f2].transform('min')

100%|██████████| 5/5 [00:10<00:00,  2.14s/it]
100%|██████████| 5/5 [00:13<00:00,  2.62s/it]
100%|██████████| 5/5 [00:14<00:00,  2.94s/it]
100%|██████████| 5/5 [00:17<00:00,  3.47s/it]


## Save data

In [12]:
matrix.to_pickle('data.pkl')

# Modeling

In [13]:
data = pd.read_pickle('data.pkl')

In [14]:
features_to_drop = ['timestamp', 'year', 'second', 'time', 'target', 'temp_in']

num = int(len(train)*0.8)
X_train = data.iloc[:num].drop(features_to_drop, axis=1)
y_train = data.iloc[:num]['target']
X_val = data.iloc[num:len(train)].drop(features_to_drop, axis=1)
y_val = data.iloc[num:len(train)]['target']
X_test = data.loc[data.time >= '2019-4-3 01'].drop(features_to_drop, axis=1)

## XGBoost

In [15]:
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
# grid search

params = {
    'eta': [0.05, 0.1, 0.2],
    'max_depth': [7,8,9],
    'colsample_bytree': [0.6,0.7],
    'subsample': [0.6,0.7],
    'min_child_weight': [4,5,6]
}

best_score, best_param = 100, None

for i, p in enumerate(ParameterGrid(params)):
    model = XGBRegressor(max_depth=9,
                         n_estimators=100,
                         min_child_weight=0.5, 
                         colsample_bytree=0.6, 
                         subsample=0.6, 
                         eta=0.1,
                         seed=10)
    model.set_params(**p)
    model.fit(X_train, 
              y_train, 
              eval_metric='mae', 
              eval_set=[(X_train, y_train), (X_val, y_val)], 
              verbose=False, 
              early_stopping_rounds=20)
    pre_val = model.predict(X_val)
    score = mean_absolute_error(y_val, pre_val)
    print('round {}: {:.4f}'.format(i+1, score))
    print('params: {}'.format(p))
    print('\n')
    if score < best_score:
        best_score = score
        best_param = p

print(best_score)
print(best_param)

In [16]:
%%time

model = XGBRegressor(max_depth=8,
                     n_estimators=50000,
                     min_child_weight=5, 
                     colsample_bytree=0.5, 
                     subsample=0.5, 
                     eta=0.001,
                     seed=2020)
model.fit(X_train, 
          y_train, 
          eval_metric='mae', 
          eval_set=[(X_train, y_train), (X_val, y_val)], 
          verbose=500, 
          early_stopping_rounds=1000)

[0]	validation_0-mae:0.64649	validation_1-mae:0.34923
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 1000 rounds.
[500]	validation_0-mae:0.40463	validation_1-mae:0.24958


KeyboardInterrupt: 

In [None]:
test_pred = model.predict(X_test, ntree_limit=model.best_ntree_limit)
submission = pd.DataFrame({'time': test.timestamp, 
                           'temperature': test.temp_out+test_pred})
submission.to_csv('xgb_submission.csv', index=False)