In [1]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time

In [2]:
# Load the raw train/test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Submission skeleton: one row per test record, keyed by its 'time' column.
sub = test_df[['time']].copy()

In [3]:
# Keep only training rows that have a target value.
has_target = train_df['temperature'].notnull()
train_df = train_df[has_target]
# Gaps in the remaining feature columns are not back-filled here.

In [4]:
# Give both frames the same short English column names; only train carries the target.
shared_columns = ['time','year','month','day','hour','min','sec','outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo']
train_df.columns = shared_columns + ['temperature']
test_df.columns = shared_columns

# Assemble a minute-resolution timestamp per row. pd.to_datetime expects the
# component column to be called 'minute', hence the rename of 'min'.
for frame in (train_df, test_df):
    parts = frame[['year', 'month', 'day', 'hour', 'min']].rename(columns={'min': 'minute'})
    frame['date'] = pd.to_datetime(parts[['year', 'month', 'day', 'hour', 'minute']])

In [5]:
# train
def fill_na(row, col, df=None):
    """Return a replacement for a missing ``row[col]`` using earlier observations.

    Lookup order (the first non-null hit wins):
      1. the row exactly one minute earlier,
      2. the row exactly one day earlier,
      3. the row exactly two days earlier,
      4. the last earlier row (in frame order) whose ``col`` is non-null.

    Parameters
    ----------
    row : pandas.Series
        A row carrying at least ``date`` and ``col``.
    col : str
        Name of the column to fill.
    df : pandas.DataFrame, optional
        Frame searched for earlier observations; must have ``date`` and ``col``
        columns. Defaults to the module-level ``train_df``.

    Returns
    -------
    The original value when it is not missing, otherwise the first replacement
    found. If no earlier non-null observation exists, the (missing) original
    value is returned unchanged.
    """
    if df is None:
        df = train_df

    if pd.notna(row[col]):
        return row[col]

    # Exact-timestamp lookups: previous minute, then same minute 1 and 2 days back.
    for lag in ('1 minute', '1 day', '2 day'):
        wanted = row['date'] - pd.Timedelta(lag)
        hits = df.loc[df['date'] == wanted, col]
        if len(hits) != 0 and pd.notna(hits.values[0]):
            return hits.values[0]

    # Fallback: most recent earlier non-null value.
    # NOTE(review): "most recent" is taken as the last matching row in frame
    # order, which assumes df is sorted by date — confirm.
    earlier = df.loc[(df['date'] < row['date']) & df[col].notna(), col]
    if len(earlier) != 0:
        return earlier.iloc[-1]

    # Nothing earlier to borrow from: leave the value missing.
    return row[col]

# Keep a single row per (month, day, hour, minute) so timestamp lookups are unambiguous.
train_df = train_df.drop_duplicates(subset=['month', 'day', 'hour', 'min'])

# Fill every sensor column (and the target) row by row from earlier observations.
train_fill_cols = ['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo', 'temperature']
for feat in tqdm(train_fill_cols):
    train_df[feat] = train_df.apply(lambda r: fill_na(r, feat), axis=1)

# test
def avg_pre_next(row, col, df=None):
    """Fill a missing ``row[col]`` with the mean of its nearest known neighbours.

    The neighbours are the closest earlier and closest later rows (by ``date``)
    whose ``col`` is non-null. When only one side has a known value, that value
    is used on its own; when neither side has one, the value stays missing.

    Parameters
    ----------
    row : pandas.Series
        A row carrying at least ``date`` and ``col``.
    col : str
        Name of the column to fill.
    df : pandas.DataFrame, optional
        Frame searched for neighbours; must have ``date`` and ``col`` columns.
        Defaults to the module-level ``test_df``.

    Returns
    -------
    The original value when present, otherwise the interpolated replacement.
    """
    if df is None:
        df = test_df

    if pd.notna(row[col]):
        return row[col]

    # Only rows with an actual reading can serve as neighbours; this avoids
    # averaging with NaN when several consecutive rows are missing.
    known = df.loc[df[col].notna(), ['date', col]]
    before = known.loc[known['date'] < row['date'], col]
    after = known.loc[known['date'] > row['date'], col]

    # NOTE(review): nearest neighbours are picked by frame order (last of
    # `before`, first of `after`), which assumes df is sorted by date — confirm.
    if len(before) != 0 and len(after) != 0:
        return (before.iloc[-1] + after.iloc[0]) / 2
    if len(before) != 0:
        return before.iloc[-1]
    if len(after) != 0:
        return after.iloc[0]
    return row[col]

# Fill gaps in every test sensor column from the neighbouring rows.
test_fill_cols = ['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo']
for feat in tqdm(test_fill_cols):
    test_df[feat] = test_df.apply(lambda r: avg_pre_next(r, feat), axis=1)

100%|██████████| 6/6 [00:07<00:00,  1.27s/it]
100%|██████████| 5/5 [00:00<00:00, 35.89it/s]


In [6]:
# Stack train on top of test (fresh 0..n-1 index) so features are engineered once for both.
stacked_frames = [train_df, test_df]
data_df = pd.concat(stacked_frames, axis=0, ignore_index=True)

In [7]:
# Basic aggregation features: per (month, day, hour) statistics of each sensor column.
mdh_keys = ['month', 'day', 'hour']
# (output column template, pandas aggregation name), in creation order.
mdh_stats = [
    ('MDH_{}_medi', 'median'),
    ('MDH_{}_mean', 'mean'),
    ('MDH_{}_max', 'max'),
    ('MDH_{}_min', 'min'),
    ('MDH_{}_std', 'std'),
]

group_feats = []
for f in tqdm(['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo']):
    for name_template, stat in mdh_stats:
        data_df[name_template.format(f)] = data_df.groupby(mdh_keys)[f].transform(stat)

    # Only the median and mean columns are reused later as cross-feature inputs.
    group_feats.extend(['MDH_{}_medi'.format(f), 'MDH_{}_mean'.format(f)])

100%|██████████| 5/5 [00:00<00:00, 15.70it/s]


In [8]:
# Basic cross features: the ratio of every ordered pair of distinct input columns
# (raw sensor columns plus the grouped median/mean columns).
ratio_inputs = ['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo'] + group_feats
for f1 in tqdm(ratio_inputs):
    for f2 in ratio_inputs:
        if f1 == f2:
            continue
        data_df['{}_{}_ratio'.format(f1, f2)] = data_df[f1].values / data_df[f2].values

100%|██████████| 15/15 [00:00<00:00, 45.00it/s]


In [9]:
# Back-fill remaining gaps from the next row that has a value.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
data_df = data_df.bfill()

In [10]:
# Historical information extraction: build a running day index `dt`,
# counting every month from month 3 onward as a fixed 31 days.
month_offset = (data_df['month'].values - 3) * 31
data_df['dt'] = data_df['day'].values + month_offset

In [11]:
# For every day index t in [15, 45) and every hour, attach the mean of each column
# over all rows from strictly earlier days (dt < t) at that same hour.
for f in ['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo', 'temperature']:
    hist_col = 'hit_{}_mean'.format(f)
    per_day_frames = []
    for t in tqdm(range(15, 45)):
        hourly = (
            data_df.loc[data_df['dt'] < t]
            .groupby('hour')[f]
            .mean()
            .rename(hist_col)
            .reset_index()
        )
        hourly['dt'] = t
        per_day_frames.append(hourly)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame on every iteration.
    tmp_df = pd.concat(per_day_frames, ignore_index=True)

    # Left join keeps every row of data_df in its original order.
    data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')

# Rows with no history (no earlier day for that hour) get the next available value.
data_df = data_df.bfill()

100%|██████████| 30/30 [00:01<00:00, 26.07it/s]
100%|██████████| 30/30 [00:01<00:00, 19.59it/s]
100%|██████████| 30/30 [00:01<00:00, 24.32it/s]
100%|██████████| 30/30 [00:01<00:00, 26.55it/s]
100%|██████████| 30/30 [00:01<00:00, 26.93it/s]
100%|██████████| 30/30 [00:01<00:00, 27.07it/s]


In [12]:
# Discretisation: cut every sensor column into equal-width bins at several
# resolutions, then describe each sensor column within each bin.
sensor_cols = ['outdoorTemp','outdoorHum','outdoorAtmo','indoorHum','indoorAtmo']
# (bin-column suffix, number of bins), in creation order.
bin_specs = [('_20_bin', 20), ('_50_bin', 50), ('_100_bin', 100), ('_200_bin', 200)]
# (output column template, pandas aggregation name), in creation order.
bin_stats = [
    ('{}_{}_medi', 'median'),
    ('{}_{}_mean', 'mean'),
    ('{}_{}_max', 'max'),
    ('{}_{}_min', 'min'),
]

# The bin label is the interval's left edge cast to int.
# NOTE(review): the int cast truncates fractional edges, so neighbouring bins
# can end up sharing a label — confirm this is intended.
for f in sensor_cols:
    for bin_suffix, n_bins in bin_specs:
        data_df[f + bin_suffix] = pd.cut(data_df[f], n_bins, duplicates='drop').apply(lambda x: x.left).astype(int)

# One progress bar per resolution; within it, every bin column is paired with
# every sensor column.
for bin_suffix, _ in bin_specs:
    for f1 in tqdm([col + bin_suffix for col in sensor_cols]):
        for f2 in sensor_cols:
            for name_template, stat in bin_stats:
                data_df[name_template.format(f1, f2)] = data_df.groupby([f1])[f2].transform(stat)

100%|██████████| 5/5 [00:11<00:00,  2.22s/it]
100%|██████████| 5/5 [00:13<00:00,  2.75s/it]
100%|██████████| 5/5 [00:15<00:00,  3.08s/it]
100%|██████████| 5/5 [00:19<00:00,  3.88s/it]


In [13]:
# Columns that must not be fed to the model (identifiers, the raw timestamp, the target).
drop_columns = ["time", "year", "sec", "temperature", 'date']

# The first len(train_df) rows of the stacked frame are train; the rest are test.
train_count = len(train_df)
train_df = data_df.iloc[:train_count].copy().reset_index(drop=True)
test_df = data_df.iloc[train_count:].copy().reset_index(drop=True)

In [14]:
# Model inputs are every column except the excluded ones.
features = train_df.columns.drop(drop_columns)
x_train, x_test = train_df[features], test_df[features]

In [15]:
# Learn the gap between the target temperature and the outdoor temperature
# rather than the target itself.
target_values = train_df['temperature'].values
outdoor_values = train_df['outdoorTemp'].values
y_train = target_values - outdoor_values

In [16]:
# Single-column zero placeholders with one row per train / test sample.
train = np.zeros((len(x_train), 1))
test = np.zeros((len(x_test), 1))

# Row position separating the first 80% of the training rows from the remainder.
nums = int(len(x_train) * 0.80)

In [17]:
# Ordered hold-out split: rows before `nums` train the model, rows from `nums` on validate it.
trn_x, val_x = x_train[:nums], x_train[nums:]
trn_y, val_y = y_train[:nums], y_train[nums:]

In [18]:
# Wrap each split in XGBoost's DMatrix container; NaN marks missing entries.
train_matrix = xgb.DMatrix(data=trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(data=val_x, label=val_y, missing=np.nan)
# The test matrix carries no label.
test_matrix = xgb.DMatrix(data=x_test, missing=np.nan)

In [19]:
# XGBoost hyper-parameters for the tree booster, evaluated with mean absolute error.
params = {'booster': 'gbtree',
          'eval_metric': 'mae',
          'min_child_weight': 5,
          'max_depth': 8,
          'subsample': 0.5,
          'colsample_bytree': 0.5,
          'eta': 0.001,
          'seed': 2020,
          'nthread': 36,
          # 'silent' is not recognised by current XGBoost (it triggers a
          # "might not be used" warning); 'verbosity': 0 is the supported
          # way to request silent training.
          'verbosity': 0,
          }

# Both sets are reported every 500 rounds; the 'eval' entry drives early stopping.
watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

# Train for up to 50000 rounds, stopping once eval MAE has not improved for 1000 rounds.
model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=500, early_stopping_rounds=1000)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mae:0.64649	eval-mae:0.34919
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 1000 rounds.
[500]	train-mae:0.40526	eval-mae:0.24752
[1000]	train-mae:0.26004	eval-mae:0.19836
[1500]	train-mae:0.17420	eval-mae:0.17689
[2000]	train-mae:0.12446	eval-mae:0.16761
[2500]	train-mae:0.09627	eval-mae:0.16345
[3000]	train-mae:0.08047	eval-mae:0.16183
[3500]	train-mae:0.07158	eval-mae:0.16109
[4000]	train-mae:0.06636	eval-mae:0.16048
[4500]	train-mae:0.06318	eval-mae:0.15999
[5000]	train-mae:0.06112	eval-mae:0.15958
[5500]	train-mae:0.05969	eval-mae:0.15934
[6000]	train-mae:0.05865	eval-mae:0.15909
[6500]	train-mae:0.05784	eval-mae

In [20]:
# Predict with the tree limit recorded by early stopping and shape both
# results as single-column arrays.
best_limit = model.best_ntree_limit
val_pred, test_pred = (
    model.predict(matrix, ntree_limit=best_limit).reshape(-1, 1)
    for matrix in (valid_matrix, test_matrix)
)

In [21]:
# The model predicts (temperature - outdoorTemp); add the outdoor temperature
# back to recover the temperature itself.
outdoor_column = test_df['outdoorTemp'].values.reshape(-1, 1)
sub["temperature"] = test_pred + outdoor_column