To Try:

* Remove noisy sites and meter readings (site id)
* rolling statistic
* Site specific indicators
* Treat categorical missing / NaNs

* X Add building and site id features (see https://www.kaggle.com/aitude/ashrae-kfold-lightgbm-without-leak-1-08)
    * Set categorical dataset in lgbm fit
* X Research validation strategy and implement
* X 'Primary use' indicator
* X Additional datebased features (month and quarterly indicators, time trends)
* X LightGBM

In [1]:
import time

import lightgbm as lgb
from sklearn import model_selection, preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from utils import *

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Project root on the author's machine. The data files and the submissions
# folder used throughout the notebook are all resolved relative to it.
MAIN = pathlib.Path('/Users/palermopenano/personal/kaggle_energy')
# Folder that receives the generated submission CSV.
SUBMISSIONS_PATH = MAIN.joinpath('submissions')

# Set Parameters

In [3]:
# Run switches read by the cells further down.

# True  -> keep only a random subset of buildings
# False -> use every building
sample = False

# True  -> fit one model on the full training set
# False -> run the KFold cross-validation instead
train_full = False

# True -> load the test set, score it and write a submission csv
create_submission = False

# File name of the submission csv written under SUBMISSIONS_PATH
# (an alternative name kept from earlier runs: 'sampleAAA.csv').
submission_name = 'submission_2019-11-24_add_building_site_features.csv'

# Prepare Training Data

In [4]:
# DNC (does not change)
# Read the three raw training inputs from MAIN/data.
building_metadata = pd.read_csv(MAIN.joinpath('data', 'building_metadata.csv'))
train = pd.read_csv(MAIN.joinpath('data', 'train.csv'))
weather_train = pd.read_csv(MAIN.joinpath('data', 'weather_train.csv'))

# Parse the timestamp strings into datetimes; `timestamp` is later used as a
# merge key and as the source of the date-based features.
# NOTE(review): `infer_datetime_format` is deprecated in pandas 2.0 and only
# triggers a warning there; kept so behaviour on older pandas is unchanged.
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format=True)

## Compute rolling stat

In [5]:
# Rolling statistic for weather data
# Replace the raw weather readings listed in `cols_rol` with a rolling mean
# computed per site. `cols_rol` and `period` are reused later when the same
# transformation is applied to the test weather data.

cols_rol = [
    'air_temperature',
    'dew_temperature',
    'sea_level_pressure',
    'wind_speed',
]
# Window size handed to rolling_stat (presumably a number of rows, i.e.
# hourly readings -- TODO confirm against utils.rolling_stat).
period = 24

# rolling_stat comes from utils; it is assumed to return one row per
# (site_id, timestamp) carrying the smoothed `cols_rol` columns -- confirm.
tmp = rolling_stat(
    weather_train, 'timestamp', ['site_id'],
    cols_rol, period, np.mean
)

# Drop the raw columns by keyword: passing the axis positionally
# (`drop(cols_rol, 1)`) is no longer accepted by pandas 2.0+.
weather_train = weather_train.drop(columns=cols_rol)
weather_train = weather_train.merge(tmp, how='inner', on=['site_id', 'timestamp'])

In [6]:
# Take only a random sample of n buildings
# Runs only when `sample` is True. The sampled ids are kept in `randbuilding`
# so the test set can later be restricted to the same buildings.
if sample:
    train, randbuilding = df_sample_random_buildings(train, 'building_id', n=10)
# Shape is printed in both modes as a quick check of how many rows remain.
print(train.shape)

(20216100, 4)


## Merge in to train

In [7]:
# DNC
# Attach the building attributes first, then the per-site weather for the
# same timestamp. Both are left joins, so every meter row is kept even when
# no metadata or weather row matches.
train = (
    train
    .merge(building_metadata, on='building_id', how='left')
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
)

# Report the time span covered by the merged training data.
print("\n".join([
    f"Min time {train['timestamp'].min()}",
    f"Max time {train['timestamp'].max()}",
]))

Min time 2016-01-01 00:00:00
Max time 2016-12-31 23:00:00


In [8]:
# Reduce memory usage
# `timestamp` is excluded from the conversion via cols_exclude.
# NOTE(review): reduce_mem_usage comes from utils; judging by its printed
# report it shrinks the frame's memory footprint, presumably by downcasting
# column dtypes -- confirm against the helper.
train = reduce_mem_usage(train, cols_exclude=['timestamp'])

Memory usage of dataframe is 2622.02 MB
['building_id', 'meter', 'meter_reading', 'site_id', 'primary_use', 'square_feet', 'year_built', 'floor_count', 'cloud_coverage', 'precip_depth_1_hr', 'wind_direction', 'air_temperature', 'dew_temperature', 'sea_level_pressure', 'wind_speed']
Memory usage after optimization is: 1253.17 MB
Decreased by 52.2%


# Feature Engineering

In [9]:
# Feature engineering: take log of square_feet
# np.log1p computes log(1 + x), so a value of 0 maps to 0 instead of -inf.
# The column is overwritten in place, so re-running this cell would apply
# the transform a second time.
train['square_feet'] = np.log1p(train['square_feet'])

In [10]:
# Feature engineering: date-based features derived from `timestamp`.
# add_datepart comes from utils; its return value is not used, so it is
# presumably adding the timestamp* columns (e.g. timestampHour) to `train`
# in place -- confirm against the helper. drop=False keeps `timestamp`.
add_datepart(train, 'timestamp', datetimeformat=None, drop=False, time=True, errors="raise")

# Weekend flag: the day of week runs from 0 (Monday) to 6 (Sunday), so the
# floored division by 5 gives 0 for Monday-Friday and 1 for Saturday/Sunday.
train["weekend"] = train["timestamp"].dt.dayofweek.floordiv(5)

In [11]:
# Feature engineering: precip_depth_1_hr
# * Create NaN indicator
# * Fill NaN with a supplied value, or with the column median if none is given
# * Create trace rain indicator (raw value -1) and convert -1 to 0

def precip_depth_1_hr_FE(df, m=None):
    """Clean `precip_depth_1_hr` and add its two indicator columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `precip_depth_1_hr` column.
    m : float or None
        Value used to fill missing readings. When None, the median of
        df['precip_depth_1_hr'] is computed (before the -1 trace values are
        replaced) and used instead.

    Returns
    -------
    (df, m) : tuple
        The modified frame and the fill value that was applied, so the same
        value can be passed back in when transforming another dataset.
    """
    missing = df['precip_depth_1_hr'].isna()
    df['precip_depth_1_hr_nan'] = missing

    # Test against None explicitly: a plain truthiness check would treat a
    # legitimate fill value of 0 as "not supplied" and silently recompute the
    # median from the frame being transformed.
    if m is None:
        m = df['precip_depth_1_hr'].median()
    df.loc[missing, 'precip_depth_1_hr'] = m

    # -1 marks trace precipitation: remember it in a flag, then zero it out.
    is_trace = df['precip_depth_1_hr'] == -1
    df['precip_depth_1_hr_isTrace'] = is_trace
    df.loc[is_trace, 'precip_depth_1_hr'] = 0
    return df, m

# With m=None the fill value is derived from the training data itself;
# `precip_m` keeps it so it can be passed in when transforming the test set.
train, precip_m = precip_depth_1_hr_FE(train, m=None)
# Display the cleaned column next to its two indicator columns as a check.
train[['precip_depth_1_hr_nan', 'precip_depth_1_hr_isTrace', 'precip_depth_1_hr']]

Unnamed: 0,precip_depth_1_hr_nan,precip_depth_1_hr_isTrace,precip_depth_1_hr
0,True,False,0.0
1,True,False,0.0
2,True,False,0.0
3,True,False,0.0
4,True,False,0.0
...,...,...,...
20216095,False,True,0.0
20216096,False,True,0.0
20216097,False,True,0.0
20216098,False,True,0.0


In [12]:
# Feature engineering: wind_direction
# * Create NaN indicator
# * Replace NaN with a supplied value, or with the median wind_direction angle
# * Convert the angle (degrees) to sine and cosine features

def wind_direction_FE(df, m=None):
    """Fill missing `wind_direction` values and add derived columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `wind_direction` column, treated as degrees.
    m : float or None
        Value used to fill missing directions. When None, the median of
        df['wind_direction'] is computed and used instead.

    Returns
    -------
    (df, m) : tuple
        The modified frame and the fill value that was applied, so the same
        value can be passed back in when transforming another dataset.
    """
    missing = df['wind_direction'].isna()
    df['wind_direction_nan'] = missing

    # Test against None explicitly: 0 degrees is a valid fill value, and a
    # plain truthiness check would discard it and recompute the median.
    if m is None:
        m = df['wind_direction'].median()
    df.loc[missing, 'wind_direction'] = m

    # Sine/cosine encoding keeps 0 and 360 degrees next to each other.
    angle_rad = np.radians(df['wind_direction'])
    df['wind_direction_sin'] = np.sin(angle_rad)
    df['wind_direction_cos'] = np.cos(angle_rad)
    return df, m

# With m=None the fill value is derived from the training data itself;
# `wind_direction_m` keeps it so it can be passed in for the test set.
train, wind_direction_m = wind_direction_FE(train, m=None)
# Display the derived columns next to the filled angle as a check.
train[['wind_direction_nan','wind_direction_sin','wind_direction_cos','wind_direction']]

Unnamed: 0,wind_direction_nan,wind_direction_sin,wind_direction_cos,wind_direction
0,False,0.000000e+00,1.0,0.0
1,False,0.000000e+00,1.0,0.0
2,False,0.000000e+00,1.0,0.0
3,False,0.000000e+00,1.0,0.0
4,False,0.000000e+00,1.0,0.0
...,...,...,...,...
20216095,False,-8.742278e-08,-1.0,180.0
20216096,False,-8.742278e-08,-1.0,180.0
20216097,False,-8.742278e-08,-1.0,180.0
20216098,False,-8.742278e-08,-1.0,180.0


In [13]:
# Feature engineering: primary_use
# Apply label encoder
# The fitted encoder `le` stays in scope so the test set's primary_use can
# later be mapped with the same category-to-integer assignment.

le = LabelEncoder()
train['primary_use'] = le.fit_transform(train['primary_use'])

## Take Log Meter Reading

In [14]:
# DNC
# Replace negative meter readings with 0. np.where keeps a value only where
# `>= 0` holds, so a NaN reading would also become 0. The target `y` is then
# log1p(meter_reading); predictions are mapped back with expm1 later on.
train['meter_reading'] = np.where(
    train['meter_reading']>=0, train['meter_reading'], 0)
y = np.log1p(train['meter_reading'])

# Quick check of the target: number of dimensions, shape and value range.
print(y.ndim, y.shape, y.min(), y.max())

1 (20216100,) 0.0 16.902212142944336


## Define Features to Include in Training

In [15]:
# Columns fed to the model as ordinary (non-categorical) inputs. The two
# precip_depth_1_hr_* entries are the indicator columns created by
# precip_depth_1_hr_FE.
cont_feats = [
    'square_feet', 'floor_count',
    'air_temperature', 'dew_temperature', 'sea_level_pressure', 'wind_speed',
    'precip_depth_1_hr', 'precip_depth_1_hr_nan', 'precip_depth_1_hr_isTrace',
]

# Columns declared to LightGBM as categorical features.
# ('timestampWeek' is currently left out.)
cat_feats = [
    'timestampDayofweek', 'primary_use', 'year_built', 'timestampMonth',
    'timestampHour', 'weekend', 'site_id', 'building_id',
]

In [16]:
# The unshuffled KFold used for validation splits rows by position, so the
# rows must already be in chronological order for "validate on later data"
# to hold. Stop here if they are not.
if not train['timestamp'].is_monotonic_increasing:
    raise Exception("timestamp should be sorted in increasing order for KFold validation to work properly")

In [17]:
# DNC
# Keep only the model inputs, ordered as cont_feats followed by cat_feats.
# Every other column (including `timestamp`) is dropped from `train` here.
train = train.loc[:, cont_feats + cat_feats]
print("\n".join([
    f"Training on {train.shape[0]} records",
    f"Number of features: {train.shape[1]}",
]))

Training on 20216100 records
Number of features: 17


# Impute Missing

In [18]:
# Median-impute every remaining NaN. `imp` is fitted here on the training
# features and reused later to transform the test features.
# NOTE: the imputer is fitted on all training rows, i.e. before the CV split,
# so rows that end up in the validation fold influence the medians.
imp = SimpleImputer(missing_values=np.nan, strategy='median')  # CHANGED

# fit_transform returns a bare array, so wrap it back into a DataFrame with
# the original column names. This is done in one expression on purpose: a
# second name bound to the same frame (previously `imputed_df`) would keep it
# alive and make the later `del train` / gc.collect() free nothing.
train = pd.DataFrame(imp.fit_transform(train), columns=train.columns)

# KFold CV (Unshuffled)
Variation of cv approach in 

https://www.kaggle.com/kimtaegwan/what-s-your-cv-method?scriptVersionId=22371767

Only the second fold is evaluated, since it is the only one whose validation set comes from a time period after its training set. Note a disadvantage of the current implementation of this approach: the median imputation of each feature is fitted on the full training set before the split, so information from the validation rows leaks into the training data.

In [19]:
if not train_full:

    folds = 2

    # No random_state: it only has an effect when shuffle=True, and recent
    # scikit-learn releases raise a ValueError when it is set together with
    # shuffle=False. The unshuffled split is deterministic anyway.
    kf = model_selection.KFold(n_splits=folds, shuffle=False)

    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train, y)):

        # Skip first fold to avoid the worst data leakage: in that fold all
        # training rows are later in time than the validation rows.
        if fold_ == 0:
            continue

        print(fold_, trn_idx.shape, val_idx.shape)

        # Note potential leakage here if missing-value imputation is done
        # before this cell (it is fitted on the whole training set).
        # KFold yields positional indices, so select by position for both
        # the features and the target instead of relying on y's labels.
        tr_x, tr_y = train.iloc[trn_idx], y.iloc[trn_idx]
        vl_x, vl_y = train.iloc[val_idx], y.iloc[val_idx]

        # NOTE(review): per the LightGBM parameter docs `feature_fraction`
        # and `colsample_bytree` are aliases of the same setting, and
        # `subsample` (bagging_fraction) only takes effect with a non-zero
        # bagging frequency. Values are left as they were; confirm which of
        # them is actually intended.
        reg = lgb.LGBMRegressor(
            learning_rate=0.05,
            boosting="gbdt",
            n_estimators=3000,
            feature_fraction=.7,
            min_child_weight=3,
            subsample=0.6,
            colsample_bytree=.9,
            objective='regression',
            metric='rmse',
            n_jobs=8,
            seed=27,
            num_leaves=40
        )

        # Stop once the validation rmse has not improved for 50 rounds and
        # log every 100 rounds.
        # NOTE(review): LightGBM 4.x replaces the `early_stopping_rounds` and
        # `verbose` fit keywords with callbacks; kept as-is to match the
        # LightGBM version this notebook was run with.
        reg.fit(
            tr_x, tr_y,
            eval_set=[(vl_x, vl_y)],
            early_stopping_rounds=50,
            verbose=100,
            categorical_feature=cat_feats
        )

        gc.collect()

1 (10108050,) (10108050,)


New categorical_feature is ['building_id', 'primary_use', 'site_id', 'timestampDayofweek', 'timestampHour', 'timestampMonth', 'weekend', 'year_built']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[31]	valid_0's rmse: 1.72941


# Train on Full Dataset

In [20]:
# Train on full sample for submission
if train_full:

    print("Training on entire training dataset")
    # Number of estimators based on KFold CV results
    n_estimators_cv = 500

    # Same hyper-parameters as the cross-validation model, except that the
    # number of trees is fixed (there is no validation set to stop early on).
    full_fit_params = dict(
        learning_rate=0.05,
        boosting="gbdt",
        n_estimators=n_estimators_cv,
        feature_fraction=.7,
        min_child_weight=3,
        subsample=0.6,
        colsample_bytree=.9,
        objective='regression',
        metric='rmse',
        n_jobs=8,
        seed=27,
        num_leaves=40,
    )
    reg = lgb.LGBMRegressor(**full_fit_params)
    reg.fit(train, y, categorical_feature=cat_feats)

In [21]:
# Remove reference and force garbage collection
# The training features are not needed once the model is fitted; the cells
# that follow work from the test data only.
del train
gc.collect()

0

# Define Dataset to Evaluate

In [22]:
if create_submission:
    # Evaluate on test set: read the raw test rows and the matching weather.
    test = pd.read_csv(MAIN.joinpath('data', 'test.csv'))
    weather_test = pd.read_csv(MAIN.joinpath('data', 'weather_test.csv'))

    # Parse the timestamp strings into datetimes (used as merge key and as
    # the source of the date-based features).
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp'])

# Apply Evaluation Set Transformations

In [23]:
if create_submission:
    # Same rolling weather statistic as for the training data (same columns,
    # window and aggregate).
    tmp = rolling_stat(
        weather_test, 'timestamp', ['site_id'],
        cols_rol, period, np.mean
    )
    # Keyword form: pandas 2.0+ no longer accepts the axis positionally.
    weather_test = weather_test.drop(columns=cols_rol)
    weather_test = weather_test.merge(tmp, how='inner', on=['site_id', 'timestamp'])

    # DNC
    # Merge building metadata and weather into test
    test = test.merge(building_metadata, on='building_id', how='left')
    test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

    if sample:
        # Restrict to the buildings sampled for training. The explicit copy
        # makes the column assignments below act on an independent frame
        # rather than on a filtered slice.
        test = test[test['building_id'].isin(randbuilding)].copy()

    print("Apply date operation...")
    add_datepart(
        test, 'timestamp', datetimeformat=None,
        drop=False, time=True, errors="raise"
    )
    test["weekend"] = test["timestamp"].dt.weekday // 5

    # Apply feature engineering to test set, reusing the fill values and the
    # label encoder obtained from the training data.
    print("Apply feature engineering and imputed values...")
    test, _ = precip_depth_1_hr_FE(test, m=precip_m)
    test, _ = wind_direction_FE(test, m=wind_direction_m)
    test['primary_use'] = le.transform(test['primary_use'])  # CHANGED
    # Log-transform before the column selection below, so the assignment is
    # made on the full frame and not on a column-selected slice.
    test['square_feet'] = np.log1p(test['square_feet'])

    # Remove binding from namespace
    # and force release of memory
    del building_metadata, weather_train
    gc.collect()

    test = test[cont_feats + cat_feats + ['row_id']]

    # Impute with the medians learned on train. The DataFrame is passed
    # directly: going through `.values` on a frame that mixes bool and
    # numeric columns would first build a very large object-dtype array.
    # `row_id` is not a feature and is left out.
    test_v = imp.transform(test.drop(columns='row_id'))
    print(test_v.shape)

# Generate Submission Scores

In [24]:
if create_submission:
    print("Generating submission")

    # The model predicts log1p(meter_reading): invert with expm1, then clip
    # to a minimum of 0 with no upper bound (a_max is None).
    # Save predictions as a column in the test frame.
    test['meter_reading'] = np.clip(np.expm1(reg.predict(test_v)), 0, None)

    # Build the submission as a new frame with explicit dtypes. Assigning
    # through `.loc[:, col] = ...` on a column-selected slice raises
    # SettingWithCopyWarning and, in pandas 2.0+, writes into the existing
    # column, so the float32 / int32 casts would not stick.
    sample_submission = pd.DataFrame({
        'row_id': test['row_id'].astype('int32'),
        'meter_reading': test['meter_reading'].astype('float32').round(2),
    })

    print(f"Submission memory usage: {sample_submission.memory_usage().sum() // 1024**2} MB")

    # DNC
    # Create the output folder if needed so the write cannot fail on a
    # missing directory.
    SUBMISSIONS_PATH.mkdir(parents=True, exist_ok=True)
    sample_submission.to_csv(SUBMISSIONS_PATH / submission_name, index=False)

    print(sample_submission.shape)
    print(sample_submission['meter_reading'].min(), sample_submission['meter_reading'].max())